/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include <zlib.h>

#include "block/block-io.h"
#include "qapi/error.h"
#include "qcow2.h"
#include "qemu/bswap.h"
#include "qemu/memalign.h"
#include "trace.h"

int coroutine_fn qcow2_shrink_l1_table(BlockDriverState *bs,
                                       uint64_t exact_size)
{
    BDRVQcow2State *s = bs->opaque;
    int new_l1_size, i, ret;

    if (exact_size >= s->l1_size) {
        return 0;
    }

    new_l1_size = exact_size;

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "shrink l1_table from %d to %d\n", s->l1_size, new_l1_size);
#endif

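    /*
     * Zero the truncated tail of the on-disk L1 table first and flush it,
     * so that no L1 entry can still point to an L2 cluster by the time
     * that cluster is freed below.
     */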
    BLKDBG_CO_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
    ret = bdrv_co_pwrite_zeroes(bs->file,
                                s->l1_table_offset + new_l1_size * L1E_SIZE,
                                (s->l1_size - new_l1_size) * L1E_SIZE, 0);
    if (ret < 0) {
        goto fail;
    }

    ret = bdrv_co_flush(bs->file->bs);
    if (ret < 0) {
        goto fail;
    }

    BLKDBG_CO_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
    for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
        if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) {
            continue;
        }
        qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK,
                            s->cluster_size, QCOW2_DISCARD_ALWAYS);
        s->l1_table[i] = 0;
    }
    return 0;

fail:
    /*
     * If the write in the l1_table failed the image may contain a partially
     * overwritten l1_table. In this case it would be better to clear the
     * l1_table in memory to avoid possible image corruption.
     */
    memset(s->l1_table + new_l1_size, 0,
           (s->l1_size - new_l1_size) * L1E_SIZE);
    return ret;
}

int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                        bool exact_size)
{
    BDRVQcow2State *s = bs->opaque;
    int new_l1_size2, ret, i;
    uint64_t *new_l1_table;
    int64_t old_l1_table_offset, old_l1_size;
    int64_t new_l1_table_offset, new_l1_size;
    uint8_t data[12];

    if (min_size <= s->l1_size)
        return 0;

    /* Do a sanity check on min_size before trying to calculate new_l1_size
     * (this prevents overflows during the while loop for the calculation of
     * new_l1_size) */
    if (min_size > INT_MAX / L1E_SIZE) {
        return -EFBIG;
    }

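    /*
     * Either take the exact size requested or grow geometrically (by at
     * least a factor of 3/2 per step), so that repeated growth of the
     * image does not reallocate the L1 table every time.
     */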
    if (exact_size) {
        new_l1_size = min_size;
    } else {
        /* Bump size up to reduce the number of times we have to grow */
        new_l1_size = s->l1_size;
        if (new_l1_size == 0) {
            new_l1_size = 1;
        }
        while (min_size > new_l1_size) {
            new_l1_size = DIV_ROUND_UP(new_l1_size * 3, 2);
        }
    }

    QEMU_BUILD_BUG_ON(QCOW_MAX_L1_SIZE > INT_MAX);
    if (new_l1_size > QCOW_MAX_L1_SIZE / L1E_SIZE) {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n",
            s->l1_size, new_l1_size);
#endif

    new_l1_size2 = L1E_SIZE * new_l1_size;
    new_l1_table = qemu_try_blockalign(bs->file->bs, new_l1_size2);
    if (new_l1_table == NULL) {
        return -ENOMEM;
    }
    memset(new_l1_table, 0, new_l1_size2);

    if (s->l1_size) {
        memcpy(new_l1_table, s->l1_table, s->l1_size * L1E_SIZE);
    }

    /* write new table (align to cluster) */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
    if (new_l1_table_offset < 0) {
        qemu_vfree(new_l1_table);
        return new_l1_table_offset;
    }

    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* the L1 position has not yet been updated, so these clusters must
     * indeed be completely free */
    ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset,
                                        new_l1_size2, false);
    if (ret < 0) {
        goto fail;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
    ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset, new_l1_size2,
                           new_l1_table, 0);
    if (ret < 0)
        goto fail;
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);

    /* set new table */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
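    /*
     * The 12-byte buffer mirrors the layout of the adjacent l1_size
     * (4 bytes, big endian) and l1_table_offset (8 bytes, big endian)
     * fields of QCowHeader, so both can be updated with a single write.
     */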
    stl_be_p(data, new_l1_size);
    stq_be_p(data + 4, new_l1_table_offset);
    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size),
                           sizeof(data), data, 0);
    if (ret < 0) {
        goto fail;
    }
    qemu_vfree(s->l1_table);
    old_l1_table_offset = s->l1_table_offset;
    s->l1_table_offset = new_l1_table_offset;
    s->l1_table = new_l1_table;
    old_l1_size = s->l1_size;
    s->l1_size = new_l1_size;
    qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * L1E_SIZE,
                        QCOW2_DISCARD_OTHER);
    return 0;
 fail:
    qemu_vfree(new_l1_table);
    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
                        QCOW2_DISCARD_OTHER);
    return ret;
}

/*
 * l2_load
 *
 * @bs: The BlockDriverState
 * @offset: A guest offset, used to calculate what slice of the L2
 *          table to load.
 * @l2_offset: Offset to the L2 table in the image file.
 * @l2_slice: Location to store the pointer to the L2 slice.
 *
 * Loads an L2 slice into memory (L2 slices are the parts of L2 tables
 * that are loaded by the qcow2 cache). If the slice is in the cache,
 * the cache is used; otherwise the L2 slice is loaded from the image
 * file.
 */
static int GRAPH_RDLOCK
l2_load(BlockDriverState *bs, uint64_t offset,
        uint64_t l2_offset, uint64_t **l2_slice)
{
    BDRVQcow2State *s = bs->opaque;
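    /*
     * offset_to_l2_index() is the index within the full L2 table, while
     * offset_to_l2_slice_index() is the index within one slice; their
     * difference locates the first entry of the slice containing @offset.
     */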
    int start_of_slice = l2_entry_size(s) *
        (offset_to_l2_index(s, offset) - offset_to_l2_slice_index(s, offset));

    return qcow2_cache_get(bs, s->l2_table_cache, l2_offset + start_of_slice,
                           (void **)l2_slice);
}

/*
 * Writes an L1 entry to disk (note that depending on the alignment
 * requirements this function may write more than just one entry in
 * order to prevent bdrv_pwrite from performing a read-modify-write)
 */
int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index)
{
    BDRVQcow2State *s = bs->opaque;
    int l1_start_index;
    int i, ret;
    int bufsize = MAX(L1E_SIZE,
                      MIN(bs->file->bs->bl.request_alignment, s->cluster_size));
    int nentries = bufsize / L1E_SIZE;
    g_autofree uint64_t *buf = g_try_new0(uint64_t, nentries);

    if (buf == NULL) {
        return -ENOMEM;
    }

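    /*
     * Write the whole aligned block of entries that contains l1_index:
     * rounding down to a multiple of nentries keeps the write aligned to
     * the file's request_alignment (capped at the cluster size).
     */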
    l1_start_index = QEMU_ALIGN_DOWN(l1_index, nentries);
    for (i = 0; i < MIN(nentries, s->l1_size - l1_start_index); i++) {
        buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]);
    }

    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1,
            s->l1_table_offset + L1E_SIZE * l1_start_index, bufsize, false);
    if (ret < 0) {
        return ret;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
    ret = bdrv_pwrite_sync(bs->file,
                           s->l1_table_offset + L1E_SIZE * l1_start_index,
                           bufsize, buf, 0);
    if (ret < 0) {
        return ret;
    }

    return 0;
}

/*
 * l2_allocate
 *
 * Allocate a new l2 entry in the file. If l1_index points to an already
 * used entry in the L2 table (i.e. we are doing a copy on write for the L2
 * table) copy the contents of the old L2 table into the newly allocated one.
 * Otherwise the new table is initialized with zeros.
 *
 */

static int GRAPH_RDLOCK l2_allocate(BlockDriverState *bs, int l1_index)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t old_l2_offset;
    uint64_t *l2_slice = NULL;
    unsigned slice, slice_size2, n_slices;
    int64_t l2_offset;
    int ret;

    old_l2_offset = s->l1_table[l1_index];

    trace_qcow2_l2_allocate(bs, l1_index);

    /* allocate a new l2 entry */

    l2_offset = qcow2_alloc_clusters(bs, s->l2_size * l2_entry_size(s));
    if (l2_offset < 0) {
        ret = l2_offset;
        goto fail;
    }

    /* The offset must fit in the offset field of the L1 table entry */
    assert((l2_offset & L1E_OFFSET_MASK) == l2_offset);

    /* If we're allocating the table at offset 0 then something is wrong */
    if (l2_offset == 0) {
        qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                "allocation of L2 table at offset 0");
        ret = -EIO;
        goto fail;
    }

    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* allocate a new entry in the l2 cache */

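    /*
     * The new table is initialized (or copied from the old one) one cache
     * slice at a time: a full L2 table consists of n_slices slices of
     * slice_size2 bytes each.
     */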
    slice_size2 = s->l2_slice_size * l2_entry_size(s);
    n_slices = s->cluster_size / slice_size2;

    trace_qcow2_l2_allocate_get_empty(bs, l1_index);
    for (slice = 0; slice < n_slices; slice++) {
        ret = qcow2_cache_get_empty(bs, s->l2_table_cache,
                                    l2_offset + slice * slice_size2,
                                    (void **) &l2_slice);
        if (ret < 0) {
            goto fail;
        }

        if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
            /* if there was no old l2 table, clear the new slice */
            memset(l2_slice, 0, slice_size2);
        } else {
            uint64_t *old_slice;
            uint64_t old_l2_slice_offset =
                (old_l2_offset & L1E_OFFSET_MASK) + slice * slice_size2;

            /* if there was an old l2 table, read a slice from the disk */
            BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
            ret = qcow2_cache_get(bs, s->l2_table_cache, old_l2_slice_offset,
                                  (void **) &old_slice);
            if (ret < 0) {
                goto fail;
            }

            memcpy(l2_slice, old_slice, slice_size2);

            qcow2_cache_put(s->l2_table_cache, (void **) &old_slice);
        }

        /* write the l2 slice to the file */
        BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);

        trace_qcow2_l2_allocate_write_l2(bs, l1_index);
        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    }

    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        goto fail;
    }

    /* update the L1 entry */
    trace_qcow2_l2_allocate_write_l1(bs, l1_index);
    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
    ret = qcow2_write_l1_entry(bs, l1_index);
    if (ret < 0) {
        goto fail;
    }

    trace_qcow2_l2_allocate_done(bs, l1_index, 0);
    return 0;

fail:
    trace_qcow2_l2_allocate_done(bs, l1_index, ret);
    if (l2_slice != NULL) {
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    }
    s->l1_table[l1_index] = old_l2_offset;
    if (l2_offset > 0) {
        qcow2_free_clusters(bs, l2_offset, s->l2_size * l2_entry_size(s),
                            QCOW2_DISCARD_ALWAYS);
    }
    return ret;
}

/*
 * For a given L2 entry, count the number of contiguous subclusters of
 * the same type starting from @sc_from. Compressed clusters are
 * treated as if they were divided into subclusters of size
 * s->subcluster_size.
 *
 * Return the number of contiguous subclusters and set @type to the
 * subcluster type.
 *
 * If the L2 entry is invalid return -errno and set @type to
 * QCOW2_SUBCLUSTER_INVALID.
 */
static int GRAPH_RDLOCK
qcow2_get_subcluster_range_type(BlockDriverState *bs, uint64_t l2_entry,
                                uint64_t l2_bitmap, unsigned sc_from,
                                QCow2SubclusterType *type)
{
    BDRVQcow2State *s = bs->opaque;
    uint32_t val;

    *type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_from);

    if (*type == QCOW2_SUBCLUSTER_INVALID) {
        return -EINVAL;
    } else if (!has_subclusters(s) || *type == QCOW2_SUBCLUSTER_COMPRESSED) {
        return s->subclusters_per_cluster - sc_from;
    }

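    /*
     * In the extended L2 bitmap the lower 32 bits are the "allocated"
     * flags and the upper 32 bits the "reads as zero" flags, one bit per
     * subcluster. Setting (or clearing) all bits below sc_from and then
     * counting trailing ones (cto32) or zeroes (ctz32) yields the length
     * of the run of subclusters that share the status of sc_from.
     */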
    switch (*type) {
    case QCOW2_SUBCLUSTER_NORMAL:
        val = l2_bitmap | QCOW_OFLAG_SUB_ALLOC_RANGE(0, sc_from);
        return cto32(val) - sc_from;

    case QCOW2_SUBCLUSTER_ZERO_PLAIN:
    case QCOW2_SUBCLUSTER_ZERO_ALLOC:
        val = (l2_bitmap | QCOW_OFLAG_SUB_ZERO_RANGE(0, sc_from)) >> 32;
        return cto32(val) - sc_from;

    case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
    case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
        val = ((l2_bitmap >> 32) | l2_bitmap)
            & ~QCOW_OFLAG_SUB_ALLOC_RANGE(0, sc_from);
        return ctz32(val) - sc_from;

    default:
        g_assert_not_reached();
    }
}

/*
 * Return the number of contiguous subclusters of the exact same type
 * in a given L2 slice, starting from cluster @l2_index, subcluster
 * @sc_index. Allocated subclusters are required to be contiguous in
 * the image file.
 * At most @nb_clusters are checked (note that this means clusters,
 * not subclusters).
 * Compressed clusters are always processed one by one but for the
 * purpose of this count they are treated as if they were divided into
 * subclusters of size s->subcluster_size.
 * On failure return -errno and update @l2_index to point to the
 * invalid entry.
 */
static int GRAPH_RDLOCK
count_contiguous_subclusters(BlockDriverState *bs, int nb_clusters,
                             unsigned sc_index, uint64_t *l2_slice,
                             unsigned *l2_index)
{
    BDRVQcow2State *s = bs->opaque;
    int i, count = 0;
    bool check_offset = false;
    uint64_t expected_offset = 0;
    QCow2SubclusterType expected_type = QCOW2_SUBCLUSTER_NORMAL, type;

    assert(*l2_index + nb_clusters <= s->l2_slice_size);

    for (i = 0; i < nb_clusters; i++) {
        unsigned first_sc = (i == 0) ? sc_index : 0;
        uint64_t l2_entry = get_l2_entry(s, l2_slice, *l2_index + i);
        uint64_t l2_bitmap = get_l2_bitmap(s, l2_slice, *l2_index + i);
        int ret = qcow2_get_subcluster_range_type(bs, l2_entry, l2_bitmap,
                                                  first_sc, &type);
        if (ret < 0) {
            *l2_index += i; /* Point to the invalid entry */
            return -EIO;
        }
        if (i == 0) {
            if (type == QCOW2_SUBCLUSTER_COMPRESSED) {
                /* Compressed clusters are always processed one by one */
                return ret;
            }
            expected_type = type;
            expected_offset = l2_entry & L2E_OFFSET_MASK;
            check_offset = (type == QCOW2_SUBCLUSTER_NORMAL ||
                            type == QCOW2_SUBCLUSTER_ZERO_ALLOC ||
                            type == QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC);
        } else if (type != expected_type) {
            break;
        } else if (check_offset) {
            expected_offset += s->cluster_size;
            if (expected_offset != (l2_entry & L2E_OFFSET_MASK)) {
                break;
            }
        }
        count += ret;
        /* Stop if there are type changes before the end of the cluster */
        if (first_sc + ret < s->subclusters_per_cluster) {
            break;
        }
    }

    return count;
}

static int coroutine_fn GRAPH_RDLOCK
do_perform_cow_read(BlockDriverState *bs, uint64_t src_cluster_offset,
                    unsigned offset_in_cluster, QEMUIOVector *qiov)
{
    int ret;

    if (qiov->size == 0) {
        return 0;
    }

    BLKDBG_CO_EVENT(bs->file, BLKDBG_COW_READ);

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    /*
     * We never deal with requests that don't satisfy
     * bdrv_check_qiov_request(), and aligning requests to clusters never
     * breaks this condition. So, do some assertions before calling
     * bs->drv->bdrv_co_preadv_part() which has int64_t arguments.
     */
    assert(src_cluster_offset <= INT64_MAX);
    assert(src_cluster_offset + offset_in_cluster <= INT64_MAX);
    /* Cast qiov->size to uint64_t to silence a compiler warning on -m32 */
    assert((uint64_t)qiov->size <= INT64_MAX);
    bdrv_check_qiov_request(src_cluster_offset + offset_in_cluster, qiov->size,
                            qiov, 0, &error_abort);
    /*
     * Call .bdrv_co_preadv_part() directly instead of using the public
     * block-layer interface.  This avoids double I/O throttling and request
     * tracking, which can lead to deadlock when block layer copy-on-read is
     * enabled.
     */
    ret = bs->drv->bdrv_co_preadv_part(bs,
                                       src_cluster_offset + offset_in_cluster,
                                       qiov->size, qiov, 0, 0);
    if (ret < 0) {
        return ret;
    }

    return 0;
}

static int coroutine_fn GRAPH_RDLOCK
do_perform_cow_write(BlockDriverState *bs, uint64_t cluster_offset,
                     unsigned offset_in_cluster, QEMUIOVector *qiov)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    if (qiov->size == 0) {
        return 0;
    }

    ret = qcow2_pre_write_overlap_check(bs, 0,
            cluster_offset + offset_in_cluster, qiov->size, true);
    if (ret < 0) {
        return ret;
    }

    BLKDBG_CO_EVENT(bs->file, BLKDBG_COW_WRITE);
    ret = bdrv_co_pwritev(s->data_file, cluster_offset + offset_in_cluster,
                          qiov->size, qiov, 0);
    if (ret < 0) {
        return ret;
    }

    return 0;
}


/*
 * get_host_offset
 *
 * For a given offset of the virtual disk find the equivalent host
 * offset in the qcow2 file and store it in *host_offset. Neither
 * offset needs to be aligned to a cluster boundary.
 *
 * If the cluster is unallocated then *host_offset will be 0.
 * If the cluster is compressed then *host_offset will contain the l2 entry.
 *
 * On entry, *bytes is the maximum number of contiguous bytes starting at
 * offset that we are interested in.
 *
 * On exit, *bytes is the number of bytes starting at offset that have the same
 * subcluster type and (if applicable) are stored contiguously in the image
 * file. The subcluster type is stored in *subcluster_type.
 * Compressed clusters are always processed one by one.
 *
 * Returns 0 on success, -errno in error cases.
 */
int qcow2_get_host_offset(BlockDriverState *bs, uint64_t offset,
                          unsigned int *bytes, uint64_t *host_offset,
                          QCow2SubclusterType *subcluster_type)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int l2_index, sc_index;
    uint64_t l1_index, l2_offset, *l2_slice, l2_entry, l2_bitmap;
    int sc;
    unsigned int offset_in_cluster;
    uint64_t bytes_available, bytes_needed, nb_clusters;
    QCow2SubclusterType type;
    int ret;

    offset_in_cluster = offset_into_cluster(s, offset);
    bytes_needed = (uint64_t) *bytes + offset_in_cluster;

    /* compute how many bytes there are between the start of the cluster
     * containing offset and the end of the l2 slice that contains
     * the entry pointing to it */
    bytes_available =
        ((uint64_t) (s->l2_slice_size - offset_to_l2_slice_index(s, offset)))
        << s->cluster_bits;

    if (bytes_needed > bytes_available) {
        bytes_needed = bytes_available;
    }

    *host_offset = 0;

    /* seek to the l2 offset in the l1 table */

    l1_index = offset_to_l1_index(s, offset);
    if (l1_index >= s->l1_size) {
        type = QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN;
        goto out;
    }

    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
    if (!l2_offset) {
        type = QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN;
        goto out;
    }

    if (offset_into_cluster(s, l2_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
                                " unaligned (L1 index: %#" PRIx64 ")",
                                l2_offset, l1_index);
        return -EIO;
    }

    /* load the l2 slice in memory */

    ret = l2_load(bs, offset, l2_offset, &l2_slice);
    if (ret < 0) {
        return ret;
    }

    /* find the cluster offset for the given disk offset */

    l2_index = offset_to_l2_slice_index(s, offset);
    sc_index = offset_to_sc_index(s, offset);
    l2_entry = get_l2_entry(s, l2_slice, l2_index);
    l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);

    nb_clusters = size_to_clusters(s, bytes_needed);
    /* bytes_needed <= *bytes + offset_in_cluster, both of which are unsigned
     * integers; the minimum cluster size is 512, so this assertion is always
     * true */
    assert(nb_clusters <= INT_MAX);

    type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_index);
    if (s->qcow_version < 3 && (type == QCOW2_SUBCLUSTER_ZERO_PLAIN ||
                                type == QCOW2_SUBCLUSTER_ZERO_ALLOC)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found"
                                " in pre-v3 image (L2 offset: %#" PRIx64
                                ", L2 index: %#x)", l2_offset, l2_index);
        ret = -EIO;
        goto fail;
    }
    switch (type) {
    case QCOW2_SUBCLUSTER_INVALID:
        break; /* This is handled by count_contiguous_subclusters() below */
    case QCOW2_SUBCLUSTER_COMPRESSED:
        if (has_data_file(bs)) {
            qcow2_signal_corruption(bs, true, -1, -1, "Compressed cluster "
                                    "entry found in image with external data "
                                    "file (L2 offset: %#" PRIx64 ", L2 index: "
                                    "%#x)", l2_offset, l2_index);
            ret = -EIO;
            goto fail;
        }
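        /*
         * For compressed clusters the caller gets the whole L2 entry,
         * since it encodes both the host offset and the compressed size.
         */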
        *host_offset = l2_entry;
        break;
    case QCOW2_SUBCLUSTER_ZERO_PLAIN:
    case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
        break;
    case QCOW2_SUBCLUSTER_ZERO_ALLOC:
    case QCOW2_SUBCLUSTER_NORMAL:
    case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC: {
        uint64_t host_cluster_offset = l2_entry & L2E_OFFSET_MASK;
        *host_offset = host_cluster_offset + offset_in_cluster;
        if (offset_into_cluster(s, host_cluster_offset)) {
            qcow2_signal_corruption(bs, true, -1, -1,
                                    "Cluster allocation offset %#"
                                    PRIx64 " unaligned (L2 offset: %#" PRIx64
                                    ", L2 index: %#x)", host_cluster_offset,
                                    l2_offset, l2_index);
            ret = -EIO;
            goto fail;
        }
        if (has_data_file(bs) && *host_offset != offset) {
            qcow2_signal_corruption(bs, true, -1, -1,
                                    "External data file host cluster offset %#"
                                    PRIx64 " does not match guest cluster "
                                    "offset: %#" PRIx64
                                    ", L2 index: %#x)", host_cluster_offset,
                                    offset - offset_in_cluster, l2_index);
            ret = -EIO;
            goto fail;
        }
        break;
    }
    default:
        abort();
    }

    sc = count_contiguous_subclusters(bs, nb_clusters, sc_index,
                                      l2_slice, &l2_index);
    if (sc < 0) {
        qcow2_signal_corruption(bs, true, -1, -1, "Invalid cluster entry found "
                                " (L2 offset: %#" PRIx64 ", L2 index: %#x)",
                                l2_offset, l2_index);
        ret = -EIO;
        goto fail;
    }
    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

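    /* sc is counted from sc_index, so measured from the start of the
     * cluster the contiguous run covers sc + sc_index subclusters */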
    bytes_available = ((int64_t)sc + sc_index) << s->subcluster_bits;

out:
    if (bytes_available > bytes_needed) {
        bytes_available = bytes_needed;
    }

    /* bytes_available <= bytes_needed <= *bytes + offset_in_cluster;
     * subtracting offset_in_cluster will therefore definitely yield something
     * not exceeding UINT_MAX */
    assert(bytes_available - offset_in_cluster <= UINT_MAX);
    *bytes = bytes_available - offset_in_cluster;

    *subcluster_type = type;

    return 0;

fail:
    qcow2_cache_put(s->l2_table_cache, (void **)&l2_slice);
    return ret;
}

/*
 * get_cluster_table
 *
 * for a given disk offset, load (and allocate if needed)
 * the appropriate slice of its l2 table.
 *
 * the cluster index in the l2 slice is given to the caller.
 *
 * Returns 0 on success, -errno in failure case
 */
static int GRAPH_RDLOCK
get_cluster_table(BlockDriverState *bs, uint64_t offset,
                  uint64_t **new_l2_slice, int *new_l2_index)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int l2_index;
    uint64_t l1_index, l2_offset;
    uint64_t *l2_slice = NULL;
    int ret;

    /* seek to the l2 offset in the l1 table */

    l1_index = offset_to_l1_index(s, offset);
    if (l1_index >= s->l1_size) {
        ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
        if (ret < 0) {
            return ret;
        }
    }

    assert(l1_index < s->l1_size);
    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
    if (offset_into_cluster(s, l2_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
                                " unaligned (L1 index: %#" PRIx64 ")",
                                l2_offset, l1_index);
        return -EIO;
    }

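    /*
     * Without QCOW_OFLAG_COPIED the L2 table is either missing or shared
     * with a snapshot (refcount > 1), so it must not be modified in
     * place: allocate a fresh table (copy-on-write at the L2 level).
     */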
    if (!(s->l1_table[l1_index] & QCOW_OFLAG_COPIED)) {
        /* First allocate a new L2 table (and do COW if needed) */
        ret = l2_allocate(bs, l1_index);
        if (ret < 0) {
            return ret;
        }

        /* Then decrease the refcount of the old table */
        if (l2_offset) {
            qcow2_free_clusters(bs, l2_offset, s->l2_size * l2_entry_size(s),
                                QCOW2_DISCARD_OTHER);
        }

        /* Get the offset of the newly-allocated l2 table */
        l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
        assert(offset_into_cluster(s, l2_offset) == 0);
    }

    /* load the l2 slice in memory */
    ret = l2_load(bs, offset, l2_offset, &l2_slice);
    if (ret < 0) {
        return ret;
    }

    /* find the cluster offset for the given disk offset */

    l2_index = offset_to_l2_slice_index(s, offset);

    *new_l2_slice = l2_slice;
    *new_l2_index = l2_index;

    return 0;
}

/*
 * alloc_compressed_cluster_offset
 *
 * For a given offset on the virtual disk, allocate a new compressed cluster
 * and put the host offset of the cluster into *host_offset. If a cluster is
 * already allocated at the offset, return an error.
 *
 * Return 0 on success and -errno in error cases
 */
int coroutine_fn GRAPH_RDLOCK
qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, uint64_t offset,
                                      int compressed_size, uint64_t *host_offset)
{
    BDRVQcow2State *s = bs->opaque;
    int l2_index, ret;
    uint64_t *l2_slice;
    int64_t cluster_offset;
    int nb_csectors;

    if (has_data_file(bs)) {
        return 0;
    }

    ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
    if (ret < 0) {
        return ret;
    }

    /* Compression can't overwrite anything. Fail if the cluster was already
     * allocated. */
    cluster_offset = get_l2_entry(s, l2_slice, l2_index);
    if (cluster_offset & L2E_OFFSET_MASK) {
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
        return -EIO;
    }

    cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
    if (cluster_offset < 0) {
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
        return cluster_offset;
    }

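    /*
     * The compressed cluster descriptor stores the number of additional
     * 512-byte sectors (beyond the one containing cluster_offset) that
     * the compressed data occupies in the image file.
     */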
    nb_csectors =
        (cluster_offset + compressed_size - 1) / QCOW2_COMPRESSED_SECTOR_SIZE -
        (cluster_offset / QCOW2_COMPRESSED_SECTOR_SIZE);

    /* The offset and size must fit in their fields of the L2 table entry */
    assert((cluster_offset & s->cluster_offset_mask) == cluster_offset);
    assert((nb_csectors & s->csize_mask) == nb_csectors);

    cluster_offset |= QCOW_OFLAG_COMPRESSED |
                      ((uint64_t)nb_csectors << s->csize_shift);

    /* update L2 table */

    /* compressed clusters never have the copied flag */

    BLKDBG_CO_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
    set_l2_entry(s, l2_slice, l2_index, cluster_offset);
    if (has_subclusters(s)) {
        set_l2_bitmap(s, l2_slice, l2_index, 0);
    }
    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

    *host_offset = cluster_offset & s->cluster_offset_mask;
    return 0;
}

static int coroutine_fn GRAPH_RDLOCK
perform_cow(BlockDriverState *bs, QCowL2Meta *m)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2COWRegion *start = &m->cow_start;
    Qcow2COWRegion *end = &m->cow_end;
    unsigned buffer_size;
    unsigned data_bytes = end->offset - (start->offset + start->nb_bytes);
    bool merge_reads;
    uint8_t *start_buffer, *end_buffer;
    QEMUIOVector qiov;
    int ret;

    assert(start->nb_bytes <= UINT_MAX - end->nb_bytes);
    assert(start->nb_bytes + end->nb_bytes <= UINT_MAX - data_bytes);
    assert(start->offset + start->nb_bytes <= end->offset);

    if ((start->nb_bytes == 0 && end->nb_bytes == 0) || m->skip_cow) {
        return 0;
    }

    /* If we have to read both the start and end COW regions and the
     * middle region is not too large then perform just one read
     * operation */
    merge_reads = start->nb_bytes && end->nb_bytes && data_bytes <= 16384;
    if (merge_reads) {
        buffer_size = start->nb_bytes + data_bytes + end->nb_bytes;
    } else {
        /* If we have to do two reads, add some padding in the middle
         * if necessary to make sure that the end region is optimally
         * aligned. */
        size_t align = bdrv_opt_mem_align(bs);
        assert(align > 0 && align <= UINT_MAX);
        assert(QEMU_ALIGN_UP(start->nb_bytes, align) <=
               UINT_MAX - end->nb_bytes);
        buffer_size = QEMU_ALIGN_UP(start->nb_bytes, align) + end->nb_bytes;
    }

    /* Reserve a buffer large enough to store all the data that we're
     * going to read */
    start_buffer = qemu_try_blockalign(bs, buffer_size);
    if (start_buffer == NULL) {
        return -ENOMEM;
    }
    /* The part of the buffer where the end region is located */
    end_buffer = start_buffer + buffer_size - end->nb_bytes;
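    /*
     * The I/O vector needs room for the start and end buffers plus, if
     * the guest data is written in the same operation, every element of
     * the affected part of m->data_qiov.
     */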
933672f0f2cSAlberto Garcia 
9345396234bSVladimir Sementsov-Ogievskiy     qemu_iovec_init(&qiov, 2 + (m->data_qiov ?
9355396234bSVladimir Sementsov-Ogievskiy                                 qemu_iovec_subvec_niov(m->data_qiov,
9365396234bSVladimir Sementsov-Ogievskiy                                                        m->data_qiov_offset,
9375396234bSVladimir Sementsov-Ogievskiy                                                        data_bytes)
9385396234bSVladimir Sementsov-Ogievskiy                                 : 0));
93986b862c4SAlberto Garcia 
940593fb83cSKevin Wolf     qemu_co_mutex_unlock(&s->lock);
941b3cf1c7cSAlberto Garcia     /* First we read the existing data from both COW regions. We
942b3cf1c7cSAlberto Garcia      * either read the whole region in one go, or the start and end
943b3cf1c7cSAlberto Garcia      * regions separately. */
944b3cf1c7cSAlberto Garcia     if (merge_reads) {
94586b862c4SAlberto Garcia         qemu_iovec_add(&qiov, start_buffer, buffer_size);
94686b862c4SAlberto Garcia         ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov);
947b3cf1c7cSAlberto Garcia     } else {
94886b862c4SAlberto Garcia         qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
94986b862c4SAlberto Garcia         ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov);
950593fb83cSKevin Wolf         if (ret < 0) {
95199450c6fSAlberto Garcia             goto fail;
952593fb83cSKevin Wolf         }
953593fb83cSKevin Wolf 
95486b862c4SAlberto Garcia         qemu_iovec_reset(&qiov);
95586b862c4SAlberto Garcia         qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
95686b862c4SAlberto Garcia         ret = do_perform_cow_read(bs, m->offset, end->offset, &qiov);
957b3cf1c7cSAlberto Garcia     }
958672f0f2cSAlberto Garcia     if (ret < 0) {
959672f0f2cSAlberto Garcia         goto fail;
960672f0f2cSAlberto Garcia     }
96199450c6fSAlberto Garcia 
962672f0f2cSAlberto Garcia     /* Encrypt the data if necessary before writing it */
963672f0f2cSAlberto Garcia     if (bs->encrypted) {
964603fbd07SMaxim Levitsky         ret = qcow2_co_encrypt(bs,
965603fbd07SMaxim Levitsky                                m->alloc_offset + start->offset,
966603fbd07SMaxim Levitsky                                m->offset + start->offset,
967603fbd07SMaxim Levitsky                                start_buffer, start->nb_bytes);
968603fbd07SMaxim Levitsky         if (ret < 0) {
969603fbd07SMaxim Levitsky             goto fail;
970603fbd07SMaxim Levitsky         }
971603fbd07SMaxim Levitsky 
972603fbd07SMaxim Levitsky         ret = qcow2_co_encrypt(bs,
973603fbd07SMaxim Levitsky                                m->alloc_offset + end->offset,
974603fbd07SMaxim Levitsky                                m->offset + end->offset,
975603fbd07SMaxim Levitsky                                end_buffer, end->nb_bytes);
976603fbd07SMaxim Levitsky         if (ret < 0) {
977672f0f2cSAlberto Garcia             goto fail;
978672f0f2cSAlberto Garcia         }
979672f0f2cSAlberto Garcia     }
980672f0f2cSAlberto Garcia 
981ee22a9d8SAlberto Garcia     /* And now we can write everything. If we have the guest data we
982ee22a9d8SAlberto Garcia      * can write everything in a single operation */
983ee22a9d8SAlberto Garcia     if (m->data_qiov) {
984ee22a9d8SAlberto Garcia         qemu_iovec_reset(&qiov);
985ee22a9d8SAlberto Garcia         if (start->nb_bytes) {
986ee22a9d8SAlberto Garcia             qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
987ee22a9d8SAlberto Garcia         }
9885396234bSVladimir Sementsov-Ogievskiy         qemu_iovec_concat(&qiov, m->data_qiov, m->data_qiov_offset, data_bytes);
989ee22a9d8SAlberto Garcia         if (end->nb_bytes) {
990ee22a9d8SAlberto Garcia             qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
991ee22a9d8SAlberto Garcia         }
992ee22a9d8SAlberto Garcia         /* NOTE: we have a write_aio blkdebug event here followed by
993ee22a9d8SAlberto Garcia          * a cow_write one in do_perform_cow_write(), but there's only
994ee22a9d8SAlberto Garcia          * a single I/O operation */
99517362398SPaolo Bonzini         BLKDBG_CO_EVENT(bs->file, BLKDBG_WRITE_AIO);
996ee22a9d8SAlberto Garcia         ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
997ee22a9d8SAlberto Garcia     } else {
998ee22a9d8SAlberto Garcia         /* If there's no guest data then write both COW regions separately */
99986b862c4SAlberto Garcia         qemu_iovec_reset(&qiov);
100086b862c4SAlberto Garcia         qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
100186b862c4SAlberto Garcia         ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
1002672f0f2cSAlberto Garcia         if (ret < 0) {
1003672f0f2cSAlberto Garcia             goto fail;
1004672f0f2cSAlberto Garcia         }
1005672f0f2cSAlberto Garcia 
100686b862c4SAlberto Garcia         qemu_iovec_reset(&qiov);
100786b862c4SAlberto Garcia         qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
100886b862c4SAlberto Garcia         ret = do_perform_cow_write(bs, m->alloc_offset, end->offset, &qiov);
1009ee22a9d8SAlberto Garcia     }
1010ee22a9d8SAlberto Garcia 
101199450c6fSAlberto Garcia fail:
101299450c6fSAlberto Garcia     qemu_co_mutex_lock(&s->lock);
101399450c6fSAlberto Garcia 
1014593fb83cSKevin Wolf     /*
1015593fb83cSKevin Wolf      * Before we update the L2 table to actually point to the new cluster, we
1016593fb83cSKevin Wolf      * need to be sure that the refcounts have been increased and COW was
1017593fb83cSKevin Wolf      * handled.
1018593fb83cSKevin Wolf      */
101999450c6fSAlberto Garcia     if (ret == 0) {
1020593fb83cSKevin Wolf         qcow2_cache_depends_on_flush(s->l2_table_cache);
102199450c6fSAlberto Garcia     }
1022593fb83cSKevin Wolf 
1023672f0f2cSAlberto Garcia     qemu_vfree(start_buffer);
102486b862c4SAlberto Garcia     qemu_iovec_destroy(&qiov);
102599450c6fSAlberto Garcia     return ret;
1026593fb83cSKevin Wolf }
1027593fb83cSKevin Wolf 
1028050ed2e7SPaolo Bonzini int coroutine_fn qcow2_alloc_cluster_link_l2(BlockDriverState *bs,
1029050ed2e7SPaolo Bonzini                                              QCowL2Meta *m)
103045aba42fSKevin Wolf {
1031ff99129aSKevin Wolf     BDRVQcow2State *s = bs->opaque;
103245aba42fSKevin Wolf     int i, j = 0, l2_index, ret;
1033a002c0b0SAlberto Garcia     uint64_t *old_cluster, *l2_slice;
1034250196f1SKevin Wolf     uint64_t cluster_offset = m->alloc_offset;
103545aba42fSKevin Wolf 
10363cce16f4SKevin Wolf     trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
1037f50f88b9SKevin Wolf     assert(m->nb_clusters > 0);
103845aba42fSKevin Wolf 
10395839e53bSMarkus Armbruster     old_cluster = g_try_new(uint64_t, m->nb_clusters);
1040de82815dSKevin Wolf     if (old_cluster == NULL) {
1041de82815dSKevin Wolf         ret = -ENOMEM;
1042de82815dSKevin Wolf         goto err;
1043de82815dSKevin Wolf     }
104445aba42fSKevin Wolf 
104545aba42fSKevin Wolf     /* copy content of unmodified sectors */
104699450c6fSAlberto Garcia     ret = perform_cow(bs, m);
1047593fb83cSKevin Wolf     if (ret < 0) {
104845aba42fSKevin Wolf         goto err;
104945aba42fSKevin Wolf     }
105045aba42fSKevin Wolf 
1051593fb83cSKevin Wolf     /* Update L2 table. */
105274c4510aSKevin Wolf     if (s->use_lazy_refcounts) {
1053280d3735SKevin Wolf         qcow2_mark_dirty(bs);
1054280d3735SKevin Wolf     }
1055bfe8043eSStefan Hajnoczi     if (qcow2_need_accurate_refcounts(s)) {
1056bfe8043eSStefan Hajnoczi         qcow2_cache_set_dependency(bs, s->l2_table_cache,
1057bfe8043eSStefan Hajnoczi                                    s->refcount_block_cache);
1058bfe8043eSStefan Hajnoczi     }
1059280d3735SKevin Wolf 
1060a002c0b0SAlberto Garcia     ret = get_cluster_table(bs, m->offset, &l2_slice, &l2_index);
10611e3e8f1aSKevin Wolf     if (ret < 0) {
106245aba42fSKevin Wolf         goto err;
10631e3e8f1aSKevin Wolf     }
1064a002c0b0SAlberto Garcia     qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
106545aba42fSKevin Wolf 
1066a002c0b0SAlberto Garcia     assert(l2_index + m->nb_clusters <= s->l2_slice_size);
10673441ad4bSAlberto Garcia     assert(m->cow_end.offset + m->cow_end.nb_bytes <=
10683441ad4bSAlberto Garcia            m->nb_clusters << s->cluster_bits);
106945aba42fSKevin Wolf     for (i = 0; i < m->nb_clusters; i++) {
1070348fcc4fSTuguoyi         uint64_t offset = cluster_offset + ((uint64_t)i << s->cluster_bits);
107145aba42fSKevin Wolf         /* If two concurrent writes happen to the same unallocated cluster,
107245aba42fSKevin Wolf          * each write allocates a separate cluster and writes data concurrently.
107345aba42fSKevin Wolf          * The first one to complete updates the L2 table with a pointer to its
107445aba42fSKevin Wolf          * cluster; the second one has to do RMW (which is done above by
1075aaa4d20bSKevin Wolf          * perform_cow()), update the L2 table with its cluster pointer and free
107645aba42fSKevin Wolf          * the old cluster. This is what this loop does. */
107712c6aebeSAlberto Garcia         if (get_l2_entry(s, l2_slice, l2_index + i) != 0) {
107812c6aebeSAlberto Garcia             old_cluster[j++] = get_l2_entry(s, l2_slice, l2_index + i);
1079aaa4d20bSKevin Wolf         }
108045aba42fSKevin Wolf 
10813a75a870SAlberto Garcia         /* The offset must fit in the offset field of the L2 table entry */
10823a75a870SAlberto Garcia         assert((offset & L2E_OFFSET_MASK) == offset);
10833a75a870SAlberto Garcia 
108412c6aebeSAlberto Garcia         set_l2_entry(s, l2_slice, l2_index + i, offset | QCOW_OFLAG_COPIED);
1085aca00cd9SAlberto Garcia 
1086aca00cd9SAlberto Garcia         /* Update bitmap with the subclusters that were just written */
108740dee943SAlberto Garcia         if (has_subclusters(s) && !m->prealloc) {
1088aca00cd9SAlberto Garcia             uint64_t l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i);
1089aca00cd9SAlberto Garcia             unsigned written_from = m->cow_start.offset;
10903441ad4bSAlberto Garcia             unsigned written_to = m->cow_end.offset + m->cow_end.nb_bytes;
1091aca00cd9SAlberto Garcia             int first_sc, last_sc;
1092aca00cd9SAlberto Garcia             /* Narrow written_from and written_to down to the current cluster */
1093aca00cd9SAlberto Garcia             written_from = MAX(written_from, i << s->cluster_bits);
1094aca00cd9SAlberto Garcia             written_to   = MIN(written_to, (i + 1) << s->cluster_bits);
1095aca00cd9SAlberto Garcia             assert(written_from < written_to);
1096aca00cd9SAlberto Garcia             first_sc = offset_to_sc_index(s, written_from);
1097aca00cd9SAlberto Garcia             last_sc  = offset_to_sc_index(s, written_to - 1);
1098aca00cd9SAlberto Garcia             l2_bitmap |= QCOW_OFLAG_SUB_ALLOC_RANGE(first_sc, last_sc + 1);
1099aca00cd9SAlberto Garcia             l2_bitmap &= ~QCOW_OFLAG_SUB_ZERO_RANGE(first_sc, last_sc + 1);
1100aca00cd9SAlberto Garcia             set_l2_bitmap(s, l2_slice, l2_index + i, l2_bitmap);
1101aca00cd9SAlberto Garcia         }
110245aba42fSKevin Wolf     }
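    /*
     * Worked example for the bitmap update above (illustrative values):
     * with 64 KiB clusters and extended L2 entries there are 32
     * subclusters of 2 KiB each. A write covering bytes 6144..14335 of
     * a cluster gives first_sc = 3 and last_sc = 6, so bits 3..6 of the
     * "allocated" half of the bitmap are set and the corresponding
     * "all zeroes" bits are cleared.
     */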
110345aba42fSKevin Wolf 
1105a002c0b0SAlberto Garcia     qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
110645aba42fSKevin Wolf 
11077ec5e6a4SKevin Wolf     /*
11087ec5e6a4SKevin Wolf      * If this was a COW, we need to decrease the refcount of the old cluster.
11096cfcb9b8SKevin Wolf      *
11106cfcb9b8SKevin Wolf      * Don't discard clusters that reach a refcount of 0 (e.g. compressed
11116cfcb9b8SKevin Wolf      * clusters), as the next write will reuse them anyway.
11127ec5e6a4SKevin Wolf      */
1113564a6b69SMax Reitz     if (!m->keep_old_clusters && j != 0) {
11147ec5e6a4SKevin Wolf         for (i = 0; i < j; i++) {
11153fec237fSAlberto Garcia             qcow2_free_any_cluster(bs, old_cluster[i], QCOW2_DISCARD_NEVER);
11167ec5e6a4SKevin Wolf         }
11177ec5e6a4SKevin Wolf     }
111845aba42fSKevin Wolf 
111945aba42fSKevin Wolf     ret = 0;
112045aba42fSKevin Wolf err:
11217267c094SAnthony Liguori     g_free(old_cluster);
112245aba42fSKevin Wolf     return ret;
112345aba42fSKevin Wolf }
112445aba42fSKevin Wolf 
11258b24cd14SKevin Wolf /**
11268b24cd14SKevin Wolf  * Frees the allocated clusters because the request failed and they won't
11278b24cd14SKevin Wolf  * actually be linked.
11288b24cd14SKevin Wolf  */
1129a39bae4eSPaolo Bonzini void coroutine_fn qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m)
11308b24cd14SKevin Wolf {
11318b24cd14SKevin Wolf     BDRVQcow2State *s = bs->opaque;
11323ede935fSMax Reitz     if (!has_data_file(bs) && !m->keep_old_clusters) {
1133c3b6658cSKevin Wolf         qcow2_free_clusters(bs, m->alloc_offset,
1134c3b6658cSKevin Wolf                             m->nb_clusters << s->cluster_bits,
11358b24cd14SKevin Wolf                             QCOW2_DISCARD_NEVER);
11368b24cd14SKevin Wolf     }
1137c3b6658cSKevin Wolf }
11388b24cd14SKevin Wolf 
113945aba42fSKevin Wolf /*
11408f91d690SAlberto Garcia  * For a given write request, create a new QCowL2Meta structure, add
114157538c86SAlberto Garcia  * it to @m and to the BDRVQcow2State.cluster_allocs list. If the write
114257538c86SAlberto Garcia  * request needs neither copy-on-write nor changes to the L2 metadata,
114357538c86SAlberto Garcia  * then this function does nothing.
11448f91d690SAlberto Garcia  *
11458f91d690SAlberto Garcia  * @host_cluster_offset points to the beginning of the first cluster.
11468f91d690SAlberto Garcia  *
11478f91d690SAlberto Garcia  * @guest_offset and @bytes indicate the offset and length of the
11488f91d690SAlberto Garcia  * request.
11498f91d690SAlberto Garcia  *
115057538c86SAlberto Garcia  * @l2_slice contains the L2 entries of all clusters involved in this
115157538c86SAlberto Garcia  * write request.
115257538c86SAlberto Garcia  *
11538f91d690SAlberto Garcia  * If @keep_old is true it means that the clusters were already
11548f91d690SAlberto Garcia  * allocated and will be overwritten. If false then the clusters are
11558f91d690SAlberto Garcia  * new and we have to decrease the reference count of the old ones.
1156d53ec3d8SAlberto Garcia  *
1157d53ec3d8SAlberto Garcia  * Returns 0 on success, -errno on failure.
11588f91d690SAlberto Garcia  */
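/*
 * Illustrative example (values assumed, not taken from a caller): with
 * 64 KiB clusters, no extended L2 entries and keep_old == false, a
 * 4 KiB write at 5 KiB into an unallocated cluster yields
 *
 *   cow_start = { .offset = 0,    .nb_bytes = 5120 }
 *   cow_end   = { .offset = 9216, .nb_bytes = 65536 - 9216 }
 *
 * i.e. everything in the cluster around the written area must be COWed
 * (here: filled from the backing file, or with zeroes if there is none).
 */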
11590bb79c97SKevin Wolf static int coroutine_fn GRAPH_RDLOCK
11600bb79c97SKevin Wolf calculate_l2_meta(BlockDriverState *bs, uint64_t host_cluster_offset,
11610bb79c97SKevin Wolf                   uint64_t guest_offset, unsigned bytes, uint64_t *l2_slice,
11620bb79c97SKevin Wolf                   QCowL2Meta **m, bool keep_old)
11638f91d690SAlberto Garcia {
11648f91d690SAlberto Garcia     BDRVQcow2State *s = bs->opaque;
1165d53ec3d8SAlberto Garcia     int sc_index, l2_index = offset_to_l2_slice_index(s, guest_offset);
1166d53ec3d8SAlberto Garcia     uint64_t l2_entry, l2_bitmap;
116757538c86SAlberto Garcia     unsigned cow_start_from, cow_end_to;
11688f91d690SAlberto Garcia     unsigned cow_start_to = offset_into_cluster(s, guest_offset);
11698f91d690SAlberto Garcia     unsigned cow_end_from = cow_start_to + bytes;
11708f91d690SAlberto Garcia     unsigned nb_clusters = size_to_clusters(s, cow_end_from);
11718f91d690SAlberto Garcia     QCowL2Meta *old_m = *m;
1172d53ec3d8SAlberto Garcia     QCow2SubclusterType type;
1173d53ec3d8SAlberto Garcia     int i;
1174d53ec3d8SAlberto Garcia     bool skip_cow = keep_old;
117557538c86SAlberto Garcia 
117657538c86SAlberto Garcia     assert(nb_clusters <= s->l2_slice_size - l2_index);
117757538c86SAlberto Garcia 
1178d53ec3d8SAlberto Garcia     /* Check the type of all affected subclusters */
117957538c86SAlberto Garcia     for (i = 0; i < nb_clusters; i++) {
118012c6aebeSAlberto Garcia         l2_entry = get_l2_entry(s, l2_slice, l2_index + i);
1181d53ec3d8SAlberto Garcia         l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i);
1182d53ec3d8SAlberto Garcia         if (skip_cow) {
1183d53ec3d8SAlberto Garcia             unsigned write_from = MAX(cow_start_to, i << s->cluster_bits);
1184d53ec3d8SAlberto Garcia             unsigned write_to = MIN(cow_end_from, (i + 1) << s->cluster_bits);
1185d53ec3d8SAlberto Garcia             int first_sc = offset_to_sc_index(s, write_from);
1186d53ec3d8SAlberto Garcia             int last_sc = offset_to_sc_index(s, write_to - 1);
1187d53ec3d8SAlberto Garcia             int cnt = qcow2_get_subcluster_range_type(bs, l2_entry, l2_bitmap,
1188d53ec3d8SAlberto Garcia                                                       first_sc, &type);
1189d53ec3d8SAlberto Garcia             /* Are any of the subclusters of type != QCOW2_SUBCLUSTER_NORMAL? */
1190d53ec3d8SAlberto Garcia             if (type != QCOW2_SUBCLUSTER_NORMAL || first_sc + cnt <= last_sc) {
1191d53ec3d8SAlberto Garcia                 skip_cow = false;
1192d53ec3d8SAlberto Garcia             }
1193d53ec3d8SAlberto Garcia         } else {
1194d53ec3d8SAlberto Garcia             /* If we can't skip the COW we can still look for invalid entries */
1195d53ec3d8SAlberto Garcia             type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, 0);
1196d53ec3d8SAlberto Garcia         }
1197d53ec3d8SAlberto Garcia         if (type == QCOW2_SUBCLUSTER_INVALID) {
1198d53ec3d8SAlberto Garcia             int l1_index = offset_to_l1_index(s, guest_offset);
1199d53ec3d8SAlberto Garcia             uint64_t l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
1200d53ec3d8SAlberto Garcia             qcow2_signal_corruption(bs, true, -1, -1, "Invalid cluster "
1201d53ec3d8SAlberto Garcia                                     "entry found (L2 offset: %#" PRIx64
1202d53ec3d8SAlberto Garcia                                     ", L2 index: %#x)",
1203d53ec3d8SAlberto Garcia                                     l2_offset, l2_index + i);
1204d53ec3d8SAlberto Garcia             return -EIO;
120557538c86SAlberto Garcia         }
120657538c86SAlberto Garcia     }
1207d53ec3d8SAlberto Garcia 
1208d53ec3d8SAlberto Garcia     if (skip_cow) {
1209d53ec3d8SAlberto Garcia         return 0;
121057538c86SAlberto Garcia     }
121157538c86SAlberto Garcia 
121257538c86SAlberto Garcia     /* Get the L2 entry of the first cluster */
121312c6aebeSAlberto Garcia     l2_entry = get_l2_entry(s, l2_slice, l2_index);
1214d53ec3d8SAlberto Garcia     l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);
1215d53ec3d8SAlberto Garcia     sc_index = offset_to_sc_index(s, guest_offset);
1216d53ec3d8SAlberto Garcia     type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_index);
121757538c86SAlberto Garcia 
1218d53ec3d8SAlberto Garcia     if (!keep_old) {
1219d53ec3d8SAlberto Garcia         switch (type) {
1220d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_COMPRESSED:
1221d53ec3d8SAlberto Garcia             cow_start_from = 0;
1222d53ec3d8SAlberto Garcia             break;
1223d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_NORMAL:
1224d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_ZERO_ALLOC:
1225d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
1226d53ec3d8SAlberto Garcia             if (has_subclusters(s)) {
1227d53ec3d8SAlberto Garcia                 /* Skip all leading zero and unallocated subclusters */
1228d53ec3d8SAlberto Garcia                 uint32_t alloc_bitmap = l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC;
1229d53ec3d8SAlberto Garcia                 cow_start_from =
1230d53ec3d8SAlberto Garcia                     MIN(sc_index, ctz32(alloc_bitmap)) << s->subcluster_bits;
123157538c86SAlberto Garcia             } else {
123257538c86SAlberto Garcia                 cow_start_from = 0;
123357538c86SAlberto Garcia             }
1234d53ec3d8SAlberto Garcia             break;
1235d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_ZERO_PLAIN:
1236d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
1237d53ec3d8SAlberto Garcia             cow_start_from = sc_index << s->subcluster_bits;
1238d53ec3d8SAlberto Garcia             break;
1239d53ec3d8SAlberto Garcia         default:
1240d53ec3d8SAlberto Garcia             g_assert_not_reached();
1241d53ec3d8SAlberto Garcia         }
1242d53ec3d8SAlberto Garcia     } else {
1243d53ec3d8SAlberto Garcia         switch (type) {
1244d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_NORMAL:
1245d53ec3d8SAlberto Garcia             cow_start_from = cow_start_to;
1246d53ec3d8SAlberto Garcia             break;
1247d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_ZERO_ALLOC:
1248d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
1249d53ec3d8SAlberto Garcia             cow_start_from = sc_index << s->subcluster_bits;
1250d53ec3d8SAlberto Garcia             break;
1251d53ec3d8SAlberto Garcia         default:
1252d53ec3d8SAlberto Garcia             g_assert_not_reached();
1253d53ec3d8SAlberto Garcia         }
1254d53ec3d8SAlberto Garcia     }
125557538c86SAlberto Garcia 
125657538c86SAlberto Garcia     /* Get the L2 entry of the last cluster */
1257d53ec3d8SAlberto Garcia     l2_index += nb_clusters - 1;
1258d53ec3d8SAlberto Garcia     l2_entry = get_l2_entry(s, l2_slice, l2_index);
1259d53ec3d8SAlberto Garcia     l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);
1260d53ec3d8SAlberto Garcia     sc_index = offset_to_sc_index(s, guest_offset + bytes - 1);
1261d53ec3d8SAlberto Garcia     type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_index);
126257538c86SAlberto Garcia 
1263d53ec3d8SAlberto Garcia     if (!keep_old) {
1264d53ec3d8SAlberto Garcia         switch (type) {
1265d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_COMPRESSED:
126657538c86SAlberto Garcia             cow_end_to = ROUND_UP(cow_end_from, s->cluster_size);
1267d53ec3d8SAlberto Garcia             break;
1268d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_NORMAL:
1269d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_ZERO_ALLOC:
1270d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
1271d53ec3d8SAlberto Garcia             cow_end_to = ROUND_UP(cow_end_from, s->cluster_size);
1272d53ec3d8SAlberto Garcia             if (has_subclusters(s)) {
1273d53ec3d8SAlberto Garcia                 /* Skip all trailing zero and unallocated subclusters */
1274d53ec3d8SAlberto Garcia                 uint32_t alloc_bitmap = l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC;
1275d53ec3d8SAlberto Garcia                 cow_end_to -=
1276d53ec3d8SAlberto Garcia                     MIN(s->subclusters_per_cluster - sc_index - 1,
1277d53ec3d8SAlberto Garcia                         clz32(alloc_bitmap)) << s->subcluster_bits;
1278d53ec3d8SAlberto Garcia             }
1279d53ec3d8SAlberto Garcia             break;
1280d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_ZERO_PLAIN:
1281d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
1282d53ec3d8SAlberto Garcia             cow_end_to = ROUND_UP(cow_end_from, s->subcluster_size);
1283d53ec3d8SAlberto Garcia             break;
1284d53ec3d8SAlberto Garcia         default:
1285d53ec3d8SAlberto Garcia             g_assert_not_reached();
1286d53ec3d8SAlberto Garcia         }
1287d53ec3d8SAlberto Garcia     } else {
1288d53ec3d8SAlberto Garcia         switch (type) {
1289d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_NORMAL:
1290d53ec3d8SAlberto Garcia             cow_end_to = cow_end_from;
1291d53ec3d8SAlberto Garcia             break;
1292d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_ZERO_ALLOC:
1293d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
1294d53ec3d8SAlberto Garcia             cow_end_to = ROUND_UP(cow_end_from, s->subcluster_size);
1295d53ec3d8SAlberto Garcia             break;
1296d53ec3d8SAlberto Garcia         default:
1297d53ec3d8SAlberto Garcia             g_assert_not_reached();
1298d53ec3d8SAlberto Garcia         }
129957538c86SAlberto Garcia     }
13008f91d690SAlberto Garcia 
13018f91d690SAlberto Garcia     *m = g_malloc0(sizeof(**m));
13028f91d690SAlberto Garcia     **m = (QCowL2Meta) {
13038f91d690SAlberto Garcia         .next           = old_m,
13048f91d690SAlberto Garcia 
13058f91d690SAlberto Garcia         .alloc_offset   = host_cluster_offset,
13068f91d690SAlberto Garcia         .offset         = start_of_cluster(s, guest_offset),
13078f91d690SAlberto Garcia         .nb_clusters    = nb_clusters,
13088f91d690SAlberto Garcia 
13098f91d690SAlberto Garcia         .keep_old_clusters = keep_old,
13108f91d690SAlberto Garcia 
13118f91d690SAlberto Garcia         .cow_start = {
13128f91d690SAlberto Garcia             .offset     = cow_start_from,
13138f91d690SAlberto Garcia             .nb_bytes   = cow_start_to - cow_start_from,
13148f91d690SAlberto Garcia         },
13158f91d690SAlberto Garcia         .cow_end = {
13168f91d690SAlberto Garcia             .offset     = cow_end_from,
13178f91d690SAlberto Garcia             .nb_bytes   = cow_end_to - cow_end_from,
13188f91d690SAlberto Garcia         },
13198f91d690SAlberto Garcia     };
13208f91d690SAlberto Garcia 
13218f91d690SAlberto Garcia     qemu_co_queue_init(&(*m)->dependent_requests);
13228f91d690SAlberto Garcia     QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
1323d53ec3d8SAlberto Garcia 
1324d53ec3d8SAlberto Garcia     return 0;
13258f91d690SAlberto Garcia }
13268f91d690SAlberto Garcia 
132757538c86SAlberto Garcia /*
132857538c86SAlberto Garcia  * Returns true if writing to the cluster pointed to by @l2_entry
132957538c86SAlberto Garcia  * requires a new allocation (that is, if the cluster is unallocated
133057538c86SAlberto Garcia  * or has refcount > 1 and therefore cannot be written in-place).
133157538c86SAlberto Garcia  */
13328f897341SKevin Wolf static bool GRAPH_RDLOCK
13338f897341SKevin Wolf cluster_needs_new_alloc(BlockDriverState *bs, uint64_t l2_entry)
1334c1587d87SAlberto Garcia {
1335c1587d87SAlberto Garcia     switch (qcow2_get_cluster_type(bs, l2_entry)) {
1336c1587d87SAlberto Garcia     case QCOW2_CLUSTER_NORMAL:
133757538c86SAlberto Garcia     case QCOW2_CLUSTER_ZERO_ALLOC:
1338c1587d87SAlberto Garcia         if (l2_entry & QCOW_OFLAG_COPIED) {
1339c1587d87SAlberto Garcia             return false;
1340c1587d87SAlberto Garcia         }
1341b9be6faeSThomas Huth         /* fallthrough */
1342c1587d87SAlberto Garcia     case QCOW2_CLUSTER_UNALLOCATED:
1343c1587d87SAlberto Garcia     case QCOW2_CLUSTER_COMPRESSED:
1344c1587d87SAlberto Garcia     case QCOW2_CLUSTER_ZERO_PLAIN:
1345c1587d87SAlberto Garcia         return true;
1346c1587d87SAlberto Garcia     default:
1347c1587d87SAlberto Garcia         abort();
1348c1587d87SAlberto Garcia     }
1349c1587d87SAlberto Garcia }
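/*
 * Put differently (informal restatement): a NORMAL or ZERO_ALLOC entry
 * with QCOW_OFLAG_COPIED set (i.e. refcount == 1) can be rewritten in
 * place; everything else, including compressed clusters and clusters
 * shared with snapshots, requires a fresh allocation plus COW.
 */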
1350c1587d87SAlberto Garcia 
13518f91d690SAlberto Garcia /*
135257538c86SAlberto Garcia  * Returns the number of contiguous clusters that can be written to
135357538c86SAlberto Garcia  * using one single write request, starting from @l2_index.
135457538c86SAlberto Garcia  * At most @nb_clusters are checked.
135557538c86SAlberto Garcia  *
135657538c86SAlberto Garcia  * If @new_alloc is true this counts clusters that are either
135757538c86SAlberto Garcia  * unallocated, or allocated but with refcount > 1 (so they need to be
135857538c86SAlberto Garcia  * newly allocated and COWed).
135957538c86SAlberto Garcia  *
136057538c86SAlberto Garcia  * If @new_alloc is false this counts clusters that are already
136157538c86SAlberto Garcia  * allocated and can be overwritten in-place (this includes clusters
136257538c86SAlberto Garcia  * of type QCOW2_CLUSTER_ZERO_ALLOC).
1363bf319eceSKevin Wolf  */
13648f897341SKevin Wolf static int GRAPH_RDLOCK
13658f897341SKevin Wolf count_single_write_clusters(BlockDriverState *bs, int nb_clusters,
13668f897341SKevin Wolf                             uint64_t *l2_slice, int l2_index, bool new_alloc)
1367bf319eceSKevin Wolf {
136857538c86SAlberto Garcia     BDRVQcow2State *s = bs->opaque;
136912c6aebeSAlberto Garcia     uint64_t l2_entry = get_l2_entry(s, l2_slice, l2_index);
137057538c86SAlberto Garcia     uint64_t expected_offset = l2_entry & L2E_OFFSET_MASK;
1371143550a8SKevin Wolf     int i;
1372bf319eceSKevin Wolf 
1373143550a8SKevin Wolf     for (i = 0; i < nb_clusters; i++) {
137412c6aebeSAlberto Garcia         l2_entry = get_l2_entry(s, l2_slice, l2_index + i);
137557538c86SAlberto Garcia         if (cluster_needs_new_alloc(bs, l2_entry) != new_alloc) {
1376bf319eceSKevin Wolf             break;
1377143550a8SKevin Wolf         }
137857538c86SAlberto Garcia         if (!new_alloc) {
137957538c86SAlberto Garcia             if (expected_offset != (l2_entry & L2E_OFFSET_MASK)) {
138057538c86SAlberto Garcia                 break;
138157538c86SAlberto Garcia             }
138257538c86SAlberto Garcia             expected_offset += s->cluster_size;
138357538c86SAlberto Garcia         }
1384bf319eceSKevin Wolf     }
1385bf319eceSKevin Wolf 
1386bf319eceSKevin Wolf     assert(i <= nb_clusters);
1387bf319eceSKevin Wolf     return i;
1388bf319eceSKevin Wolf }
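/*
 * For instance (hypothetical L2 contents): three consecutive entries
 * pointing at host offsets A, A + cluster_size and B, all with
 * QCOW_OFLAG_COPIED, give a count of 2 when @new_alloc is false: the
 * third cluster is writable in place but not physically contiguous
 * with the first two, so it can't be part of the same write request.
 */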
1389bf319eceSKevin Wolf 
1390bf319eceSKevin Wolf /*
1391250196f1SKevin Wolf  * Check if there already is an AIO write request in flight which allocates
1392250196f1SKevin Wolf  * the same cluster. In this case we need to wait until the previous
1393250196f1SKevin Wolf  * request has completed and updated the L2 table accordingly.
139465eb2e35SKevin Wolf  *
139565eb2e35SKevin Wolf  * Returns:
139665eb2e35SKevin Wolf  *   0       if there was no dependency. *cur_bytes indicates the number of
139765eb2e35SKevin Wolf  *           bytes from guest_offset that can be read before the next
139865eb2e35SKevin Wolf  *           dependency must be processed (or the request is complete)
139965eb2e35SKevin Wolf  *
140065eb2e35SKevin Wolf  *   -EAGAIN if we had to wait for another request, previously gathered
140165eb2e35SKevin Wolf  *           information on cluster allocation may be invalid now. The caller
140265eb2e35SKevin Wolf  *           must start over anyway, so consider *cur_bytes undefined.
1403250196f1SKevin Wolf  */
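/*
 * Example (hypothetical request sizes): a 256 KiB request at guest
 * offset 0 that overlaps an in-flight allocation covering
 * [128 KiB, 192 KiB) is shortened to *cur_bytes = 128 KiB; the caller
 * handles that part first and picks up the remainder in its next loop
 * iteration. If the overlap had started at offset 0 instead, the
 * request would have had to wait and be restarted with -EAGAIN.
 */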
1404050ed2e7SPaolo Bonzini static int coroutine_fn handle_dependencies(BlockDriverState *bs,
1405050ed2e7SPaolo Bonzini                                             uint64_t guest_offset,
1406ecdd5333SKevin Wolf                                             uint64_t *cur_bytes, QCowL2Meta **m)
1407226c3c26SKevin Wolf {
1408ff99129aSKevin Wolf     BDRVQcow2State *s = bs->opaque;
1409226c3c26SKevin Wolf     QCowL2Meta *old_alloc;
141065eb2e35SKevin Wolf     uint64_t bytes = *cur_bytes;
1411226c3c26SKevin Wolf 
1412250196f1SKevin Wolf     QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {
1413250196f1SKevin Wolf 
141465eb2e35SKevin Wolf         uint64_t start = guest_offset;
141565eb2e35SKevin Wolf         uint64_t end = start + bytes;
1416d53ec3d8SAlberto Garcia         uint64_t old_start = start_of_cluster(s, l2meta_cow_start(old_alloc));
1417d53ec3d8SAlberto Garcia         uint64_t old_end = ROUND_UP(l2meta_cow_end(old_alloc), s->cluster_size);
1418250196f1SKevin Wolf 
1419d9d74f41SKevin Wolf         if (end <= old_start || start >= old_end) {
1420250196f1SKevin Wolf             /* No intersection */
14216d207d35SVladimir Sementsov-Ogievskiy             continue;
14226d207d35SVladimir Sementsov-Ogievskiy         }
14236d207d35SVladimir Sementsov-Ogievskiy 
1424ff812c55SVladimir Sementsov-Ogievskiy         if (old_alloc->keep_old_clusters &&
1425ff812c55SVladimir Sementsov-Ogievskiy             (end <= l2meta_cow_start(old_alloc) ||
1426ff812c55SVladimir Sementsov-Ogievskiy              start >= l2meta_cow_end(old_alloc)))
1427ff812c55SVladimir Sementsov-Ogievskiy         {
1428ff812c55SVladimir Sementsov-Ogievskiy             /*
1429ff812c55SVladimir Sementsov-Ogievskiy              * Clusters intersect but COW areas don't. And cluster itself is
1430ff812c55SVladimir Sementsov-Ogievskiy              * Clusters intersect but COW areas don't, and the cluster itself
1431ff812c55SVladimir Sementsov-Ogievskiy              * is already allocated. So there is no actual conflict.
1432ff812c55SVladimir Sementsov-Ogievskiy             continue;
1433ff812c55SVladimir Sementsov-Ogievskiy         }
1434ff812c55SVladimir Sementsov-Ogievskiy 
14356d207d35SVladimir Sementsov-Ogievskiy         /* Conflict */
14366d207d35SVladimir Sementsov-Ogievskiy 
1437250196f1SKevin Wolf         if (start < old_start) {
1438250196f1SKevin Wolf             /* Stop at the start of a running allocation */
143965eb2e35SKevin Wolf             bytes = old_start - start;
1440250196f1SKevin Wolf         } else {
144165eb2e35SKevin Wolf             bytes = 0;
1442250196f1SKevin Wolf         }
1443250196f1SKevin Wolf 
14446d207d35SVladimir Sementsov-Ogievskiy         /*
14456d207d35SVladimir Sementsov-Ogievskiy          * Stop if an l2meta already exists. After yielding, it wouldn't
1446ecdd5333SKevin Wolf          * be valid any more, so we'd have to clean up the old L2Metas
1447ecdd5333SKevin Wolf          * and deal with requests depending on them before starting to
14486d207d35SVladimir Sementsov-Ogievskiy          * gather new ones. Not worth the trouble.
14496d207d35SVladimir Sementsov-Ogievskiy          */
1450ecdd5333SKevin Wolf         if (bytes == 0 && *m) {
1451ecdd5333SKevin Wolf             *cur_bytes = 0;
1452ecdd5333SKevin Wolf             return 0;
1453ecdd5333SKevin Wolf         }
1454ecdd5333SKevin Wolf 
145565eb2e35SKevin Wolf         if (bytes == 0) {
14566d207d35SVladimir Sementsov-Ogievskiy             /*
14576d207d35SVladimir Sementsov-Ogievskiy              * Wait for the dependency to complete. We need to recheck
14586d207d35SVladimir Sementsov-Ogievskiy              * the free/allocated clusters when we continue.
14596d207d35SVladimir Sementsov-Ogievskiy              */
14601ace7ceaSPaolo Bonzini             qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock);
1461250196f1SKevin Wolf             return -EAGAIN;
1462250196f1SKevin Wolf         }
1463250196f1SKevin Wolf     }
1464250196f1SKevin Wolf 
146565eb2e35SKevin Wolf     /* Make sure that existing clusters and new allocations are only used up to
146665eb2e35SKevin Wolf      * the next dependency if we shortened the request above */
146765eb2e35SKevin Wolf     *cur_bytes = bytes;
1468250196f1SKevin Wolf 
1469226c3c26SKevin Wolf     return 0;
1470226c3c26SKevin Wolf }
1471226c3c26SKevin Wolf 
1472226c3c26SKevin Wolf /*
147357538c86SAlberto Garcia  * Checks how many already-allocated clusters there are at the given
147457538c86SAlberto Garcia  * guest_offset that don't require a new allocation (up to *bytes).
147557538c86SAlberto Garcia  * If *host_offset is not INV_OFFSET, only physically contiguous clusters
147657538c86SAlberto Garcia  * beginning at this host offset are counted.
14770af729ecSKevin Wolf  *
1478411d62b0SKevin Wolf  * Note that guest_offset may not be cluster aligned. In this case, the
1479411d62b0SKevin Wolf  * returned *host_offset points to the exact byte referenced by guest_offset
1480411d62b0SKevin Wolf  * and therefore isn't cluster aligned either.
14810af729ecSKevin Wolf  *
14820af729ecSKevin Wolf  * Returns:
14830af729ecSKevin Wolf  *   0:     if no allocated clusters are available at the given offset.
14840af729ecSKevin Wolf  *          *bytes is normally unchanged. It is set to 0 if the cluster
148557538c86SAlberto Garcia  *          is allocated and can be overwritten in-place but doesn't have
148657538c86SAlberto Garcia  *          the right physical offset.
14870af729ecSKevin Wolf  *
148857538c86SAlberto Garcia  *   1:     if allocated clusters that can be overwritten in place are
148957538c86SAlberto Garcia  *          available at the requested offset. *bytes may have decreased
149057538c86SAlberto Garcia  *          and describes the length of the area that can be written to.
14910af729ecSKevin Wolf  *
14920af729ecSKevin Wolf  *  -errno: in error cases
14930af729ecSKevin Wolf  */
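/*
 * Example (assumed values): with 64 KiB clusters, a request for 200 KiB
 * at 4 KiB into the first of three contiguous QCOW_OFLAG_COPIED
 * clusters returns 1 with *bytes shortened to 3 * 64 KiB - 4 KiB =
 * 188 KiB and *host_offset pointing 4 KiB into the first host cluster.
 */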
14940bb79c97SKevin Wolf static int coroutine_fn GRAPH_RDLOCK
14950bb79c97SKevin Wolf handle_copied(BlockDriverState *bs, uint64_t guest_offset,
14960bb79c97SKevin Wolf               uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
14970af729ecSKevin Wolf {
1498ff99129aSKevin Wolf     BDRVQcow2State *s = bs->opaque;
14990af729ecSKevin Wolf     int l2_index;
150057538c86SAlberto Garcia     uint64_t l2_entry, cluster_offset;
1501cde91766SAlberto Garcia     uint64_t *l2_slice;
1502b6d36defSMax Reitz     uint64_t nb_clusters;
1503c53ede9fSKevin Wolf     unsigned int keep_clusters;
1504a3f1afb4SAlberto Garcia     int ret;
15050af729ecSKevin Wolf 
15060af729ecSKevin Wolf     trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
15070af729ecSKevin Wolf                               *bytes);
15080af729ecSKevin Wolf 
1509c6d619ccSKevin Wolf     assert(*host_offset == INV_OFFSET || offset_into_cluster(s, guest_offset)
1510411d62b0SKevin Wolf                                       == offset_into_cluster(s, *host_offset));
1511411d62b0SKevin Wolf 
1512acb0467fSKevin Wolf     /*
1513cde91766SAlberto Garcia      * Calculate the number of clusters to look for. We stop at L2 slice
1514acb0467fSKevin Wolf      * boundaries to keep things simple.
1515acb0467fSKevin Wolf      */
1516acb0467fSKevin Wolf     nb_clusters =
1517acb0467fSKevin Wolf         size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
1518acb0467fSKevin Wolf 
1519cde91766SAlberto Garcia     l2_index = offset_to_l2_slice_index(s, guest_offset);
1520cde91766SAlberto Garcia     nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
152157538c86SAlberto Garcia     /* Limit total byte count to BDRV_REQUEST_MAX_BYTES */
152257538c86SAlberto Garcia     nb_clusters = MIN(nb_clusters, BDRV_REQUEST_MAX_BYTES >> s->cluster_bits);
1523acb0467fSKevin Wolf 
15240af729ecSKevin Wolf     /* Find L2 entry for the first involved cluster */
1525cde91766SAlberto Garcia     ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index);
15260af729ecSKevin Wolf     if (ret < 0) {
15270af729ecSKevin Wolf         return ret;
15280af729ecSKevin Wolf     }
15290af729ecSKevin Wolf 
153012c6aebeSAlberto Garcia     l2_entry = get_l2_entry(s, l2_slice, l2_index);
153157538c86SAlberto Garcia     cluster_offset = l2_entry & L2E_OFFSET_MASK;
15320af729ecSKevin Wolf 
153357538c86SAlberto Garcia     if (!cluster_needs_new_alloc(bs, l2_entry)) {
153457538c86SAlberto Garcia         if (offset_into_cluster(s, cluster_offset)) {
153557538c86SAlberto Garcia             qcow2_signal_corruption(bs, true, -1, -1, "%s cluster offset "
153657538c86SAlberto Garcia                                     "%#" PRIx64 " unaligned (guest offset: %#"
153757538c86SAlberto Garcia                                     PRIx64 ")", l2_entry & QCOW_OFLAG_ZERO ?
153857538c86SAlberto Garcia                                     "Preallocated zero" : "Data",
153957538c86SAlberto Garcia                                     cluster_offset, guest_offset);
1540a97c67eeSMax Reitz             ret = -EIO;
1541a97c67eeSMax Reitz             goto out;
1542a97c67eeSMax Reitz         }
1543a97c67eeSMax Reitz 
154457538c86SAlberto Garcia         /* If a specific host_offset is required, check it */
154557538c86SAlberto Garcia         if (*host_offset != INV_OFFSET && cluster_offset != *host_offset) {
1546e62daaf6SKevin Wolf             *bytes = 0;
1547e62daaf6SKevin Wolf             ret = 0;
1548e62daaf6SKevin Wolf             goto out;
1549e62daaf6SKevin Wolf         }
1550e62daaf6SKevin Wolf 
15510af729ecSKevin Wolf         /* We keep all QCOW_OFLAG_COPIED clusters */
155257538c86SAlberto Garcia         keep_clusters = count_single_write_clusters(bs, nb_clusters, l2_slice,
155357538c86SAlberto Garcia                                                     l2_index, false);
1554c53ede9fSKevin Wolf         assert(keep_clusters <= nb_clusters);
1555c53ede9fSKevin Wolf 
1556c53ede9fSKevin Wolf         *bytes = MIN(*bytes,
1557c53ede9fSKevin Wolf                  keep_clusters * s->cluster_size
1558c53ede9fSKevin Wolf                  - offset_into_cluster(s, guest_offset));
155957538c86SAlberto Garcia         assert(*bytes != 0);
156057538c86SAlberto Garcia 
1561d53ec3d8SAlberto Garcia         ret = calculate_l2_meta(bs, cluster_offset, guest_offset,
156257538c86SAlberto Garcia                                 *bytes, l2_slice, m, true);
1563d53ec3d8SAlberto Garcia         if (ret < 0) {
1564d53ec3d8SAlberto Garcia             goto out;
1565d53ec3d8SAlberto Garcia         }
15660af729ecSKevin Wolf 
15670af729ecSKevin Wolf         ret = 1;
15680af729ecSKevin Wolf     } else {
15690af729ecSKevin Wolf         ret = 0;
15700af729ecSKevin Wolf     }
15710af729ecSKevin Wolf 
15720af729ecSKevin Wolf     /* Cleanup */
1573e62daaf6SKevin Wolf out:
1574cde91766SAlberto Garcia     qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
15750af729ecSKevin Wolf 
1576e62daaf6SKevin Wolf     /* Only return a host offset if we actually made progress. Otherwise we
1577e62daaf6SKevin Wolf      * would impose requirements on handle_alloc() that it can't fulfill */
1578a97c67eeSMax Reitz     if (ret > 0) {
157957538c86SAlberto Garcia         *host_offset = cluster_offset + offset_into_cluster(s, guest_offset);
1580e62daaf6SKevin Wolf     }
1581e62daaf6SKevin Wolf 
15820af729ecSKevin Wolf     return ret;
15830af729ecSKevin Wolf }
15840af729ecSKevin Wolf 
15850af729ecSKevin Wolf /*
1586226c3c26SKevin Wolf  * Allocates new clusters for the given guest_offset.
1587226c3c26SKevin Wolf  *
1588226c3c26SKevin Wolf  * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
1589226c3c26SKevin Wolf  * contain the number of clusters that have been allocated and are contiguous
1590226c3c26SKevin Wolf  * in the image file.
1591226c3c26SKevin Wolf  *
1592c6d619ccSKevin Wolf  * If *host_offset is not INV_OFFSET, it specifies the offset in the image file
1593c6d619ccSKevin Wolf  * at which the new clusters must start. *nb_clusters can be 0 on return in
1594c6d619ccSKevin Wolf  * this case if the cluster at host_offset is already in use. If *host_offset
1595c6d619ccSKevin Wolf  * is INV_OFFSET, the clusters can be allocated anywhere in the image file.
1596226c3c26SKevin Wolf  *
1597226c3c26SKevin Wolf  * *host_offset is updated to contain the offset into the image file at which
1598226c3c26SKevin Wolf  * the first allocated cluster starts.
1599226c3c26SKevin Wolf  *
1600226c3c26SKevin Wolf  * Return 0 on success and -errno in error cases. -EAGAIN means that the
1601226c3c26SKevin Wolf  * function has been waiting for another request and the allocation must be
1602226c3c26SKevin Wolf  * restarted, but the whole request should not be failed.
1603226c3c26SKevin Wolf  */
16040bb79c97SKevin Wolf static int coroutine_fn GRAPH_RDLOCK
16050bb79c97SKevin Wolf do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
16060bb79c97SKevin Wolf                         uint64_t *host_offset, uint64_t *nb_clusters)
1607226c3c26SKevin Wolf {
1608ff99129aSKevin Wolf     BDRVQcow2State *s = bs->opaque;
1609226c3c26SKevin Wolf 
1610226c3c26SKevin Wolf     trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
1611226c3c26SKevin Wolf                                          *host_offset, *nb_clusters);
1612226c3c26SKevin Wolf 
1613966b000fSKevin Wolf     if (has_data_file(bs)) {
1614966b000fSKevin Wolf         assert(*host_offset == INV_OFFSET ||
1615966b000fSKevin Wolf                *host_offset == start_of_cluster(s, guest_offset));
1616966b000fSKevin Wolf         *host_offset = start_of_cluster(s, guest_offset);
1617966b000fSKevin Wolf         return 0;
1618966b000fSKevin Wolf     }
1619966b000fSKevin Wolf 
1620250196f1SKevin Wolf     /* Allocate new clusters */
1621250196f1SKevin Wolf     trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
1622c6d619ccSKevin Wolf     if (*host_offset == INV_OFFSET) {
1623df021791SKevin Wolf         int64_t cluster_offset =
1624df021791SKevin Wolf             qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size);
1625250196f1SKevin Wolf         if (cluster_offset < 0) {
1626250196f1SKevin Wolf             return cluster_offset;
1627250196f1SKevin Wolf         }
1628250196f1SKevin Wolf         *host_offset = cluster_offset;
1629250196f1SKevin Wolf         return 0;
1630df021791SKevin Wolf     } else {
1631b6d36defSMax Reitz         int64_t ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
1632df021791SKevin Wolf         if (ret < 0) {
1633df021791SKevin Wolf             return ret;
1634df021791SKevin Wolf         }
1635df021791SKevin Wolf         *nb_clusters = ret;
1636df021791SKevin Wolf         return 0;
1637df021791SKevin Wolf     }
1638250196f1SKevin Wolf }
1639250196f1SKevin Wolf 
1640250196f1SKevin Wolf /*
164157538c86SAlberto Garcia  * Allocates new clusters for an area that is either still unallocated or
164257538c86SAlberto Garcia  * cannot be overwritten in-place. If *host_offset is not INV_OFFSET,
164357538c86SAlberto Garcia  * clusters are only allocated if the new allocation can match the specified
164457538c86SAlberto Garcia  * host offset.
164510f0ed8bSKevin Wolf  *
1646411d62b0SKevin Wolf  * Note that guest_offset may not be cluster aligned. In this case, the
1647411d62b0SKevin Wolf  * returned *host_offset points to the exact byte referenced by guest_offset
1648411d62b0SKevin Wolf  * and therefore isn't cluster aligned either.
164910f0ed8bSKevin Wolf  *
165010f0ed8bSKevin Wolf  * Returns:
165110f0ed8bSKevin Wolf  *   0:     if no clusters could be allocated. *bytes is set to 0,
165210f0ed8bSKevin Wolf  *          *host_offset is left unchanged.
165310f0ed8bSKevin Wolf  *
165410f0ed8bSKevin Wolf  *   1:     if new clusters were allocated. *bytes may be decreased if the
165510f0ed8bSKevin Wolf  *          new allocation doesn't cover all of the requested area.
165610f0ed8bSKevin Wolf  *          *host_offset is updated to contain the host offset of the first
165710f0ed8bSKevin Wolf  *          newly allocated cluster.
165810f0ed8bSKevin Wolf  *
165910f0ed8bSKevin Wolf  *  -errno: in error cases
166010f0ed8bSKevin Wolf  */
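/*
 * Example (assumed values): with 64 KiB clusters, guest_offset 4 KiB
 * into an unallocated cluster and *bytes = 130 KiB, three clusters are
 * requested (134 KiB rounded up). If only two contiguous clusters can
 * be allocated, avail_bytes is 128 KiB, so *bytes becomes
 * 128 KiB - 4 KiB = 124 KiB and the function returns 1.
 */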
16610bb79c97SKevin Wolf static int coroutine_fn GRAPH_RDLOCK
16620bb79c97SKevin Wolf handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
16630bb79c97SKevin Wolf              uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
166410f0ed8bSKevin Wolf {
1665ff99129aSKevin Wolf     BDRVQcow2State *s = bs->opaque;
166610f0ed8bSKevin Wolf     int l2_index;
16676d99a344SAlberto Garcia     uint64_t *l2_slice;
1668b6d36defSMax Reitz     uint64_t nb_clusters;
166910f0ed8bSKevin Wolf     int ret;
167010f0ed8bSKevin Wolf 
167157538c86SAlberto Garcia     uint64_t alloc_cluster_offset;
167210f0ed8bSKevin Wolf 
167310f0ed8bSKevin Wolf     trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
167410f0ed8bSKevin Wolf                              *bytes);
167510f0ed8bSKevin Wolf     assert(*bytes > 0);
167610f0ed8bSKevin Wolf 
1677f5bc6350SKevin Wolf     /*
16786d99a344SAlberto Garcia      * Calculate the number of clusters to look for. We stop at L2 slice
1679f5bc6350SKevin Wolf      * boundaries to keep things simple.
1680f5bc6350SKevin Wolf      */
1681c37f4cd7SKevin Wolf     nb_clusters =
1682c37f4cd7SKevin Wolf         size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
1683c37f4cd7SKevin Wolf 
16846d99a344SAlberto Garcia     l2_index = offset_to_l2_slice_index(s, guest_offset);
16856d99a344SAlberto Garcia     nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
168657538c86SAlberto Garcia     /* Limit total allocation byte count to BDRV_REQUEST_MAX_BYTES */
168757538c86SAlberto Garcia     nb_clusters = MIN(nb_clusters, BDRV_REQUEST_MAX_BYTES >> s->cluster_bits);
1688d1b9d19fSMax Reitz 
168910f0ed8bSKevin Wolf     /* Find L2 entry for the first involved cluster */
16906d99a344SAlberto Garcia     ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index);
169110f0ed8bSKevin Wolf     if (ret < 0) {
169210f0ed8bSKevin Wolf         return ret;
169310f0ed8bSKevin Wolf     }
169410f0ed8bSKevin Wolf 
169557538c86SAlberto Garcia     nb_clusters = count_single_write_clusters(bs, nb_clusters,
169657538c86SAlberto Garcia                                               l2_slice, l2_index, true);
169710f0ed8bSKevin Wolf 
1698ecdd5333SKevin Wolf     /* This function is only called when there were no non-COW clusters, so if
1699ecdd5333SKevin Wolf      * we can't find any unallocated or COW clusters either, something is
1700ecdd5333SKevin Wolf      * wrong with our code. */
1701ecdd5333SKevin Wolf     assert(nb_clusters > 0);
1702ecdd5333SKevin Wolf 
170357538c86SAlberto Garcia     /* Allocate at a given offset in the image file */
1704c6d619ccSKevin Wolf     alloc_cluster_offset = *host_offset == INV_OFFSET ? INV_OFFSET :
1705c6d619ccSKevin Wolf         start_of_cluster(s, *host_offset);
170683baa9a4SKevin Wolf     ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
170710f0ed8bSKevin Wolf                                   &nb_clusters);
170810f0ed8bSKevin Wolf     if (ret < 0) {
170957538c86SAlberto Garcia         goto out;
171010f0ed8bSKevin Wolf     }
171110f0ed8bSKevin Wolf 
171283baa9a4SKevin Wolf     /* Can't extend contiguous allocation */
171383baa9a4SKevin Wolf     if (nb_clusters == 0) {
171483baa9a4SKevin Wolf         *bytes = 0;
171557538c86SAlberto Garcia         ret = 0;
171657538c86SAlberto Garcia         goto out;
171783baa9a4SKevin Wolf     }
171883baa9a4SKevin Wolf 
1719c6d619ccSKevin Wolf     assert(alloc_cluster_offset != INV_OFFSET);
1720ff52aab2SMax Reitz 
172110f0ed8bSKevin Wolf     /*
172283baa9a4SKevin Wolf      * Save info needed for meta data update.
172383baa9a4SKevin Wolf      *
172485567393SKevin Wolf      * requested_bytes: Number of bytes from the start of the first
172510f0ed8bSKevin Wolf      * newly allocated cluster to the end of the (possibly shortened
172610f0ed8bSKevin Wolf      * before) write request.
172710f0ed8bSKevin Wolf      *
172885567393SKevin Wolf      * avail_bytes: Number of bytes from the start of the first
172910f0ed8bSKevin Wolf      * newly allocated cluster to the end of the last one.
173010f0ed8bSKevin Wolf      *
173185567393SKevin Wolf      * nb_bytes: The number of bytes from the start of the first
173283baa9a4SKevin Wolf      * newly allocated cluster to the end of the area that the write
173310f0ed8bSKevin Wolf      * request actually writes to (excluding COW at the end)
173410f0ed8bSKevin Wolf      */
173585567393SKevin Wolf     uint64_t requested_bytes = *bytes + offset_into_cluster(s, guest_offset);
1736d1b9d19fSMax Reitz     int avail_bytes = nb_clusters << s->cluster_bits;
173785567393SKevin Wolf     int nb_bytes = MIN(requested_bytes, avail_bytes);
173810f0ed8bSKevin Wolf 
1739411d62b0SKevin Wolf     *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
174085567393SKevin Wolf     *bytes = MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset));
1741c37f4cd7SKevin Wolf     assert(*bytes != 0);
174210f0ed8bSKevin Wolf 
1743d53ec3d8SAlberto Garcia     ret = calculate_l2_meta(bs, alloc_cluster_offset, guest_offset, *bytes,
1744d53ec3d8SAlberto Garcia                             l2_slice, m, false);
1745d53ec3d8SAlberto Garcia     if (ret < 0) {
1746d53ec3d8SAlberto Garcia         goto out;
1747d53ec3d8SAlberto Garcia     }
17488f91d690SAlberto Garcia 
174957538c86SAlberto Garcia     ret = 1;
175010f0ed8bSKevin Wolf 
175157538c86SAlberto Garcia out:
175257538c86SAlberto Garcia     qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
175310f0ed8bSKevin Wolf     return ret;
175410f0ed8bSKevin Wolf }
175510f0ed8bSKevin Wolf 
175610f0ed8bSKevin Wolf /*
17572b60c5b9SAlberto Garcia  * For a given area on the virtual disk defined by @offset and @bytes,
17582b60c5b9SAlberto Garcia  * find the corresponding area on the qcow2 image, allocating new
17592b60c5b9SAlberto Garcia  * clusters (or subclusters) if necessary. The result can span a
17602b60c5b9SAlberto Garcia  * combination of allocated and previously unallocated clusters.
176145aba42fSKevin Wolf  *
1762bfd0989aSAlberto Garcia  * Note that offset may not be cluster aligned. In this case, the returned
1763bfd0989aSAlberto Garcia  * *host_offset points to the exact byte referenced by offset and therefore
1764bfd0989aSAlberto Garcia  * isn't cluster aligned either.
1765bfd0989aSAlberto Garcia  *
17662b60c5b9SAlberto Garcia  * On return, @host_offset is set to the beginning of the requested
17672b60c5b9SAlberto Garcia  * area. This area is guaranteed to be contiguous on the qcow2 file
17672b60c5b9SAlberto Garcia  * area. This area is guaranteed to be contiguous in the qcow2 file
17692b60c5b9SAlberto Garcia  * is updated with the actual size.
177045aba42fSKevin Wolf  *
17712b60c5b9SAlberto Garcia  * If any clusters or subclusters were allocated then @m contains a
17722b60c5b9SAlberto Garcia  * list with the information of all the affected regions. Note that
17732b60c5b9SAlberto Garcia  * this can happen regardless of whether this function succeeds or
17742b60c5b9SAlberto Garcia  * not. The caller is responsible for updating the L2 metadata of the
17752b60c5b9SAlberto Garcia  * allocated clusters (on success) or freeing them (on failure), and
17762b60c5b9SAlberto Garcia  * for clearing the contents of @m afterwards in both cases.
1777148da7eaSKevin Wolf  *
177868d100e9SKevin Wolf  * If the request conflicts with another write request in flight, the coroutine
177968d100e9SKevin Wolf  * is queued and will be reentered when the dependency has completed.
1780148da7eaSKevin Wolf  *
1781148da7eaSKevin Wolf  * Return 0 on success and -errno in error cases
178245aba42fSKevin Wolf  */
1783050ed2e7SPaolo Bonzini int coroutine_fn qcow2_alloc_host_offset(BlockDriverState *bs, uint64_t offset,
1784050ed2e7SPaolo Bonzini                                          unsigned int *bytes,
1785050ed2e7SPaolo Bonzini                                          uint64_t *host_offset,
1786d46a0bb2SKevin Wolf                                          QCowL2Meta **m)
178745aba42fSKevin Wolf {
1788ff99129aSKevin Wolf     BDRVQcow2State *s = bs->opaque;
1789710c2496SKevin Wolf     uint64_t start, remaining;
1790250196f1SKevin Wolf     uint64_t cluster_offset;
179165eb2e35SKevin Wolf     uint64_t cur_bytes;
1792710c2496SKevin Wolf     int ret;
179345aba42fSKevin Wolf 
1794d46a0bb2SKevin Wolf     trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *bytes);
1795710c2496SKevin Wolf 
179672424114SKevin Wolf again:
179716f0587eSHu Tao     start = offset;
1798d46a0bb2SKevin Wolf     remaining = *bytes;
1799c6d619ccSKevin Wolf     cluster_offset = INV_OFFSET;
1800c6d619ccSKevin Wolf     *host_offset = INV_OFFSET;
1801ecdd5333SKevin Wolf     cur_bytes = 0;
1802ecdd5333SKevin Wolf     *m = NULL;
18030af729ecSKevin Wolf 
18042c3b32d2SKevin Wolf     while (true) {
1805ecdd5333SKevin Wolf 
1806c6d619ccSKevin Wolf         if (*host_offset == INV_OFFSET && cluster_offset != INV_OFFSET) {
1807bfd0989aSAlberto Garcia             *host_offset = cluster_offset;
1808ecdd5333SKevin Wolf         }
1809ecdd5333SKevin Wolf 
1810ecdd5333SKevin Wolf         assert(remaining >= cur_bytes);
1811ecdd5333SKevin Wolf 
1812ecdd5333SKevin Wolf         start           += cur_bytes;
1813ecdd5333SKevin Wolf         remaining       -= cur_bytes;
1814c6d619ccSKevin Wolf 
1815c6d619ccSKevin Wolf         if (cluster_offset != INV_OFFSET) {
1816ecdd5333SKevin Wolf             cluster_offset += cur_bytes;
1817c6d619ccSKevin Wolf         }
1818ecdd5333SKevin Wolf 
1819ecdd5333SKevin Wolf         if (remaining == 0) {
1820ecdd5333SKevin Wolf             break;
1821ecdd5333SKevin Wolf         }
1822ecdd5333SKevin Wolf 
1823ecdd5333SKevin Wolf         cur_bytes = remaining;
1824ecdd5333SKevin Wolf 
1825250196f1SKevin Wolf         /*
182617a71e58SKevin Wolf          * Now start gathering as many contiguous clusters as possible:
182717a71e58SKevin Wolf          *
182817a71e58SKevin Wolf          * 1. Check for overlaps with in-flight allocations
182917a71e58SKevin Wolf          *
18302c3b32d2SKevin Wolf          *      a) Overlap not in the first cluster -> shorten this request and
18312c3b32d2SKevin Wolf          *         let the caller handle the rest in its next loop iteration.
183217a71e58SKevin Wolf          *
18332c3b32d2SKevin Wolf          *      b) Real overlaps of two requests. Yield and restart the search
18342c3b32d2SKevin Wolf          *         for contiguous clusters (the situation could have changed
18352c3b32d2SKevin Wolf          *         while we were sleeping)
183617a71e58SKevin Wolf          *
183717a71e58SKevin Wolf          *      c) TODO: Request starts in the same cluster as the in-flight
18392c3b32d2SKevin Wolf          *         allocation ends. Shorten the COW of the in-flight allocation,
18392c3b32d2SKevin Wolf          *         set cluster_offset to write to the same cluster and set up
18402c3b32d2SKevin Wolf          *         the right synchronisation between the in-flight request and
18412c3b32d2SKevin Wolf          *         the new one.
184217a71e58SKevin Wolf          */
1843ecdd5333SKevin Wolf         ret = handle_dependencies(bs, start, &cur_bytes, m);
184417a71e58SKevin Wolf         if (ret == -EAGAIN) {
1845ecdd5333SKevin Wolf             /* Currently handle_dependencies() doesn't yield if we already had
1846ecdd5333SKevin Wolf              * an allocation. If it did, we would have to clean up the L2Meta
1847ecdd5333SKevin Wolf              * structs before starting over. */
1848ecdd5333SKevin Wolf             assert(*m == NULL);
184917a71e58SKevin Wolf             goto again;
185017a71e58SKevin Wolf         } else if (ret < 0) {
185117a71e58SKevin Wolf             return ret;
1852ecdd5333SKevin Wolf         } else if (cur_bytes == 0) {
1853ecdd5333SKevin Wolf             break;
185417a71e58SKevin Wolf         } else {
185517a71e58SKevin Wolf             /* handle_dependencies() may have decreased cur_bytes (shortening
185617a71e58SKevin Wolf              * the allocation steps below) so that the next dependency is
185717a71e58SKevin Wolf              * processed correctly during the next loop iteration. */
185817a71e58SKevin Wolf         }
185917a71e58SKevin Wolf 
186072424114SKevin Wolf         /*
18610af729ecSKevin Wolf          * 2. Count contiguous COPIED clusters.
186272424114SKevin Wolf          */
1863710c2496SKevin Wolf         ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
186472424114SKevin Wolf         if (ret < 0) {
186572424114SKevin Wolf             return ret;
18660af729ecSKevin Wolf         } else if (ret) {
1867ecdd5333SKevin Wolf             continue;
1868e62daaf6SKevin Wolf         } else if (cur_bytes == 0) {
18692c3b32d2SKevin Wolf             break;
187072424114SKevin Wolf         }
187172424114SKevin Wolf 
18720af729ecSKevin Wolf         /*
18730af729ecSKevin Wolf          * 3. If the request still hasn't completed, allocate new clusters,
18740af729ecSKevin Wolf          *    considering any cluster_offset of steps 1c or 2.
18750af729ecSKevin Wolf          */
1876710c2496SKevin Wolf         ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
1877037689d8SKevin Wolf         if (ret < 0) {
1878037689d8SKevin Wolf             return ret;
1879710c2496SKevin Wolf         } else if (ret) {
1880ecdd5333SKevin Wolf             continue;
18812c3b32d2SKevin Wolf         } else {
18822c3b32d2SKevin Wolf             assert(cur_bytes == 0);
18832c3b32d2SKevin Wolf             break;
18842c3b32d2SKevin Wolf         }
1885710c2496SKevin Wolf     }
1886250196f1SKevin Wolf 
1887d46a0bb2SKevin Wolf     *bytes -= remaining;
1888d46a0bb2SKevin Wolf     assert(*bytes > 0);
1889c6d619ccSKevin Wolf     assert(*host_offset != INV_OFFSET);
1890bfd0989aSAlberto Garcia     assert(offset_into_cluster(s, *host_offset) ==
1891bfd0989aSAlberto Garcia            offset_into_cluster(s, offset));
189245aba42fSKevin Wolf 
1893148da7eaSKevin Wolf     return 0;
189445aba42fSKevin Wolf }
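/*
 * Rough sketch of the caller side (simplified from the qcow2 write path;
 * COW data movement, locking and error handling omitted). Each returned
 * QCowL2Meta chain is committed with qcow2_alloc_cluster_link_l2() after
 * the guest data has been written:
 *
 *     while (bytes != 0) {
 *         unsigned int cur_bytes = bytes;
 *         uint64_t host_offset;
 *         QCowL2Meta *l2meta = NULL;
 *
 *         ret = qcow2_alloc_host_offset(bs, offset, &cur_bytes,
 *                                       &host_offset, &l2meta);
 *         // ... write cur_bytes of guest data at host_offset ...
 *         for (QCowL2Meta *m = l2meta; m != NULL; m = m->next) {
 *             ret = qcow2_alloc_cluster_link_l2(bs, m);
 *         }
 *         offset += cur_bytes;
 *         bytes -= cur_bytes;
 *     }
 */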
189545aba42fSKevin Wolf 
18965ea929e3SKevin Wolf /*
18975ea929e3SKevin Wolf  * This discards as many of the nb_clusters clusters as possible at once
189821ab3addSAlberto Garcia  * (i.e. all clusters in the same L2 slice) and returns the number of
18995ea929e3SKevin Wolf  * discarded clusters.
19005ea929e3SKevin Wolf  */
19010bb79c97SKevin Wolf static int GRAPH_RDLOCK
19020bb79c97SKevin Wolf discard_in_l2_slice(BlockDriverState *bs, uint64_t offset, uint64_t nb_clusters,
190321ab3addSAlberto Garcia                     enum qcow2_discard_type type, bool full_discard)
19045ea929e3SKevin Wolf {
1905ff99129aSKevin Wolf     BDRVQcow2State *s = bs->opaque;
190621ab3addSAlberto Garcia     uint64_t *l2_slice;
19075ea929e3SKevin Wolf     int l2_index;
19085ea929e3SKevin Wolf     int ret;
19095ea929e3SKevin Wolf     int i;
19105ea929e3SKevin Wolf 
191121ab3addSAlberto Garcia     ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
19125ea929e3SKevin Wolf     if (ret < 0) {
19135ea929e3SKevin Wolf         return ret;
19145ea929e3SKevin Wolf     }
19155ea929e3SKevin Wolf 
191621ab3addSAlberto Garcia     /* Limit nb_clusters to one L2 slice */
191721ab3addSAlberto Garcia     nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
1918b6d36defSMax Reitz     assert(nb_clusters <= INT_MAX);
19195ea929e3SKevin Wolf 
19205ea929e3SKevin Wolf     for (i = 0; i < nb_clusters; i++) {
1921a68cd703SAlberto Garcia         uint64_t old_l2_entry = get_l2_entry(s, l2_slice, l2_index + i);
1922a68cd703SAlberto Garcia         uint64_t old_l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i);
1923a68cd703SAlberto Garcia         uint64_t new_l2_entry = old_l2_entry;
1924a68cd703SAlberto Garcia         uint64_t new_l2_bitmap = old_l2_bitmap;
1925a68cd703SAlberto Garcia         QCow2ClusterType cluster_type =
1926a68cd703SAlberto Garcia             qcow2_get_cluster_type(bs, old_l2_entry);
192742a2890aSJean-Louis Dupond         bool keep_reference = (cluster_type != QCOW2_CLUSTER_COMPRESSED) &&
192842a2890aSJean-Louis Dupond                               !full_discard &&
192942a2890aSJean-Louis Dupond                               (s->discard_no_unref &&
193042a2890aSJean-Louis Dupond                                type == QCOW2_DISCARD_REQUEST);
1931a71835a0SKevin Wolf 
1932a71835a0SKevin Wolf         /*
1933a68cd703SAlberto Garcia          * If full_discard is true, the cluster should not read back as zeroes,
1934a68cd703SAlberto Garcia          * but rather fall through to the backing file.
1935a68cd703SAlberto Garcia          *
1936808c4b6fSMax Reitz          * If full_discard is false, make sure that a discarded area reads back
1937808c4b6fSMax Reitz          * as zeroes for v3 images (we cannot do it for v2 without actually
1938808c4b6fSMax Reitz          * writing a zero-filled buffer). We can skip the operation if the
1939808c4b6fSMax Reitz          * cluster is already marked as zero, or if it's unallocated and we
1940808c4b6fSMax Reitz          * don't have a backing file.
1941a71835a0SKevin Wolf          *
1942237d78f8SEric Blake          * TODO We might want to use bdrv_block_status(bs) here, but we're
1943a71835a0SKevin Wolf          * holding s->lock, so that doesn't work today.
1944a71835a0SKevin Wolf          */
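        /*
         * Summary of the resulting L2 state (keep_reference only decides
         * whether the host cluster stays referenced, handled further down):
         *
         *   full_discard        -> entry = bitmap = 0; reads hit the backing
         *                          file again
         *   subclusters enabled -> bitmap = QCOW_L2_BITMAP_ALL_ZEROES, entry
         *                          cleared unless keep_reference
         *   v3, no subclusters  -> QCOW_OFLAG_ZERO set; the host offset is
         *                          kept only with keep_reference
         *   v2                  -> entry = 0; a zero read-back cannot be
         *                          guaranteed (see above)
         */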
1945a68cd703SAlberto Garcia         if (full_discard) {
1946a68cd703SAlberto Garcia             new_l2_entry = new_l2_bitmap = 0;
1947a68cd703SAlberto Garcia         } else if (bs->backing || qcow2_cluster_is_allocated(cluster_type)) {
1948a68cd703SAlberto Garcia             if (has_subclusters(s)) {
194942a2890aSJean-Louis Dupond                 if (keep_reference) {
195042a2890aSJean-Louis Dupond                     new_l2_entry = old_l2_entry;
195142a2890aSJean-Louis Dupond                 } else {
1952a68cd703SAlberto Garcia                     new_l2_entry = 0;
195342a2890aSJean-Louis Dupond                 }
1954a68cd703SAlberto Garcia                 new_l2_bitmap = QCOW_L2_BITMAP_ALL_ZEROES;
1955a68cd703SAlberto Garcia             } else {
195642a2890aSJean-Louis Dupond                 if (s->qcow_version >= 3) {
195742a2890aSJean-Louis Dupond                     if (keep_reference) {
195842a2890aSJean-Louis Dupond                         new_l2_entry |= QCOW_OFLAG_ZERO;
195942a2890aSJean-Louis Dupond                     } else {
196042a2890aSJean-Louis Dupond                         new_l2_entry = QCOW_OFLAG_ZERO;
196142a2890aSJean-Louis Dupond                     }
196242a2890aSJean-Louis Dupond                 } else {
196342a2890aSJean-Louis Dupond                     new_l2_entry = 0;
196442a2890aSJean-Louis Dupond                 }
1965a71835a0SKevin Wolf             }
1966808c4b6fSMax Reitz         }
1967c883db0dSMax Reitz 
1968a68cd703SAlberto Garcia         if (old_l2_entry == new_l2_entry && old_l2_bitmap == new_l2_bitmap) {
1969a68cd703SAlberto Garcia             continue;
19705ea929e3SKevin Wolf         }
19715ea929e3SKevin Wolf 
19725ea929e3SKevin Wolf         /* First remove L2 entries */
197321ab3addSAlberto Garcia         qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
1974a68cd703SAlberto Garcia         set_l2_entry(s, l2_slice, l2_index + i, new_l2_entry);
1975a68cd703SAlberto Garcia         if (has_subclusters(s)) {
1976a68cd703SAlberto Garcia             set_l2_bitmap(s, l2_slice, l2_index + i, new_l2_bitmap);
1977a71835a0SKevin Wolf         }
197842a2890aSJean-Louis Dupond         if (!keep_reference) {
19795ea929e3SKevin Wolf             /* Then decrease the refcount */
19803fec237fSAlberto Garcia             qcow2_free_any_cluster(bs, old_l2_entry, type);
198142a2890aSJean-Louis Dupond         } else if (s->discard_passthrough[type] &&
198242a2890aSJean-Louis Dupond                    (cluster_type == QCOW2_CLUSTER_NORMAL ||
198342a2890aSJean-Louis Dupond                     cluster_type == QCOW2_CLUSTER_ZERO_ALLOC)) {
198442a2890aSJean-Louis Dupond             /* If we keep the reference, pass on the discard still */
198542a2890aSJean-Louis Dupond             bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK,
198642a2890aSJean-Louis Dupond                           s->cluster_size);
198742a2890aSJean-Louis Dupond         }
19885ea929e3SKevin Wolf     }
19895ea929e3SKevin Wolf 
199021ab3addSAlberto Garcia     qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
19915ea929e3SKevin Wolf 
19925ea929e3SKevin Wolf     return nb_clusters;
19935ea929e3SKevin Wolf }
19945ea929e3SKevin Wolf 
1995d2cb36afSEric Blake int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
1996d2cb36afSEric Blake                           uint64_t bytes, enum qcow2_discard_type type,
1997d2cb36afSEric Blake                           bool full_discard)
19985ea929e3SKevin Wolf {
1999ff99129aSKevin Wolf     BDRVQcow2State *s = bs->opaque;
2000d2cb36afSEric Blake     uint64_t end_offset = offset + bytes;
2001b6d36defSMax Reitz     uint64_t nb_clusters;
2002d2cb36afSEric Blake     int64_t cleared;
20035ea929e3SKevin Wolf     int ret;
20045ea929e3SKevin Wolf 
2005f10ee139SEric Blake     /* Caller must pass aligned values, except at image end */
20060c1bd469SEric Blake     assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
2007f10ee139SEric Blake     assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
2008f10ee139SEric Blake            end_offset == bs->total_sectors << BDRV_SECTOR_BITS);
20095ea929e3SKevin Wolf 
2010d2cb36afSEric Blake     nb_clusters = size_to_clusters(s, bytes);
20115ea929e3SKevin Wolf 
20120b919faeSKevin Wolf     s->cache_discards = true;
20130b919faeSKevin Wolf 
201421ab3addSAlberto Garcia     /* Each L2 slice is handled by its own loop iteration */
20155ea929e3SKevin Wolf     while (nb_clusters > 0) {
201621ab3addSAlberto Garcia         cleared = discard_in_l2_slice(bs, offset, nb_clusters, type,
2017d2cb36afSEric Blake                                       full_discard);
2018d2cb36afSEric Blake         if (cleared < 0) {
2019d2cb36afSEric Blake             ret = cleared;
20200b919faeSKevin Wolf             goto fail;
20215ea929e3SKevin Wolf         }
20225ea929e3SKevin Wolf 
2023d2cb36afSEric Blake         nb_clusters -= cleared;
2024d2cb36afSEric Blake         offset += (cleared * s->cluster_size);
20255ea929e3SKevin Wolf     }
20265ea929e3SKevin Wolf 
20270b919faeSKevin Wolf     ret = 0;
20280b919faeSKevin Wolf fail:
20290b919faeSKevin Wolf     s->cache_discards = false;
20300b919faeSKevin Wolf     qcow2_process_discards(bs, ret);
20310b919faeSKevin Wolf 
20320b919faeSKevin Wolf     return ret;
20335ea929e3SKevin Wolf }
2034621f0589SKevin Wolf 
2035621f0589SKevin Wolf /*
2036621f0589SKevin Wolf  * This zeroes as many of the nb_clusters clusters as possible at once
2037a9a9f8f0SAlberto Garcia  * (i.e. all clusters in the same L2 slice) and returns the number of
2038621f0589SKevin Wolf  * zeroed clusters.
2039621f0589SKevin Wolf  */
20400bb79c97SKevin Wolf static int coroutine_fn GRAPH_RDLOCK
204170bacc44SPaolo Bonzini zero_in_l2_slice(BlockDriverState *bs, uint64_t offset,
2042170f4b2eSFam Zheng                  uint64_t nb_clusters, int flags)
2043621f0589SKevin Wolf {
2044ff99129aSKevin Wolf     BDRVQcow2State *s = bs->opaque;
2045a9a9f8f0SAlberto Garcia     uint64_t *l2_slice;
2046621f0589SKevin Wolf     int l2_index;
2047621f0589SKevin Wolf     int ret;
2048621f0589SKevin Wolf     int i;
2049621f0589SKevin Wolf 
2050a9a9f8f0SAlberto Garcia     ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
2051621f0589SKevin Wolf     if (ret < 0) {
2052621f0589SKevin Wolf         return ret;
2053621f0589SKevin Wolf     }
2054621f0589SKevin Wolf 
2055a9a9f8f0SAlberto Garcia     /* Limit nb_clusters to one L2 slice */
2056a9a9f8f0SAlberto Garcia     nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
2057b6d36defSMax Reitz     assert(nb_clusters <= INT_MAX);
2058621f0589SKevin Wolf 
2059621f0589SKevin Wolf     for (i = 0; i < nb_clusters; i++) {
2060205fa507SAlberto Garcia         uint64_t old_l2_entry = get_l2_entry(s, l2_slice, l2_index + i);
2061205fa507SAlberto Garcia         uint64_t old_l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i);
2062205fa507SAlberto Garcia         QCow2ClusterType type = qcow2_get_cluster_type(bs, old_l2_entry);
2063205fa507SAlberto Garcia         bool unmap = (type == QCOW2_CLUSTER_COMPRESSED) ||
2064205fa507SAlberto Garcia             ((flags & BDRV_REQ_MAY_UNMAP) && qcow2_cluster_is_allocated(type));
2065b2b10904SJean-Louis Dupond         bool keep_reference =
2066b2b10904SJean-Louis Dupond             (s->discard_no_unref && type != QCOW2_CLUSTER_COMPRESSED);
2067b2b10904SJean-Louis Dupond         uint64_t new_l2_entry = old_l2_entry;
2068205fa507SAlberto Garcia         uint64_t new_l2_bitmap = old_l2_bitmap;
2069621f0589SKevin Wolf 
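        /*
         * Whatever the flags, the updated entry always reads back as
         * zeroes: either via the all-zeroes subcluster bitmap or via
         * QCOW_OFLAG_ZERO. The host cluster itself is only released
         * (entry cleared and refcount dropped) when unmap is set and the
         * reference is not kept.
         */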
2070b2b10904SJean-Louis Dupond         if (unmap && !keep_reference) {
2071b2b10904SJean-Louis Dupond             new_l2_entry = 0;
2072b2b10904SJean-Louis Dupond         }
2073b2b10904SJean-Louis Dupond 
2074205fa507SAlberto Garcia         if (has_subclusters(s)) {
2075205fa507SAlberto Garcia             new_l2_bitmap = QCOW_L2_BITMAP_ALL_ZEROES;
2076205fa507SAlberto Garcia         } else {
2077205fa507SAlberto Garcia             new_l2_entry |= QCOW_OFLAG_ZERO;
2078205fa507SAlberto Garcia         }
2079621f0589SKevin Wolf 
2080205fa507SAlberto Garcia         if (old_l2_entry == new_l2_entry && old_l2_bitmap == new_l2_bitmap) {
208106cc5e2bSEric Blake             continue;
208206cc5e2bSEric Blake         }
208306cc5e2bSEric Blake 
2084c8bf9a91SMaxim Levitsky         /* First update L2 entries */
2085a9a9f8f0SAlberto Garcia         qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
2086205fa507SAlberto Garcia         set_l2_entry(s, l2_slice, l2_index + i, new_l2_entry);
2087205fa507SAlberto Garcia         if (has_subclusters(s)) {
2088205fa507SAlberto Garcia             set_l2_bitmap(s, l2_slice, l2_index + i, new_l2_bitmap);
2089621f0589SKevin Wolf         }
2090c8bf9a91SMaxim Levitsky 
2091c8bf9a91SMaxim Levitsky         if (unmap) {
2092b2b10904SJean-Louis Dupond             if (!keep_reference) {
2093b2b10904SJean-Louis Dupond                 /* Then decrease the refcount */
2094c8bf9a91SMaxim Levitsky                 qcow2_free_any_cluster(bs, old_l2_entry, QCOW2_DISCARD_REQUEST);
2095b2b10904SJean-Louis Dupond             } else if (s->discard_passthrough[QCOW2_DISCARD_REQUEST] &&
2096b2b10904SJean-Louis Dupond                        (type == QCOW2_CLUSTER_NORMAL ||
2097b2b10904SJean-Louis Dupond                         type == QCOW2_CLUSTER_ZERO_ALLOC)) {
2098b2b10904SJean-Louis Dupond                 /* If we keep the reference, pass on the discard still */
2099b2b10904SJean-Louis Dupond                 bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK,
2100b2b10904SJean-Louis Dupond                               s->cluster_size);
2101b2b10904SJean-Louis Dupond             }
2102c8bf9a91SMaxim Levitsky         }
2103621f0589SKevin Wolf     }
2104621f0589SKevin Wolf 
2105a9a9f8f0SAlberto Garcia     qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
2106621f0589SKevin Wolf 
2107621f0589SKevin Wolf     return nb_clusters;
2108621f0589SKevin Wolf }
2109621f0589SKevin Wolf 
21100bb79c97SKevin Wolf static int coroutine_fn GRAPH_RDLOCK
2111a39bae4eSPaolo Bonzini zero_l2_subclusters(BlockDriverState *bs, uint64_t offset,
2112a6841a2dSAlberto Garcia                     unsigned nb_subclusters)
2113a6841a2dSAlberto Garcia {
2114a6841a2dSAlberto Garcia     BDRVQcow2State *s = bs->opaque;
2115a6841a2dSAlberto Garcia     uint64_t *l2_slice;
2116a6841a2dSAlberto Garcia     uint64_t old_l2_bitmap, l2_bitmap;
2117a6841a2dSAlberto Garcia     int l2_index, ret, sc = offset_to_sc_index(s, offset);
2118a6841a2dSAlberto Garcia 
2119a6841a2dSAlberto Garcia     /* For full clusters use zero_in_l2_slice() instead */
2120a6841a2dSAlberto Garcia     assert(nb_subclusters > 0 && nb_subclusters < s->subclusters_per_cluster);
2121a6841a2dSAlberto Garcia     assert(sc + nb_subclusters <= s->subclusters_per_cluster);
2122a6841a2dSAlberto Garcia     assert(offset_into_subcluster(s, offset) == 0);
2123a6841a2dSAlberto Garcia 
2124a6841a2dSAlberto Garcia     ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
2125a6841a2dSAlberto Garcia     if (ret < 0) {
2126a6841a2dSAlberto Garcia         return ret;
2127a6841a2dSAlberto Garcia     }
2128a6841a2dSAlberto Garcia 
2129a6841a2dSAlberto Garcia     switch (qcow2_get_cluster_type(bs, get_l2_entry(s, l2_slice, l2_index))) {
2130a6841a2dSAlberto Garcia     case QCOW2_CLUSTER_COMPRESSED:
2131a6841a2dSAlberto Garcia         ret = -ENOTSUP; /* We cannot partially zeroize compressed clusters */
2132a6841a2dSAlberto Garcia         goto out;
2133a6841a2dSAlberto Garcia     case QCOW2_CLUSTER_NORMAL:
2134a6841a2dSAlberto Garcia     case QCOW2_CLUSTER_UNALLOCATED:
2135a6841a2dSAlberto Garcia         break;
2136a6841a2dSAlberto Garcia     default:
2137a6841a2dSAlberto Garcia         g_assert_not_reached();
2138a6841a2dSAlberto Garcia     }
2139a6841a2dSAlberto Garcia 
2140a6841a2dSAlberto Garcia     old_l2_bitmap = l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);
2141a6841a2dSAlberto Garcia 
2142a6841a2dSAlberto Garcia     l2_bitmap |=  QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters);
2143a6841a2dSAlberto Garcia     l2_bitmap &= ~QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters);
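    /*
     * Example, assuming extended L2 entries (32 subclusters per cluster):
     * with sc = 4 and nb_subclusters = 3, this sets the "reads as zero"
     * bits 36-38 in the top half of the bitmap and clears the "allocated"
     * bits 4-6 in the bottom half, so subclusters 4-6 read back as zeroes
     * without touching their host storage.
     */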
2144a6841a2dSAlberto Garcia 
2145a6841a2dSAlberto Garcia     if (old_l2_bitmap != l2_bitmap) {
2146a6841a2dSAlberto Garcia         set_l2_bitmap(s, l2_slice, l2_index, l2_bitmap);
2147a6841a2dSAlberto Garcia         qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
2148a6841a2dSAlberto Garcia     }
2149a6841a2dSAlberto Garcia 
2150a6841a2dSAlberto Garcia     ret = 0;
2151a6841a2dSAlberto Garcia out:
2152a6841a2dSAlberto Garcia     qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
2153a6841a2dSAlberto Garcia 
2154a6841a2dSAlberto Garcia     return ret;
2155a6841a2dSAlberto Garcia }
2156a6841a2dSAlberto Garcia 
2157050ed2e7SPaolo Bonzini int coroutine_fn qcow2_subcluster_zeroize(BlockDriverState *bs, uint64_t offset,
2158d2cb36afSEric Blake                                           uint64_t bytes, int flags)
2159621f0589SKevin Wolf {
2160ff99129aSKevin Wolf     BDRVQcow2State *s = bs->opaque;
2161d2cb36afSEric Blake     uint64_t end_offset = offset + bytes;
2162b6d36defSMax Reitz     uint64_t nb_clusters;
2163a6841a2dSAlberto Garcia     unsigned head, tail;
2164d2cb36afSEric Blake     int64_t cleared;
2165621f0589SKevin Wolf     int ret;
2166621f0589SKevin Wolf 
21676c3944dcSKevin Wolf     /* If we have to stay in sync with an external data file, zero out
21686c3944dcSKevin Wolf      * s->data_file first. */
21696c3944dcSKevin Wolf     if (data_file_is_raw(bs)) {
21706c3944dcSKevin Wolf         assert(has_data_file(bs));
21716c3944dcSKevin Wolf         ret = bdrv_co_pwrite_zeroes(s->data_file, offset, bytes, flags);
21726c3944dcSKevin Wolf         if (ret < 0) {
21736c3944dcSKevin Wolf             return ret;
21746c3944dcSKevin Wolf         }
21756c3944dcSKevin Wolf     }
21766c3944dcSKevin Wolf 
2177f10ee139SEric Blake     /* Caller must pass aligned values, except at image end */
2178a6841a2dSAlberto Garcia     assert(offset_into_subcluster(s, offset) == 0);
2179a6841a2dSAlberto Garcia     assert(offset_into_subcluster(s, end_offset) == 0 ||
2180f01643fbSKevin Wolf            end_offset >= bs->total_sectors << BDRV_SECTOR_BITS);
2181f10ee139SEric Blake 
218261b30439SKevin Wolf     /*
218361b30439SKevin Wolf      * The zero flag is only supported by version 3 and newer. However, if we
218461b30439SKevin Wolf      * have no backing file, we can resort to discard in version 2.
218561b30439SKevin Wolf      */
2186621f0589SKevin Wolf     if (s->qcow_version < 3) {
218761b30439SKevin Wolf         if (!bs->backing) {
218861b30439SKevin Wolf             return qcow2_cluster_discard(bs, offset, bytes,
218961b30439SKevin Wolf                                          QCOW2_DISCARD_REQUEST, false);
219061b30439SKevin Wolf         }
2191621f0589SKevin Wolf         return -ENOTSUP;
2192621f0589SKevin Wolf     }
2193621f0589SKevin Wolf 
2194a6841a2dSAlberto Garcia     head = MIN(end_offset, ROUND_UP(offset, s->cluster_size)) - offset;
2195a6841a2dSAlberto Garcia     offset += head;
2196a6841a2dSAlberto Garcia 
2197a6841a2dSAlberto Garcia     tail = (end_offset >= bs->total_sectors << BDRV_SECTOR_BITS) ? 0 :
2198a6841a2dSAlberto Garcia         end_offset - MAX(offset, start_of_cluster(s, end_offset));
2199a6841a2dSAlberto Garcia     end_offset -= tail;
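    /*
     * Example, assuming 64k clusters and 2k subclusters: for offset = 2k
     * and bytes = 128k (end_offset = 130k), head = 62k and tail = 2k.
     * Subclusters 1-31 of the first cluster are zeroed with
     * zero_l2_subclusters(), the full cluster at 64k with
     * zero_in_l2_slice(), and the single leading subcluster of the
     * cluster at 128k again with zero_l2_subclusters().
     */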
2200621f0589SKevin Wolf 
22010b919faeSKevin Wolf     s->cache_discards = true;
22020b919faeSKevin Wolf 
2203a6841a2dSAlberto Garcia     if (head) {
2204a6841a2dSAlberto Garcia         ret = zero_l2_subclusters(bs, offset - head,
2205a6841a2dSAlberto Garcia                                   size_to_subclusters(s, head));
2206a6841a2dSAlberto Garcia         if (ret < 0) {
2207a6841a2dSAlberto Garcia             goto fail;
2208a6841a2dSAlberto Garcia         }
2209a6841a2dSAlberto Garcia     }
2210a6841a2dSAlberto Garcia 
2211a6841a2dSAlberto Garcia     /* Each L2 slice is handled by its own loop iteration */
2212a6841a2dSAlberto Garcia     nb_clusters = size_to_clusters(s, end_offset - offset);
2213a6841a2dSAlberto Garcia 
2214621f0589SKevin Wolf     while (nb_clusters > 0) {
2215a9a9f8f0SAlberto Garcia         cleared = zero_in_l2_slice(bs, offset, nb_clusters, flags);
2216d2cb36afSEric Blake         if (cleared < 0) {
2217d2cb36afSEric Blake             ret = cleared;
22180b919faeSKevin Wolf             goto fail;
2219621f0589SKevin Wolf         }
2220621f0589SKevin Wolf 
2221d2cb36afSEric Blake         nb_clusters -= cleared;
2222d2cb36afSEric Blake         offset += (cleared * s->cluster_size);
2223621f0589SKevin Wolf     }
2224621f0589SKevin Wolf 
2225a6841a2dSAlberto Garcia     if (tail) {
2226a6841a2dSAlberto Garcia         ret = zero_l2_subclusters(bs, end_offset, size_to_subclusters(s, tail));
2227a6841a2dSAlberto Garcia         if (ret < 0) {
2228a6841a2dSAlberto Garcia             goto fail;
2229a6841a2dSAlberto Garcia         }
2230a6841a2dSAlberto Garcia     }
2231a6841a2dSAlberto Garcia 
22320b919faeSKevin Wolf     ret = 0;
22330b919faeSKevin Wolf fail:
22340b919faeSKevin Wolf     s->cache_discards = false;
22350b919faeSKevin Wolf     qcow2_process_discards(bs, ret);
22360b919faeSKevin Wolf 
22370b919faeSKevin Wolf     return ret;
2238621f0589SKevin Wolf }
223932b6444dSMax Reitz 
224032b6444dSMax Reitz /*
224132b6444dSMax Reitz  * Expands all zero clusters in a specific L1 table (or deallocates them, for
224232b6444dSMax Reitz  * non-backed non-pre-allocated zero clusters).
224332b6444dSMax Reitz  *
22444057a2b2SMax Reitz  * l1_entries and *visited_l1_entries are used to keep track of progress for
22454057a2b2SMax Reitz  * status_cb(). l1_entries contains the total number of L1 entries and
22464057a2b2SMax Reitz  * *visited_l1_entries counts all visited L1 entries.
224732b6444dSMax Reitz  */
22480bb79c97SKevin Wolf static int GRAPH_RDLOCK
22490bb79c97SKevin Wolf expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
2250ecf58777SMax Reitz                            int l1_size, int64_t *visited_l1_entries,
22514057a2b2SMax Reitz                            int64_t l1_entries,
22528b13976dSMax Reitz                            BlockDriverAmendStatusCB *status_cb,
22538b13976dSMax Reitz                            void *cb_opaque)
225432b6444dSMax Reitz {
2255ff99129aSKevin Wolf     BDRVQcow2State *s = bs->opaque;
225632b6444dSMax Reitz     bool is_active_l1 = (l1_table == s->l1_table);
2257415184f5SAlberto Garcia     uint64_t *l2_slice = NULL;
2258415184f5SAlberto Garcia     unsigned slice, slice_size2, n_slices;
225932b6444dSMax Reitz     int ret;
226032b6444dSMax Reitz     int i, j;
226132b6444dSMax Reitz 
22627bbb5920SAlberto Garcia     /* qcow2_downgrade() is not allowed in images with subclusters */
22637bbb5920SAlberto Garcia     assert(!has_subclusters(s));
22647bbb5920SAlberto Garcia 
2265c8fd8554SAlberto Garcia     slice_size2 = s->l2_slice_size * l2_entry_size(s);
2266415184f5SAlberto Garcia     n_slices = s->cluster_size / slice_size2;
2267415184f5SAlberto Garcia 
226832b6444dSMax Reitz     if (!is_active_l1) {
226932b6444dSMax Reitz         /* inactive L2 tables are not in the L2 table cache, so they need
227032b6444dSMax Reitz          * a local buffer to be loaded into from disk */
2271415184f5SAlberto Garcia         l2_slice = qemu_try_blockalign(bs->file->bs, slice_size2);
2272415184f5SAlberto Garcia         if (l2_slice == NULL) {
2273de82815dSKevin Wolf             return -ENOMEM;
2274de82815dSKevin Wolf         }
227532b6444dSMax Reitz     }
227632b6444dSMax Reitz 
227732b6444dSMax Reitz     for (i = 0; i < l1_size; i++) {
227832b6444dSMax Reitz         uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
22790e06528eSMax Reitz         uint64_t l2_refcount;
228032b6444dSMax Reitz 
228132b6444dSMax Reitz         if (!l2_offset) {
228232b6444dSMax Reitz             /* unallocated */
22834057a2b2SMax Reitz             (*visited_l1_entries)++;
22844057a2b2SMax Reitz             if (status_cb) {
22858b13976dSMax Reitz                 status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
22864057a2b2SMax Reitz             }
228732b6444dSMax Reitz             continue;
228832b6444dSMax Reitz         }
228932b6444dSMax Reitz 
22908dd93d93SMax Reitz         if (offset_into_cluster(s, l2_offset)) {
22918dd93d93SMax Reitz             qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
22928dd93d93SMax Reitz                                     PRIx64 " unaligned (L1 index: %#x)",
22938dd93d93SMax Reitz                                     l2_offset, i);
22948dd93d93SMax Reitz             ret = -EIO;
22958dd93d93SMax Reitz             goto fail;
22968dd93d93SMax Reitz         }
22978dd93d93SMax Reitz 
22989b765486SAlberto Garcia         ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
22999b765486SAlberto Garcia                                  &l2_refcount);
23009b765486SAlberto Garcia         if (ret < 0) {
23019b765486SAlberto Garcia             goto fail;
23029b765486SAlberto Garcia         }
23039b765486SAlberto Garcia 
2304415184f5SAlberto Garcia         for (slice = 0; slice < n_slices; slice++) {
2305415184f5SAlberto Garcia             uint64_t slice_offset = l2_offset + slice * slice_size2;
2306415184f5SAlberto Garcia             bool l2_dirty = false;
230732b6444dSMax Reitz             if (is_active_l1) {
230832b6444dSMax Reitz                 /* get active L2 tables from cache */
2309415184f5SAlberto Garcia                 ret = qcow2_cache_get(bs, s->l2_table_cache, slice_offset,
2310415184f5SAlberto Garcia                                       (void **)&l2_slice);
231132b6444dSMax Reitz             } else {
231232b6444dSMax Reitz                 /* load inactive L2 tables from disk */
231332cc71deSAlberto Faria                 ret = bdrv_pread(bs->file, slice_offset, slice_size2,
231432cc71deSAlberto Faria                                  l2_slice, 0);
231532b6444dSMax Reitz             }
231632b6444dSMax Reitz             if (ret < 0) {
231732b6444dSMax Reitz                 goto fail;
231832b6444dSMax Reitz             }
231932b6444dSMax Reitz 
2320415184f5SAlberto Garcia             for (j = 0; j < s->l2_slice_size; j++) {
232112c6aebeSAlberto Garcia                 uint64_t l2_entry = get_l2_entry(s, l2_slice, j);
2322ecf58777SMax Reitz                 int64_t offset = l2_entry & L2E_OFFSET_MASK;
2323226494ffSAlberto Garcia                 QCow2ClusterType cluster_type =
2324808c2bb4SKevin Wolf                     qcow2_get_cluster_type(bs, l2_entry);
232532b6444dSMax Reitz 
2326fdfab37dSEric Blake                 if (cluster_type != QCOW2_CLUSTER_ZERO_PLAIN &&
2327fdfab37dSEric Blake                     cluster_type != QCOW2_CLUSTER_ZERO_ALLOC) {
232832b6444dSMax Reitz                     continue;
232932b6444dSMax Reitz                 }
233032b6444dSMax Reitz 
2331fdfab37dSEric Blake                 if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
2332760e0063SKevin Wolf                     if (!bs->backing) {
23337bbb5920SAlberto Garcia                         /*
23347bbb5920SAlberto Garcia                          * not backed; therefore we can simply deallocate the
23357bbb5920SAlberto Garcia                          * cluster. No need to call set_l2_bitmap(), this
23367bbb5920SAlberto Garcia                          * function doesn't support images with subclusters.
23377bbb5920SAlberto Garcia                          */
233812c6aebeSAlberto Garcia                         set_l2_entry(s, l2_slice, j, 0);
233932b6444dSMax Reitz                         l2_dirty = true;
234032b6444dSMax Reitz                         continue;
234132b6444dSMax Reitz                     }
234232b6444dSMax Reitz 
234332b6444dSMax Reitz                     offset = qcow2_alloc_clusters(bs, s->cluster_size);
234432b6444dSMax Reitz                     if (offset < 0) {
234532b6444dSMax Reitz                         ret = offset;
234632b6444dSMax Reitz                         goto fail;
234732b6444dSMax Reitz                     }
2348ecf58777SMax Reitz 
23493a75a870SAlberto Garcia                     /* The offset must fit in the offset field */
23503a75a870SAlberto Garcia                     assert((offset & L2E_OFFSET_MASK) == offset);
23513a75a870SAlberto Garcia 
2352ecf58777SMax Reitz                     if (l2_refcount > 1) {
2353226494ffSAlberto Garcia                         /* For shared L2 tables, set the refcount accordingly
2354226494ffSAlberto Garcia                          * (it is already 1 and needs to be l2_refcount) */
2355226494ffSAlberto Garcia                         ret = qcow2_update_cluster_refcount(
2356226494ffSAlberto Garcia                             bs, offset >> s->cluster_bits,
23572aabe7c7SMax Reitz                             refcount_diff(1, l2_refcount), false,
2358ecf58777SMax Reitz                             QCOW2_DISCARD_OTHER);
2359ecf58777SMax Reitz                         if (ret < 0) {
2360ecf58777SMax Reitz                             qcow2_free_clusters(bs, offset, s->cluster_size,
2361ecf58777SMax Reitz                                                 QCOW2_DISCARD_OTHER);
2362ecf58777SMax Reitz                             goto fail;
2363ecf58777SMax Reitz                         }
2364ecf58777SMax Reitz                     }
236532b6444dSMax Reitz                 }
236632b6444dSMax Reitz 
23678dd93d93SMax Reitz                 if (offset_into_cluster(s, offset)) {
2368415184f5SAlberto Garcia                     int l2_index = slice * s->l2_slice_size + j;
2369226494ffSAlberto Garcia                     qcow2_signal_corruption(
2370226494ffSAlberto Garcia                         bs, true, -1, -1,
2371bcb07dbaSEric Blake                         "Cluster allocation offset "
23728dd93d93SMax Reitz                         "%#" PRIx64 " unaligned (L2 offset: %#"
23738dd93d93SMax Reitz                         PRIx64 ", L2 index: %#x)", offset,
2374415184f5SAlberto Garcia                         l2_offset, l2_index);
2375fdfab37dSEric Blake                     if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
23768dd93d93SMax Reitz                         qcow2_free_clusters(bs, offset, s->cluster_size,
23778dd93d93SMax Reitz                                             QCOW2_DISCARD_ALWAYS);
23788dd93d93SMax Reitz                     }
23798dd93d93SMax Reitz                     ret = -EIO;
23808dd93d93SMax Reitz                     goto fail;
23818dd93d93SMax Reitz                 }
23828dd93d93SMax Reitz 
2383226494ffSAlberto Garcia                 ret = qcow2_pre_write_overlap_check(bs, 0, offset,
2384966b000fSKevin Wolf                                                     s->cluster_size, true);
238532b6444dSMax Reitz                 if (ret < 0) {
2386fdfab37dSEric Blake                     if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
238732b6444dSMax Reitz                         qcow2_free_clusters(bs, offset, s->cluster_size,
238832b6444dSMax Reitz                                             QCOW2_DISCARD_ALWAYS);
2389320c7066SMax Reitz                     }
239032b6444dSMax Reitz                     goto fail;
239132b6444dSMax Reitz                 }
239232b6444dSMax Reitz 
2393966b000fSKevin Wolf                 ret = bdrv_pwrite_zeroes(s->data_file, offset,
2394966b000fSKevin Wolf                                          s->cluster_size, 0);
239532b6444dSMax Reitz                 if (ret < 0) {
2396fdfab37dSEric Blake                     if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
239732b6444dSMax Reitz                         qcow2_free_clusters(bs, offset, s->cluster_size,
239832b6444dSMax Reitz                                             QCOW2_DISCARD_ALWAYS);
2399320c7066SMax Reitz                     }
240032b6444dSMax Reitz                     goto fail;
240132b6444dSMax Reitz                 }
240232b6444dSMax Reitz 
2403ecf58777SMax Reitz                 if (l2_refcount == 1) {
240412c6aebeSAlberto Garcia                     set_l2_entry(s, l2_slice, j, offset | QCOW_OFLAG_COPIED);
2405ecf58777SMax Reitz                 } else {
240612c6aebeSAlberto Garcia                     set_l2_entry(s, l2_slice, j, offset);
2407e390cf5aSMax Reitz                 }
24087bbb5920SAlberto Garcia                 /*
24097bbb5920SAlberto Garcia                  * No need to call set_l2_bitmap() after set_l2_entry() because
24107bbb5920SAlberto Garcia                  * this function doesn't support images with subclusters.
24117bbb5920SAlberto Garcia                  */
2412ecf58777SMax Reitz                 l2_dirty = true;
241332b6444dSMax Reitz             }
241432b6444dSMax Reitz 
241532b6444dSMax Reitz             if (is_active_l1) {
241632b6444dSMax Reitz                 if (l2_dirty) {
2417415184f5SAlberto Garcia                     qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
241832b6444dSMax Reitz                     qcow2_cache_depends_on_flush(s->l2_table_cache);
241932b6444dSMax Reitz                 }
2420415184f5SAlberto Garcia                 qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
242132b6444dSMax Reitz             } else {
242232b6444dSMax Reitz                 if (l2_dirty) {
2423226494ffSAlberto Garcia                     ret = qcow2_pre_write_overlap_check(
2424226494ffSAlberto Garcia                         bs, QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2,
2425966b000fSKevin Wolf                         slice_offset, slice_size2, false);
242632b6444dSMax Reitz                     if (ret < 0) {
242732b6444dSMax Reitz                         goto fail;
242832b6444dSMax Reitz                     }
242932b6444dSMax Reitz 
243032cc71deSAlberto Faria                     ret = bdrv_pwrite(bs->file, slice_offset, slice_size2,
243132cc71deSAlberto Faria                                       l2_slice, 0);
243232b6444dSMax Reitz                     if (ret < 0) {
243332b6444dSMax Reitz                         goto fail;
243432b6444dSMax Reitz                     }
243532b6444dSMax Reitz                 }
243632b6444dSMax Reitz             }
2437226494ffSAlberto Garcia         }
24384057a2b2SMax Reitz 
24394057a2b2SMax Reitz         (*visited_l1_entries)++;
24404057a2b2SMax Reitz         if (status_cb) {
24418b13976dSMax Reitz             status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
24424057a2b2SMax Reitz         }
244332b6444dSMax Reitz     }
244432b6444dSMax Reitz 
244532b6444dSMax Reitz     ret = 0;
244632b6444dSMax Reitz 
244732b6444dSMax Reitz fail:
2448415184f5SAlberto Garcia     if (l2_slice) {
244932b6444dSMax Reitz         if (!is_active_l1) {
2450415184f5SAlberto Garcia             qemu_vfree(l2_slice);
245132b6444dSMax Reitz         } else {
2452415184f5SAlberto Garcia             qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
245332b6444dSMax Reitz         }
245432b6444dSMax Reitz     }
245532b6444dSMax Reitz     return ret;
245632b6444dSMax Reitz }
245732b6444dSMax Reitz 
245832b6444dSMax Reitz /*
245932b6444dSMax Reitz  * For backed images, expands all zero clusters on the image. For non-backed
246032b6444dSMax Reitz  * images, deallocates all non-pre-allocated zero clusters (and claims the
246132b6444dSMax Reitz  * allocation for pre-allocated ones). This is important for downgrading to a
246232b6444dSMax Reitz  * qcow2 version which doesn't yet support metadata zero clusters.
246332b6444dSMax Reitz  */
24644057a2b2SMax Reitz int qcow2_expand_zero_clusters(BlockDriverState *bs,
24658b13976dSMax Reitz                                BlockDriverAmendStatusCB *status_cb,
24668b13976dSMax Reitz                                void *cb_opaque)
246732b6444dSMax Reitz {
2468ff99129aSKevin Wolf     BDRVQcow2State *s = bs->opaque;
246932b6444dSMax Reitz     uint64_t *l1_table = NULL;
24704057a2b2SMax Reitz     int64_t l1_entries = 0, visited_l1_entries = 0;
247132b6444dSMax Reitz     int ret;
247232b6444dSMax Reitz     int i, j;
247332b6444dSMax Reitz 
24744057a2b2SMax Reitz     if (status_cb) {
24754057a2b2SMax Reitz         l1_entries = s->l1_size;
24764057a2b2SMax Reitz         for (i = 0; i < s->nb_snapshots; i++) {
24774057a2b2SMax Reitz             l1_entries += s->snapshots[i].l1_size;
24784057a2b2SMax Reitz         }
24794057a2b2SMax Reitz     }
24804057a2b2SMax Reitz 
248132b6444dSMax Reitz     ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
24824057a2b2SMax Reitz                                      &visited_l1_entries, l1_entries,
24838b13976dSMax Reitz                                      status_cb, cb_opaque);
248432b6444dSMax Reitz     if (ret < 0) {
248532b6444dSMax Reitz         goto fail;
248632b6444dSMax Reitz     }
248732b6444dSMax Reitz 
248832b6444dSMax Reitz     /* Inactive L1 tables may point to active L2 tables - therefore it is
248932b6444dSMax Reitz      * necessary to flush the L2 table cache before trying to access the L2
249032b6444dSMax Reitz      * tables pointed to by inactive L1 entries (else we might try to expand
249132b6444dSMax Reitz      * zero clusters that have already been expanded); furthermore, it is also
249232b6444dSMax Reitz      * necessary to empty the L2 table cache, since it may contain tables which
249332b6444dSMax Reitz      * are now going to be modified directly on disk, bypassing the cache.
249432b6444dSMax Reitz      * qcow2_cache_empty() does both for us. */
249532b6444dSMax Reitz     ret = qcow2_cache_empty(bs, s->l2_table_cache);
249632b6444dSMax Reitz     if (ret < 0) {
249732b6444dSMax Reitz         goto fail;
249832b6444dSMax Reitz     }
249932b6444dSMax Reitz 
250032b6444dSMax Reitz     for (i = 0; i < s->nb_snapshots; i++) {
2501c9a442e4SAlberto Garcia         int l1_size2;
2502c9a442e4SAlberto Garcia         uint64_t *new_l1_table;
2503c9a442e4SAlberto Garcia         Error *local_err = NULL;
250432b6444dSMax Reitz 
2505c9a442e4SAlberto Garcia         ret = qcow2_validate_table(bs, s->snapshots[i].l1_table_offset,
250602b1ecfaSAlberto Garcia                                    s->snapshots[i].l1_size, L1E_SIZE,
2507c9a442e4SAlberto Garcia                                    QCOW_MAX_L1_SIZE, "Snapshot L1 table",
2508c9a442e4SAlberto Garcia                                    &local_err);
2509c9a442e4SAlberto Garcia         if (ret < 0) {
2510c9a442e4SAlberto Garcia             error_report_err(local_err);
2511c9a442e4SAlberto Garcia             goto fail;
2512c9a442e4SAlberto Garcia         }
2513c9a442e4SAlberto Garcia 
251402b1ecfaSAlberto Garcia         l1_size2 = s->snapshots[i].l1_size * L1E_SIZE;
2515c9a442e4SAlberto Garcia         new_l1_table = g_try_realloc(l1_table, l1_size2);
2516de7269d2SAlberto Garcia 
2517de7269d2SAlberto Garcia         if (!new_l1_table) {
2518de7269d2SAlberto Garcia             ret = -ENOMEM;
2519de7269d2SAlberto Garcia             goto fail;
2520de7269d2SAlberto Garcia         }
2521de7269d2SAlberto Garcia 
2522de7269d2SAlberto Garcia         l1_table = new_l1_table;
252332b6444dSMax Reitz 
252432cc71deSAlberto Faria         ret = bdrv_pread(bs->file, s->snapshots[i].l1_table_offset, l1_size2,
252532cc71deSAlberto Faria                          l1_table, 0);
252632b6444dSMax Reitz         if (ret < 0) {
252732b6444dSMax Reitz             goto fail;
252832b6444dSMax Reitz         }
252932b6444dSMax Reitz 
253032b6444dSMax Reitz         for (j = 0; j < s->snapshots[i].l1_size; j++) {
253132b6444dSMax Reitz             be64_to_cpus(&l1_table[j]);
253232b6444dSMax Reitz         }
253332b6444dSMax Reitz 
253432b6444dSMax Reitz         ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
25354057a2b2SMax Reitz                                          &visited_l1_entries, l1_entries,
25368b13976dSMax Reitz                                          status_cb, cb_opaque);
253732b6444dSMax Reitz         if (ret < 0) {
253832b6444dSMax Reitz             goto fail;
253932b6444dSMax Reitz         }
254032b6444dSMax Reitz     }
254132b6444dSMax Reitz 
254232b6444dSMax Reitz     ret = 0;
254332b6444dSMax Reitz 
254432b6444dSMax Reitz fail:
254532b6444dSMax Reitz     g_free(l1_table);
254632b6444dSMax Reitz     return ret;
254732b6444dSMax Reitz }
2548a6e09846SVladimir Sementsov-Ogievskiy 
2549a6e09846SVladimir Sementsov-Ogievskiy void qcow2_parse_compressed_l2_entry(BlockDriverState *bs, uint64_t l2_entry,
2550a6e09846SVladimir Sementsov-Ogievskiy                                      uint64_t *coffset, int *csize)
2551a6e09846SVladimir Sementsov-Ogievskiy {
2552a6e09846SVladimir Sementsov-Ogievskiy     BDRVQcow2State *s = bs->opaque;
2553a6e09846SVladimir Sementsov-Ogievskiy     int nb_csectors;
2554a6e09846SVladimir Sementsov-Ogievskiy 
2555a6e09846SVladimir Sementsov-Ogievskiy     assert(qcow2_get_cluster_type(bs, l2_entry) == QCOW2_CLUSTER_COMPRESSED);
2556a6e09846SVladimir Sementsov-Ogievskiy 
2557a6e09846SVladimir Sementsov-Ogievskiy     *coffset = l2_entry & s->cluster_offset_mask;
2558a6e09846SVladimir Sementsov-Ogievskiy 
2559a6e09846SVladimir Sementsov-Ogievskiy     nb_csectors = ((l2_entry >> s->csize_shift) & s->csize_mask) + 1;
2560a6e09846SVladimir Sementsov-Ogievskiy     *csize = nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE -
2561a6e09846SVladimir Sementsov-Ogievskiy         (*coffset & (QCOW2_COMPRESSED_SECTOR_SIZE - 1));
2562a6e09846SVladimir Sementsov-Ogievskiy }
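/*
 * Worked example for the size calculation above: a descriptor that decodes
 * to coffset = 0x50300 and nb_csectors = 3 spans the 512-byte sectors
 * [0x50200, 0x50800) on disk, so csize = 3 * 512 - 0x100 = 1280, i.e. the
 * bytes from coffset up to the end of the last compressed sector.
 */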
2563