145aba42fSKevin Wolf /* 245aba42fSKevin Wolf * Block driver for the QCOW version 2 format 345aba42fSKevin Wolf * 445aba42fSKevin Wolf * Copyright (c) 2004-2006 Fabrice Bellard 545aba42fSKevin Wolf * 645aba42fSKevin Wolf * Permission is hereby granted, free of charge, to any person obtaining a copy 745aba42fSKevin Wolf * of this software and associated documentation files (the "Software"), to deal 845aba42fSKevin Wolf * in the Software without restriction, including without limitation the rights 945aba42fSKevin Wolf * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 1045aba42fSKevin Wolf * copies of the Software, and to permit persons to whom the Software is 1145aba42fSKevin Wolf * furnished to do so, subject to the following conditions: 1245aba42fSKevin Wolf * 1345aba42fSKevin Wolf * The above copyright notice and this permission notice shall be included in 1445aba42fSKevin Wolf * all copies or substantial portions of the Software. 1545aba42fSKevin Wolf * 1645aba42fSKevin Wolf * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1745aba42fSKevin Wolf * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1845aba42fSKevin Wolf * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 1945aba42fSKevin Wolf * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 2045aba42fSKevin Wolf * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 2145aba42fSKevin Wolf * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 2245aba42fSKevin Wolf * THE SOFTWARE. 2345aba42fSKevin Wolf */ 2445aba42fSKevin Wolf 2580c71a24SPeter Maydell #include "qemu/osdep.h" 2645aba42fSKevin Wolf #include <zlib.h> 2745aba42fSKevin Wolf 28c9a442e4SAlberto Garcia #include "qapi/error.h" 2945aba42fSKevin Wolf #include "qemu-common.h" 30737e150eSPaolo Bonzini #include "block/block_int.h" 31*0d8c41daSMichael S. 
#include "qcow2.h"
#include "qemu/bswap.h"
#include "trace.h"

/*
 * qcow2_shrink_l1_table
 *
 * @bs: The BlockDriverState
 * @exact_size: Number of L1 entries to keep.
 *
 * Zeroes the L1 entries from index @exact_size up to s->l1_size on disk,
 * flushes the image file, and then frees the L2 table cluster referenced by
 * each dropped entry and clears that entry in memory.
 *
 * Nothing is done if @exact_size is not smaller than the current L1 size.
 * Note that s->l1_size itself is not modified by this function.
 *
 * Returns 0 on success, or the negative value returned by the failing
 * write/flush call.
 */
int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t exact_size)
{
    BDRVQcow2State *s = bs->opaque;
    int new_l1_size, i, ret;

    if (exact_size >= s->l1_size) {
        return 0;
    }

    /* exact_size < s->l1_size (an int) here, so the narrowing is lossless */
    new_l1_size = exact_size;

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "shrink l1_table from %d to %d\n", s->l1_size, new_l1_size);
#endif

    /* First drop the references on disk ... */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset +
                                       new_l1_size * sizeof(uint64_t),
                             (s->l1_size - new_l1_size) * sizeof(uint64_t), 0);
    if (ret < 0) {
        goto fail;
    }

    ret = bdrv_flush(bs->file->bs);
    if (ret < 0) {
        goto fail;
    }

    /* ... and only then release the L2 clusters they pointed to */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
    for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
        if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) {
            /* entry does not reference an L2 table */
            continue;
        }
        qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK,
                            s->cluster_size, QCOW2_DISCARD_ALWAYS);
        s->l1_table[i] = 0;
    }
    return 0;

fail:
    /*
     * If the write in the l1_table failed the image may contain a partially
     * overwritten l1_table. In this case it would be better to clear the
     * l1_table in memory to avoid possible image corruption.
     */
    memset(s->l1_table + new_l1_size, 0,
           (s->l1_size - new_l1_size) * sizeof(uint64_t));
    return ret;
}

/*
 * qcow2_grow_l1_table
 *
 * @bs: The BlockDriverState
 * @min_size: Minimum number of L1 entries required.
 * @exact_size: If true the new table has exactly @min_size entries;
 *              otherwise the current size is repeatedly multiplied by 1.5
 *              (rounded up) until it is at least @min_size.
 *
 * Allocates clusters for a larger L1 table, writes a copy of the current
 * table (zero-padded) there, points the image header at it and finally
 * frees the clusters of the old table.
 *
 * Returns 0 on success (also when the table is already large enough),
 * -EFBIG if the requested size is too large, -ENOMEM if the in-memory table
 * cannot be allocated, or the negative value returned by the failing
 * allocation/flush/check/write call.
 */
int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                        bool exact_size)
{
    BDRVQcow2State *s = bs->opaque;
    int new_l1_size2, ret, i;
    uint64_t *new_l1_table;
    int64_t old_l1_table_offset, old_l1_size;
    int64_t new_l1_table_offset, new_l1_size;
    /* header update buffer: 4 bytes of L1 size followed by 8 bytes of offset */
    uint8_t data[12];

    if (min_size <= s->l1_size)
        return 0;

    /* Do a sanity check on min_size before trying to calculate new_l1_size
     * (this prevents overflows during the while loop for the calculation of
     * new_l1_size) */
    if (min_size > INT_MAX / sizeof(uint64_t)) {
        return -EFBIG;
    }

    if (exact_size) {
        new_l1_size = min_size;
    } else {
        /* Bump size up to reduce the number of times we have to grow */
        new_l1_size = s->l1_size;
        if (new_l1_size == 0) {
            /* 0 * 3 / 2 would stay 0 forever in the loop below */
            new_l1_size = 1;
        }
        while (min_size > new_l1_size) {
            new_l1_size = DIV_ROUND_UP(new_l1_size * 3, 2);
        }
    }

    QEMU_BUILD_BUG_ON(QCOW_MAX_L1_SIZE > INT_MAX);
    if (new_l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n",
            s->l1_size, new_l1_size);
#endif

    /* size of the new table in bytes */
    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
    new_l1_table = qemu_try_blockalign(bs->file->bs,
                                       ROUND_UP(new_l1_size2, 512));
    if (new_l1_table == NULL) {
        return -ENOMEM;
    }
    memset(new_l1_table, 0, ROUND_UP(new_l1_size2, 512));

    if (s->l1_size) {
        memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
    }

    /* write new table (align to cluster) */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
    if (new_l1_table_offset < 0) {
        /* nothing was allocated in the image, so only the buffer is freed */
        qemu_vfree(new_l1_table);
        return new_l1_table_offset;
    }

    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* the L1 position has not yet been updated, so these clusters must
     * indeed be completely free */
    ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset,
                                        new_l1_size2);
    if (ret < 0) {
        goto fail;
    }

    /* Convert the copied entries to big endian for the write, then convert
     * them back so that the buffer can become the in-memory table. Entries
     * past the old size are zero and need no conversion. */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
    ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset,
                           new_l1_table, new_l1_size2);
    if (ret < 0)
        goto fail;
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);

    /* set new table */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
    stl_be_p(data, new_l1_size);
    stq_be_p(data + 4, new_l1_table_offset);
    /* NOTE(review): a single 12-byte write starting at the l1_size field;
     * presumably the L1 table offset field follows it directly in
     * QCowHeader -- confirm against the header definition */
    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size),
                           data, sizeof(data));
    if (ret < 0) {
        goto fail;
    }
    /* switch the in-memory state over, then release the old table clusters */
    qemu_vfree(s->l1_table);
    old_l1_table_offset = s->l1_table_offset;
    s->l1_table_offset = new_l1_table_offset;
    s->l1_table = new_l1_table;
    old_l1_size = s->l1_size;
    s->l1_size = new_l1_size;
    qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * sizeof(uint64_t),
                        QCOW2_DISCARD_OTHER);
    return 0;
fail:
    /* undo both the buffer allocation and the cluster allocation */
    qemu_vfree(new_l1_table);
    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
                        QCOW2_DISCARD_OTHER);
    return ret;
}

/*
 * l2_load
 *
 * @bs: The BlockDriverState
 * @offset: A guest offset, used to calculate what slice of the L2
 *          table to load.
 * @l2_offset: Offset to the L2 table in the image file.
 * @l2_slice: Location to store the pointer to the L2 slice.
 *
 * Loads a L2 slice into memory (L2 slices are the parts of L2 tables
 * that are loaded by the qcow2 cache). If the slice is in the cache,
 * the cache is used; otherwise the L2 slice is loaded from the image
 * file.
 *
 * Returns the value returned by qcow2_cache_get().
 */
static int l2_load(BlockDriverState *bs, uint64_t offset,
                   uint64_t l2_offset, uint64_t **l2_slice)
{
    BDRVQcow2State *s = bs->opaque;
    /* difference between the two index helpers, scaled by the entry size:
     * the byte position inside the L2 table that is passed to the cache */
    int start_of_slice = sizeof(uint64_t) *
        (offset_to_l2_index(s, offset) - offset_to_l2_slice_index(s, offset));

    return qcow2_cache_get(bs, s->l2_table_cache, l2_offset + start_of_slice,
                           (void **)l2_slice);
}

/*
 * Writes one sector of the L1 table to the disk (can't update single entries
 * and we really don't want the write to be turned into a read-modify-write;
 * the original wording named bdrv_pread here, presumably the write path was
 * meant).
 *
 * @bs: The BlockDriverState
 * @l1_index: Index of the L1 entry whose containing sector is written.
 *
 * The sector is rebuilt from the in-memory L1 table; slots at or beyond
 * s->l1_size are written as zeros.
 *
 * Returns 0 on success, or the negative value returned by the overlap check
 * or by the write.
 */
#define L1_ENTRIES_PER_SECTOR (512 / 8)
int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t buf[L1_ENTRIES_PER_SECTOR] = { 0 };
    int l1_start_index;
    int i, ret;

    /* round down to the first entry of the 512-byte sector */
    l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1);
    for (i = 0; i < L1_ENTRIES_PER_SECTOR && l1_start_index + i < s->l1_size;
         i++)
    {
        buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]);
    }

    /* the active L1 table is excluded from the check because it is the
     * very structure being overwritten */
    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1,
            s->l1_table_offset + 8 * l1_start_index, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
    ret = bdrv_pwrite_sync(bs->file,
                           s->l1_table_offset + 8 * l1_start_index,
                           buf, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    return 0;
}

/*
 * l2_allocate
 *
 * Allocate a new l2 entry in the file. If l1_index points to an already
 * used entry in the L2 table (i.e. we are doing a copy on write for the L2
 * table) copy the contents of the old L2 table into the newly allocated one.
 * Otherwise the new table is initialized with zeros.
 *
 * @bs: The BlockDriverState
 * @l1_index: Index of the L1 entry that will point to the new L2 table.
 *
 * The new table is filled slice by slice through the L2 table cache, the
 * cache is flushed, and only then is the L1 entry updated (with
 * QCOW_OFLAG_COPIED set) and written to disk.
 *
 * Returns 0 on success. On failure the previous L1 entry is restored in
 * memory, the newly allocated clusters are freed (if any were obtained) and
 * a negative error value is returned.
 */

static int l2_allocate(BlockDriverState *bs, int l1_index)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t old_l2_offset;
    uint64_t *l2_slice = NULL;
    unsigned slice, slice_size2, n_slices;
    int64_t l2_offset;
    int ret;

    /* remembered (flags included) so that the fail path can restore it */
    old_l2_offset = s->l1_table[l1_index];

    trace_qcow2_l2_allocate(bs, l1_index);

    /* allocate a new l2 entry */

    l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
    if (l2_offset < 0) {
        ret = l2_offset;
        goto fail;
    }

    /* If we're allocating the table at offset 0 then something is wrong */
    if (l2_offset == 0) {
        qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                "allocation of L2 table at offset 0");
        ret = -EIO;
        goto fail;
    }

    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* allocate a new entry in the l2 cache */

    /* slice size in bytes, and number of slices making up one L2 cluster */
    slice_size2 = s->l2_slice_size * sizeof(uint64_t);
    n_slices = s->cluster_size / slice_size2;

    trace_qcow2_l2_allocate_get_empty(bs, l1_index);
    for (slice = 0; slice < n_slices; slice++) {
        ret = qcow2_cache_get_empty(bs, s->l2_table_cache,
                                    l2_offset + slice * slice_size2,
                                    (void **) &l2_slice);
        if (ret < 0) {
            goto fail;
        }

        if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
            /* if there was no old l2 table, clear the new slice */
            memset(l2_slice, 0, slice_size2);
        } else {
            uint64_t *old_slice;
            uint64_t old_l2_slice_offset =
                (old_l2_offset & L1E_OFFSET_MASK) + slice * slice_size2;

            /* if there was an old l2 table, read a slice from the disk */
            BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
            ret = qcow2_cache_get(bs, s->l2_table_cache, old_l2_slice_offset,
                                  (void **) &old_slice);
            if (ret < 0) {
                /* l2_slice is still held here; the fail path releases it */
                goto fail;
            }

            memcpy(l2_slice, old_slice, slice_size2);

            qcow2_cache_put(s->l2_table_cache, (void **) &old_slice);
        }

        /* write the l2 slice to the file */
        BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);

        trace_qcow2_l2_allocate_write_l2(bs, l1_index);
        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
        /* NOTE(review): the fail path tests l2_slice against NULL, so
         * qcow2_cache_put() presumably resets the pointer -- confirm */
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    }

    /* the new table must be flushed before the L1 entry may reference it */
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        goto fail;
    }

    /* update the L1 entry */
    trace_qcow2_l2_allocate_write_l1(bs, l1_index);
    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
    ret = qcow2_write_l1_entry(bs, l1_index);
    if (ret < 0) {
        goto fail;
    }

    trace_qcow2_l2_allocate_done(bs, l1_index, 0);
    return 0;

fail:
    trace_qcow2_l2_allocate_done(bs, l1_index, ret);
    if (l2_slice != NULL) {
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    }
    s->l1_table[l1_index] = old_l2_offset;
    /* l2_offset is negative when the allocation failed and 0 in the
     * corruption case above; neither is freed */
    if (l2_offset > 0) {
        qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
                            QCOW2_DISCARD_ALWAYS);
    }
    return ret;
}

/*
 * Checks how many clusters in a given L2 slice are contiguous in the image
 * file. As soon as one of the flags in the bitmask stop_flags changes compared
 * to the first cluster, the search is stopped and the cluster is not counted
 * as contiguous.
(This allows it, for example, to stop at the first compressed 3782bfcc4a0SKevin Wolf * cluster which may require a different handling) 3792bfcc4a0SKevin Wolf */ 380b6d36defSMax Reitz static int count_contiguous_clusters(int nb_clusters, int cluster_size, 38113f893c4SAlberto Garcia uint64_t *l2_slice, uint64_t stop_flags) 38245aba42fSKevin Wolf { 38345aba42fSKevin Wolf int i; 3843ef95218SEric Blake QCow2ClusterType first_cluster_type; 38578a52ad5SPeter Lieven uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED; 38613f893c4SAlberto Garcia uint64_t first_entry = be64_to_cpu(l2_slice[0]); 38715684a47SMax Reitz uint64_t offset = first_entry & mask; 38845aba42fSKevin Wolf 389564a6b69SMax Reitz if (!offset) { 39045aba42fSKevin Wolf return 0; 391564a6b69SMax Reitz } 39245aba42fSKevin Wolf 393564a6b69SMax Reitz /* must be allocated */ 394564a6b69SMax Reitz first_cluster_type = qcow2_get_cluster_type(first_entry); 395564a6b69SMax Reitz assert(first_cluster_type == QCOW2_CLUSTER_NORMAL || 396fdfab37dSEric Blake first_cluster_type == QCOW2_CLUSTER_ZERO_ALLOC); 39715684a47SMax Reitz 39861653008SKevin Wolf for (i = 0; i < nb_clusters; i++) { 39913f893c4SAlberto Garcia uint64_t l2_entry = be64_to_cpu(l2_slice[i]) & mask; 4002bfcc4a0SKevin Wolf if (offset + (uint64_t) i * cluster_size != l2_entry) { 40145aba42fSKevin Wolf break; 4022bfcc4a0SKevin Wolf } 4032bfcc4a0SKevin Wolf } 40445aba42fSKevin Wolf 40561653008SKevin Wolf return i; 40645aba42fSKevin Wolf } 40745aba42fSKevin Wolf 4084341df8aSEric Blake /* 4094341df8aSEric Blake * Checks how many consecutive unallocated clusters in a given L2 410c26f10baSAlberto Garcia * slice have the same cluster type. 
4114341df8aSEric Blake */ 4124341df8aSEric Blake static int count_contiguous_clusters_unallocated(int nb_clusters, 413c26f10baSAlberto Garcia uint64_t *l2_slice, 4143ef95218SEric Blake QCow2ClusterType wanted_type) 41545aba42fSKevin Wolf { 4162bfcc4a0SKevin Wolf int i; 41745aba42fSKevin Wolf 418fdfab37dSEric Blake assert(wanted_type == QCOW2_CLUSTER_ZERO_PLAIN || 4194341df8aSEric Blake wanted_type == QCOW2_CLUSTER_UNALLOCATED); 4202bfcc4a0SKevin Wolf for (i = 0; i < nb_clusters; i++) { 421c26f10baSAlberto Garcia uint64_t entry = be64_to_cpu(l2_slice[i]); 4223ef95218SEric Blake QCow2ClusterType type = qcow2_get_cluster_type(entry); 4232bfcc4a0SKevin Wolf 424fdfab37dSEric Blake if (type != wanted_type) { 4252bfcc4a0SKevin Wolf break; 4262bfcc4a0SKevin Wolf } 4272bfcc4a0SKevin Wolf } 42845aba42fSKevin Wolf 42945aba42fSKevin Wolf return i; 43045aba42fSKevin Wolf } 43145aba42fSKevin Wolf 432672f0f2cSAlberto Garcia static int coroutine_fn do_perform_cow_read(BlockDriverState *bs, 433aaa4d20bSKevin Wolf uint64_t src_cluster_offset, 434e034f5bcSAlberto Garcia unsigned offset_in_cluster, 43586b862c4SAlberto Garcia QEMUIOVector *qiov) 43645aba42fSKevin Wolf { 437aaa4d20bSKevin Wolf int ret; 4381b9f1491SKevin Wolf 43986b862c4SAlberto Garcia if (qiov->size == 0) { 44099450c6fSAlberto Garcia return 0; 44199450c6fSAlberto Garcia } 44299450c6fSAlberto Garcia 44366f82ceeSKevin Wolf BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); 444aef4acb6SStefan Hajnoczi 445dba28555SMax Reitz if (!bs->drv) { 446672f0f2cSAlberto Garcia return -ENOMEDIUM; 447dba28555SMax Reitz } 448dba28555SMax Reitz 449aef4acb6SStefan Hajnoczi /* Call .bdrv_co_readv() directly instead of using the public block-layer 450aef4acb6SStefan Hajnoczi * interface. This avoids double I/O throttling and request tracking, 451aef4acb6SStefan Hajnoczi * which can lead to deadlock when block layer copy-on-read is enabled. 
452aef4acb6SStefan Hajnoczi */ 453aaa4d20bSKevin Wolf ret = bs->drv->bdrv_co_preadv(bs, src_cluster_offset + offset_in_cluster, 45486b862c4SAlberto Garcia qiov->size, qiov, 0); 4551b9f1491SKevin Wolf if (ret < 0) { 456672f0f2cSAlberto Garcia return ret; 4571b9f1491SKevin Wolf } 4581b9f1491SKevin Wolf 459672f0f2cSAlberto Garcia return 0; 460672f0f2cSAlberto Garcia } 461672f0f2cSAlberto Garcia 462672f0f2cSAlberto Garcia static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs, 463672f0f2cSAlberto Garcia uint64_t src_cluster_offset, 4644652b8f3SDaniel P. Berrange uint64_t cluster_offset, 465672f0f2cSAlberto Garcia unsigned offset_in_cluster, 466672f0f2cSAlberto Garcia uint8_t *buffer, 467672f0f2cSAlberto Garcia unsigned bytes) 468672f0f2cSAlberto Garcia { 469672f0f2cSAlberto Garcia if (bytes && bs->encrypted) { 470672f0f2cSAlberto Garcia BDRVQcow2State *s = bs->opaque; 4714609742aSDaniel P. Berrange int64_t offset = (s->crypt_physical_offset ? 4724652b8f3SDaniel P. Berrange (cluster_offset + offset_in_cluster) : 4734609742aSDaniel P. Berrange (src_cluster_offset + offset_in_cluster)); 474aaa4d20bSKevin Wolf assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0); 475aaa4d20bSKevin Wolf assert((bytes & ~BDRV_SECTOR_MASK) == 0); 476b25b387fSDaniel P. Berrange assert(s->crypto); 4774609742aSDaniel P. Berrange if (qcrypto_block_encrypt(s->crypto, offset, buffer, bytes, NULL) < 0) { 478672f0f2cSAlberto Garcia return false; 479f6fa64f6SDaniel P. 
Berrange } 48045aba42fSKevin Wolf } 481672f0f2cSAlberto Garcia return true; 482672f0f2cSAlberto Garcia } 483672f0f2cSAlberto Garcia 484672f0f2cSAlberto Garcia static int coroutine_fn do_perform_cow_write(BlockDriverState *bs, 485672f0f2cSAlberto Garcia uint64_t cluster_offset, 486672f0f2cSAlberto Garcia unsigned offset_in_cluster, 48786b862c4SAlberto Garcia QEMUIOVector *qiov) 488672f0f2cSAlberto Garcia { 489672f0f2cSAlberto Garcia int ret; 490672f0f2cSAlberto Garcia 49186b862c4SAlberto Garcia if (qiov->size == 0) { 492672f0f2cSAlberto Garcia return 0; 493672f0f2cSAlberto Garcia } 494672f0f2cSAlberto Garcia 495231bb267SMax Reitz ret = qcow2_pre_write_overlap_check(bs, 0, 49686b862c4SAlberto Garcia cluster_offset + offset_in_cluster, qiov->size); 497cf93980eSMax Reitz if (ret < 0) { 498672f0f2cSAlberto Garcia return ret; 499cf93980eSMax Reitz } 500cf93980eSMax Reitz 50166f82ceeSKevin Wolf BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); 502a03ef88fSKevin Wolf ret = bdrv_co_pwritev(bs->file, cluster_offset + offset_in_cluster, 50386b862c4SAlberto Garcia qiov->size, qiov, 0); 5041b9f1491SKevin Wolf if (ret < 0) { 505672f0f2cSAlberto Garcia return ret; 5061b9f1491SKevin Wolf } 5071b9f1491SKevin Wolf 508672f0f2cSAlberto Garcia return 0; 50945aba42fSKevin Wolf } 51045aba42fSKevin Wolf 51145aba42fSKevin Wolf 51245aba42fSKevin Wolf /* 51345aba42fSKevin Wolf * get_cluster_offset 51445aba42fSKevin Wolf * 515ecfe1863SKevin Wolf * For a given offset of the virtual disk, find the cluster type and offset in 516ecfe1863SKevin Wolf * the qcow2 file. The offset is stored in *cluster_offset. 51745aba42fSKevin Wolf * 518ecfe1863SKevin Wolf * On entry, *bytes is the maximum number of contiguous bytes starting at 519ecfe1863SKevin Wolf * offset that we are interested in. 
 *
 * On exit, *bytes is the number of bytes starting at offset that have the same
 * cluster type and (if applicable) are stored contiguously in the image file.
 * Compressed clusters are always returned one by one.
 *
 * The lookup never crosses the end of the L2 slice that contains the entry
 * for offset, so *bytes is also limited by that boundary.
 *
 * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error
 * cases.
 */
int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
                             unsigned int *bytes, uint64_t *cluster_offset)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int l2_index;
    uint64_t l1_index, l2_offset, *l2_slice;
    int c;
    unsigned int offset_in_cluster;
    uint64_t bytes_available, bytes_needed, nb_clusters;
    QCow2ClusterType type;
    int ret;

    /* all byte counts below are measured from the start of the cluster that
     * contains offset, hence the added offset_in_cluster */
    offset_in_cluster = offset_into_cluster(s, offset);
    bytes_needed = (uint64_t) *bytes + offset_in_cluster;

    /* compute how many bytes there are between the start of the cluster
     * containing offset and the end of the l2 slice that contains
     * the entry pointing to it */
    bytes_available =
        ((uint64_t) (s->l2_slice_size - offset_to_l2_slice_index(s, offset)))
        << s->cluster_bits;

    if (bytes_needed > bytes_available) {
        bytes_needed = bytes_available;
    }

    *cluster_offset = 0;

    /* seek to the l2 offset in the l1 table */

    l1_index = offset_to_l1_index(s, offset);
    if (l1_index >= s->l1_size) {
        /* beyond the L1 table: reported as unallocated up to the slice end */
        type = QCOW2_CLUSTER_UNALLOCATED;
        goto out;
    }

    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
    if (!l2_offset) {
        /* no L2 table behind this L1 entry */
        type = QCOW2_CLUSTER_UNALLOCATED;
        goto out;
    }

    if (offset_into_cluster(s, l2_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
                                " unaligned (L1 index: %#" PRIx64 ")",
                                l2_offset, l1_index);
        /* no slice has been loaded yet, so there is nothing to release */
        return -EIO;
    }

    /* load the l2 slice in memory */

    ret = l2_load(bs, offset, l2_offset, &l2_slice);
    if (ret < 0) {
        return ret;
    }

    /* find the cluster offset for the given disk offset */

    l2_index = offset_to_l2_slice_index(s, offset);
    *cluster_offset = be64_to_cpu(l2_slice[l2_index]);

    nb_clusters = size_to_clusters(s, bytes_needed);
    /* bytes_needed <= *bytes + offset_in_cluster, both of which are unsigned
     * integers; the minimum cluster size is 512, so this assertion is always
     * true */
    assert(nb_clusters <= INT_MAX);

    type = qcow2_get_cluster_type(*cluster_offset);
    if (s->qcow_version < 3 && (type == QCOW2_CLUSTER_ZERO_PLAIN ||
                                type == QCOW2_CLUSTER_ZERO_ALLOC)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found"
                                " in pre-v3 image (L2 offset: %#" PRIx64
                                ", L2 index: %#x)", l2_offset, l2_index);
        ret = -EIO;
        goto fail;
    }
    /* c receives the number of clusters covered by the answer */
    switch (type) {
    case QCOW2_CLUSTER_COMPRESSED:
        /* Compressed clusters can only be processed one by one */
        c = 1;
        *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK;
        break;
    case QCOW2_CLUSTER_ZERO_PLAIN:
    case QCOW2_CLUSTER_UNALLOCATED:
        /* how many empty clusters ? */
        c = count_contiguous_clusters_unallocated(nb_clusters,
                                                  &l2_slice[l2_index], type);
        *cluster_offset = 0;
        break;
    case QCOW2_CLUSTER_ZERO_ALLOC:
    case QCOW2_CLUSTER_NORMAL:
        /* how many allocated clusters ? */
        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
                                      &l2_slice[l2_index], QCOW_OFLAG_ZERO);
        *cluster_offset &= L2E_OFFSET_MASK;
        if (offset_into_cluster(s, *cluster_offset)) {
            qcow2_signal_corruption(bs, true, -1, -1,
                                    "Cluster allocation offset %#"
                                    PRIx64 " unaligned (L2 offset: %#" PRIx64
                                    ", L2 index: %#x)", *cluster_offset,
                                    l2_offset, l2_index);
            ret = -EIO;
            goto fail;
        }
        break;
    default:
        abort();
    }

    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

    bytes_available = (int64_t)c * s->cluster_size;

out:
    /* reached directly from the two "unallocated" shortcuts above, in which
     * case bytes_available still holds the distance to the slice end */
    if (bytes_available > bytes_needed) {
        bytes_available = bytes_needed;
    }

    /* bytes_available <= bytes_needed <= *bytes + offset_in_cluster;
     * subtracting offset_in_cluster will therefore definitely yield something
     * not exceeding UINT_MAX */
    assert(bytes_available - offset_in_cluster <= UINT_MAX);
    *bytes = bytes_available - offset_in_cluster;

    return type;

fail:
    /* error after the slice was loaded: release it before returning */
    qcow2_cache_put(s->l2_table_cache, (void **)&l2_slice);
    return ret;
}

/*
 * get_cluster_table
 *
 * for a given disk offset, load
(and allocate if needed) 663c03bfc5bSAlberto Garcia * the appropriate slice of its l2 table. 66445aba42fSKevin Wolf * 665c03bfc5bSAlberto Garcia * the cluster index in the l2 slice is given to the caller. 66645aba42fSKevin Wolf * 6671e3e8f1aSKevin Wolf * Returns 0 on success, -errno in failure case 66845aba42fSKevin Wolf */ 66945aba42fSKevin Wolf static int get_cluster_table(BlockDriverState *bs, uint64_t offset, 670c03bfc5bSAlberto Garcia uint64_t **new_l2_slice, 67145aba42fSKevin Wolf int *new_l2_index) 67245aba42fSKevin Wolf { 673ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 6742cf7cfa1SKevin Wolf unsigned int l2_index; 6752cf7cfa1SKevin Wolf uint64_t l1_index, l2_offset; 676c03bfc5bSAlberto Garcia uint64_t *l2_slice = NULL; 67780ee15a6SKevin Wolf int ret; 67845aba42fSKevin Wolf 679b6af0975SDaniel P. Berrange /* seek to the l2 offset in the l1 table */ 68045aba42fSKevin Wolf 68105b5b6eeSAlberto Garcia l1_index = offset_to_l1_index(s, offset); 68245aba42fSKevin Wolf if (l1_index >= s->l1_size) { 68372893756SStefan Hajnoczi ret = qcow2_grow_l1_table(bs, l1_index + 1, false); 6841e3e8f1aSKevin Wolf if (ret < 0) { 6851e3e8f1aSKevin Wolf return ret; 6861e3e8f1aSKevin Wolf } 68745aba42fSKevin Wolf } 6888e37f681SKevin Wolf 6892cf7cfa1SKevin Wolf assert(l1_index < s->l1_size); 6908e37f681SKevin Wolf l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; 691a97c67eeSMax Reitz if (offset_into_cluster(s, l2_offset)) { 692a97c67eeSMax Reitz qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64 693a97c67eeSMax Reitz " unaligned (L1 index: %#" PRIx64 ")", 694a97c67eeSMax Reitz l2_offset, l1_index); 695a97c67eeSMax Reitz return -EIO; 696a97c67eeSMax Reitz } 69745aba42fSKevin Wolf 69805f9ee46SAlberto Garcia if (!(s->l1_table[l1_index] & QCOW_OFLAG_COPIED)) { 69916fde5f2SKevin Wolf /* First allocate a new L2 table (and do COW if needed) */ 7003861946aSAlberto Garcia ret = l2_allocate(bs, l1_index); 701c46e1167SKevin Wolf if (ret < 0) { 702c46e1167SKevin Wolf 
return ret; 7031e3e8f1aSKevin Wolf } 70416fde5f2SKevin Wolf 70516fde5f2SKevin Wolf /* Then decrease the refcount of the old table */ 70616fde5f2SKevin Wolf if (l2_offset) { 7076cfcb9b8SKevin Wolf qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t), 7086cfcb9b8SKevin Wolf QCOW2_DISCARD_OTHER); 70916fde5f2SKevin Wolf } 7103861946aSAlberto Garcia 7113861946aSAlberto Garcia /* Get the offset of the newly-allocated l2 table */ 7123861946aSAlberto Garcia l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; 7133861946aSAlberto Garcia assert(offset_into_cluster(s, l2_offset) == 0); 71405f9ee46SAlberto Garcia } 71505f9ee46SAlberto Garcia 716c03bfc5bSAlberto Garcia /* load the l2 slice in memory */ 717c03bfc5bSAlberto Garcia ret = l2_load(bs, offset, l2_offset, &l2_slice); 7183861946aSAlberto Garcia if (ret < 0) { 7193861946aSAlberto Garcia return ret; 7203861946aSAlberto Garcia } 72145aba42fSKevin Wolf 72245aba42fSKevin Wolf /* find the cluster offset for the given disk offset */ 72345aba42fSKevin Wolf 724c03bfc5bSAlberto Garcia l2_index = offset_to_l2_slice_index(s, offset); 72545aba42fSKevin Wolf 726c03bfc5bSAlberto Garcia *new_l2_slice = l2_slice; 72745aba42fSKevin Wolf *new_l2_index = l2_index; 72845aba42fSKevin Wolf 7291e3e8f1aSKevin Wolf return 0; 73045aba42fSKevin Wolf } 73145aba42fSKevin Wolf 73245aba42fSKevin Wolf /* 73345aba42fSKevin Wolf * alloc_compressed_cluster_offset 73445aba42fSKevin Wolf * 73545aba42fSKevin Wolf * For a given offset of the disk image, return cluster offset in 73645aba42fSKevin Wolf * qcow2 file. 73745aba42fSKevin Wolf * 73845aba42fSKevin Wolf * If the offset is not found, allocate a new compressed cluster. 73945aba42fSKevin Wolf * 74045aba42fSKevin Wolf * Return the cluster offset if successful, 74145aba42fSKevin Wolf * Return 0, otherwise. 
74245aba42fSKevin Wolf * 74345aba42fSKevin Wolf */ 74445aba42fSKevin Wolf 745ed6ccf0fSKevin Wolf uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, 74645aba42fSKevin Wolf uint64_t offset, 74745aba42fSKevin Wolf int compressed_size) 74845aba42fSKevin Wolf { 749ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 75045aba42fSKevin Wolf int l2_index, ret; 751e4e72548SAlberto Garcia uint64_t *l2_slice; 752f4f0d391SKevin Wolf int64_t cluster_offset; 75345aba42fSKevin Wolf int nb_csectors; 75445aba42fSKevin Wolf 755e4e72548SAlberto Garcia ret = get_cluster_table(bs, offset, &l2_slice, &l2_index); 7561e3e8f1aSKevin Wolf if (ret < 0) { 75745aba42fSKevin Wolf return 0; 7581e3e8f1aSKevin Wolf } 75945aba42fSKevin Wolf 760b0b6862eSKevin Wolf /* Compression can't overwrite anything. Fail if the cluster was already 761b0b6862eSKevin Wolf * allocated. */ 762e4e72548SAlberto Garcia cluster_offset = be64_to_cpu(l2_slice[l2_index]); 763b0b6862eSKevin Wolf if (cluster_offset & L2E_OFFSET_MASK) { 764e4e72548SAlberto Garcia qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); 7658f1efd00SKevin Wolf return 0; 7668f1efd00SKevin Wolf } 76745aba42fSKevin Wolf 768ed6ccf0fSKevin Wolf cluster_offset = qcow2_alloc_bytes(bs, compressed_size); 7695d757b56SKevin Wolf if (cluster_offset < 0) { 770e4e72548SAlberto Garcia qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); 7715d757b56SKevin Wolf return 0; 7725d757b56SKevin Wolf } 7735d757b56SKevin Wolf 77445aba42fSKevin Wolf nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) - 77545aba42fSKevin Wolf (cluster_offset >> 9); 77645aba42fSKevin Wolf 77745aba42fSKevin Wolf cluster_offset |= QCOW_OFLAG_COMPRESSED | 77845aba42fSKevin Wolf ((uint64_t)nb_csectors << s->csize_shift); 77945aba42fSKevin Wolf 78045aba42fSKevin Wolf /* update L2 table */ 78145aba42fSKevin Wolf 78245aba42fSKevin Wolf /* compressed clusters never have the copied flag */ 78345aba42fSKevin Wolf 78466f82ceeSKevin Wolf BLKDBG_EVENT(bs->file, 
BLKDBG_L2_UPDATE_COMPRESSED); 785e4e72548SAlberto Garcia qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice); 786e4e72548SAlberto Garcia l2_slice[l2_index] = cpu_to_be64(cluster_offset); 787e4e72548SAlberto Garcia qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); 78845aba42fSKevin Wolf 78945aba42fSKevin Wolf return cluster_offset; 79045aba42fSKevin Wolf } 79145aba42fSKevin Wolf 79299450c6fSAlberto Garcia static int perform_cow(BlockDriverState *bs, QCowL2Meta *m) 793593fb83cSKevin Wolf { 794ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 79599450c6fSAlberto Garcia Qcow2COWRegion *start = &m->cow_start; 79699450c6fSAlberto Garcia Qcow2COWRegion *end = &m->cow_end; 797672f0f2cSAlberto Garcia unsigned buffer_size; 798b3cf1c7cSAlberto Garcia unsigned data_bytes = end->offset - (start->offset + start->nb_bytes); 799b3cf1c7cSAlberto Garcia bool merge_reads; 800672f0f2cSAlberto Garcia uint8_t *start_buffer, *end_buffer; 80186b862c4SAlberto Garcia QEMUIOVector qiov; 802593fb83cSKevin Wolf int ret; 803593fb83cSKevin Wolf 804672f0f2cSAlberto Garcia assert(start->nb_bytes <= UINT_MAX - end->nb_bytes); 805b3cf1c7cSAlberto Garcia assert(start->nb_bytes + end->nb_bytes <= UINT_MAX - data_bytes); 806b3cf1c7cSAlberto Garcia assert(start->offset + start->nb_bytes <= end->offset); 807ee22a9d8SAlberto Garcia assert(!m->data_qiov || m->data_qiov->size == data_bytes); 808672f0f2cSAlberto Garcia 80999450c6fSAlberto Garcia if (start->nb_bytes == 0 && end->nb_bytes == 0) { 810593fb83cSKevin Wolf return 0; 811593fb83cSKevin Wolf } 812593fb83cSKevin Wolf 813b3cf1c7cSAlberto Garcia /* If we have to read both the start and end COW regions and the 814b3cf1c7cSAlberto Garcia * middle region is not too large then perform just one read 815b3cf1c7cSAlberto Garcia * operation */ 816b3cf1c7cSAlberto Garcia merge_reads = start->nb_bytes && end->nb_bytes && data_bytes <= 16384; 817b3cf1c7cSAlberto Garcia if (merge_reads) { 818b3cf1c7cSAlberto Garcia buffer_size = start->nb_bytes + 
data_bytes + end->nb_bytes; 819b3cf1c7cSAlberto Garcia } else { 820b3cf1c7cSAlberto Garcia /* If we have to do two reads, add some padding in the middle 821b3cf1c7cSAlberto Garcia * if necessary to make sure that the end region is optimally 822b3cf1c7cSAlberto Garcia * aligned. */ 823b3cf1c7cSAlberto Garcia size_t align = bdrv_opt_mem_align(bs); 824b3cf1c7cSAlberto Garcia assert(align > 0 && align <= UINT_MAX); 825b3cf1c7cSAlberto Garcia assert(QEMU_ALIGN_UP(start->nb_bytes, align) <= 826b3cf1c7cSAlberto Garcia UINT_MAX - end->nb_bytes); 827b3cf1c7cSAlberto Garcia buffer_size = QEMU_ALIGN_UP(start->nb_bytes, align) + end->nb_bytes; 828b3cf1c7cSAlberto Garcia } 829b3cf1c7cSAlberto Garcia 830b3cf1c7cSAlberto Garcia /* Reserve a buffer large enough to store all the data that we're 831b3cf1c7cSAlberto Garcia * going to read */ 832672f0f2cSAlberto Garcia start_buffer = qemu_try_blockalign(bs, buffer_size); 833672f0f2cSAlberto Garcia if (start_buffer == NULL) { 834672f0f2cSAlberto Garcia return -ENOMEM; 835672f0f2cSAlberto Garcia } 836672f0f2cSAlberto Garcia /* The part of the buffer where the end region is located */ 837672f0f2cSAlberto Garcia end_buffer = start_buffer + buffer_size - end->nb_bytes; 838672f0f2cSAlberto Garcia 839ee22a9d8SAlberto Garcia qemu_iovec_init(&qiov, 2 + (m->data_qiov ? m->data_qiov->niov : 0)); 84086b862c4SAlberto Garcia 841593fb83cSKevin Wolf qemu_co_mutex_unlock(&s->lock); 842b3cf1c7cSAlberto Garcia /* First we read the existing data from both COW regions. We 843b3cf1c7cSAlberto Garcia * either read the whole region in one go, or the start and end 844b3cf1c7cSAlberto Garcia * regions separately. 
*/ 845b3cf1c7cSAlberto Garcia if (merge_reads) { 84686b862c4SAlberto Garcia qemu_iovec_add(&qiov, start_buffer, buffer_size); 84786b862c4SAlberto Garcia ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov); 848b3cf1c7cSAlberto Garcia } else { 84986b862c4SAlberto Garcia qemu_iovec_add(&qiov, start_buffer, start->nb_bytes); 85086b862c4SAlberto Garcia ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov); 851593fb83cSKevin Wolf if (ret < 0) { 85299450c6fSAlberto Garcia goto fail; 853593fb83cSKevin Wolf } 854593fb83cSKevin Wolf 85586b862c4SAlberto Garcia qemu_iovec_reset(&qiov); 85686b862c4SAlberto Garcia qemu_iovec_add(&qiov, end_buffer, end->nb_bytes); 85786b862c4SAlberto Garcia ret = do_perform_cow_read(bs, m->offset, end->offset, &qiov); 858b3cf1c7cSAlberto Garcia } 859672f0f2cSAlberto Garcia if (ret < 0) { 860672f0f2cSAlberto Garcia goto fail; 861672f0f2cSAlberto Garcia } 86299450c6fSAlberto Garcia 863672f0f2cSAlberto Garcia /* Encrypt the data if necessary before writing it */ 864672f0f2cSAlberto Garcia if (bs->encrypted) { 8654652b8f3SDaniel P. Berrange if (!do_perform_cow_encrypt(bs, m->offset, m->alloc_offset, 8664652b8f3SDaniel P. Berrange start->offset, start_buffer, 8674652b8f3SDaniel P. Berrange start->nb_bytes) || 8684652b8f3SDaniel P. Berrange !do_perform_cow_encrypt(bs, m->offset, m->alloc_offset, 8694652b8f3SDaniel P. Berrange end->offset, end_buffer, end->nb_bytes)) { 870672f0f2cSAlberto Garcia ret = -EIO; 871672f0f2cSAlberto Garcia goto fail; 872672f0f2cSAlberto Garcia } 873672f0f2cSAlberto Garcia } 874672f0f2cSAlberto Garcia 875ee22a9d8SAlberto Garcia /* And now we can write everything. 
If we have the guest data we 876ee22a9d8SAlberto Garcia * can write everything in one single operation */ 877ee22a9d8SAlberto Garcia if (m->data_qiov) { 878ee22a9d8SAlberto Garcia qemu_iovec_reset(&qiov); 879ee22a9d8SAlberto Garcia if (start->nb_bytes) { 880ee22a9d8SAlberto Garcia qemu_iovec_add(&qiov, start_buffer, start->nb_bytes); 881ee22a9d8SAlberto Garcia } 882ee22a9d8SAlberto Garcia qemu_iovec_concat(&qiov, m->data_qiov, 0, data_bytes); 883ee22a9d8SAlberto Garcia if (end->nb_bytes) { 884ee22a9d8SAlberto Garcia qemu_iovec_add(&qiov, end_buffer, end->nb_bytes); 885ee22a9d8SAlberto Garcia } 886ee22a9d8SAlberto Garcia /* NOTE: we have a write_aio blkdebug event here followed by 887ee22a9d8SAlberto Garcia * a cow_write one in do_perform_cow_write(), but there's only 888ee22a9d8SAlberto Garcia * one single I/O operation */ 889ee22a9d8SAlberto Garcia BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); 890ee22a9d8SAlberto Garcia ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov); 891ee22a9d8SAlberto Garcia } else { 892ee22a9d8SAlberto Garcia /* If there's no guest data then write both COW regions separately */ 89386b862c4SAlberto Garcia qemu_iovec_reset(&qiov); 89486b862c4SAlberto Garcia qemu_iovec_add(&qiov, start_buffer, start->nb_bytes); 89586b862c4SAlberto Garcia ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov); 896672f0f2cSAlberto Garcia if (ret < 0) { 897672f0f2cSAlberto Garcia goto fail; 898672f0f2cSAlberto Garcia } 899672f0f2cSAlberto Garcia 90086b862c4SAlberto Garcia qemu_iovec_reset(&qiov); 90186b862c4SAlberto Garcia qemu_iovec_add(&qiov, end_buffer, end->nb_bytes); 90286b862c4SAlberto Garcia ret = do_perform_cow_write(bs, m->alloc_offset, end->offset, &qiov); 903ee22a9d8SAlberto Garcia } 904ee22a9d8SAlberto Garcia 90599450c6fSAlberto Garcia fail: 90699450c6fSAlberto Garcia qemu_co_mutex_lock(&s->lock); 90799450c6fSAlberto Garcia 908593fb83cSKevin Wolf /* 909593fb83cSKevin Wolf * Before we update the L2 table to actually point 
to the new cluster, we 910593fb83cSKevin Wolf * need to be sure that the refcounts have been increased and COW was 911593fb83cSKevin Wolf * handled. 912593fb83cSKevin Wolf */ 91399450c6fSAlberto Garcia if (ret == 0) { 914593fb83cSKevin Wolf qcow2_cache_depends_on_flush(s->l2_table_cache); 91599450c6fSAlberto Garcia } 916593fb83cSKevin Wolf 917672f0f2cSAlberto Garcia qemu_vfree(start_buffer); 91886b862c4SAlberto Garcia qemu_iovec_destroy(&qiov); 91999450c6fSAlberto Garcia return ret; 920593fb83cSKevin Wolf } 921593fb83cSKevin Wolf 922148da7eaSKevin Wolf int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) 92345aba42fSKevin Wolf { 924ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 92545aba42fSKevin Wolf int i, j = 0, l2_index, ret; 926a002c0b0SAlberto Garcia uint64_t *old_cluster, *l2_slice; 927250196f1SKevin Wolf uint64_t cluster_offset = m->alloc_offset; 92845aba42fSKevin Wolf 9293cce16f4SKevin Wolf trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters); 930f50f88b9SKevin Wolf assert(m->nb_clusters > 0); 93145aba42fSKevin Wolf 9325839e53bSMarkus Armbruster old_cluster = g_try_new(uint64_t, m->nb_clusters); 933de82815dSKevin Wolf if (old_cluster == NULL) { 934de82815dSKevin Wolf ret = -ENOMEM; 935de82815dSKevin Wolf goto err; 936de82815dSKevin Wolf } 93745aba42fSKevin Wolf 93845aba42fSKevin Wolf /* copy content of unmodified sectors */ 93999450c6fSAlberto Garcia ret = perform_cow(bs, m); 940593fb83cSKevin Wolf if (ret < 0) { 94145aba42fSKevin Wolf goto err; 94245aba42fSKevin Wolf } 94345aba42fSKevin Wolf 944593fb83cSKevin Wolf /* Update L2 table. 
*/ 94574c4510aSKevin Wolf if (s->use_lazy_refcounts) { 946280d3735SKevin Wolf qcow2_mark_dirty(bs); 947280d3735SKevin Wolf } 948bfe8043eSStefan Hajnoczi if (qcow2_need_accurate_refcounts(s)) { 949bfe8043eSStefan Hajnoczi qcow2_cache_set_dependency(bs, s->l2_table_cache, 950bfe8043eSStefan Hajnoczi s->refcount_block_cache); 951bfe8043eSStefan Hajnoczi } 952280d3735SKevin Wolf 953a002c0b0SAlberto Garcia ret = get_cluster_table(bs, m->offset, &l2_slice, &l2_index); 9541e3e8f1aSKevin Wolf if (ret < 0) { 95545aba42fSKevin Wolf goto err; 9561e3e8f1aSKevin Wolf } 957a002c0b0SAlberto Garcia qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice); 95845aba42fSKevin Wolf 959a002c0b0SAlberto Garcia assert(l2_index + m->nb_clusters <= s->l2_slice_size); 96045aba42fSKevin Wolf for (i = 0; i < m->nb_clusters; i++) { 96145aba42fSKevin Wolf /* if two concurrent writes happen to the same unallocated cluster 96245aba42fSKevin Wolf * each write allocates separate cluster and writes data concurrently. 96345aba42fSKevin Wolf * The first one to complete updates l2 table with pointer to its 96445aba42fSKevin Wolf * cluster the second one has to do RMW (which is done above by 965aaa4d20bSKevin Wolf * perform_cow()), update l2 table with its cluster pointer and free 96645aba42fSKevin Wolf * old cluster. This is what this loop does */ 967a002c0b0SAlberto Garcia if (l2_slice[l2_index + i] != 0) { 968a002c0b0SAlberto Garcia old_cluster[j++] = l2_slice[l2_index + i]; 969aaa4d20bSKevin Wolf } 97045aba42fSKevin Wolf 971a002c0b0SAlberto Garcia l2_slice[l2_index + i] = cpu_to_be64((cluster_offset + 97245aba42fSKevin Wolf (i << s->cluster_bits)) | QCOW_OFLAG_COPIED); 97345aba42fSKevin Wolf } 97445aba42fSKevin Wolf 9759f8e668eSKevin Wolf 976a002c0b0SAlberto Garcia qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); 97745aba42fSKevin Wolf 9787ec5e6a4SKevin Wolf /* 9797ec5e6a4SKevin Wolf * If this was a COW, we need to decrease the refcount of the old cluster. 
9806cfcb9b8SKevin Wolf * 9816cfcb9b8SKevin Wolf * Don't discard clusters that reach a refcount of 0 (e.g. compressed 9826cfcb9b8SKevin Wolf * clusters), the next write will reuse them anyway. 9837ec5e6a4SKevin Wolf */ 984564a6b69SMax Reitz if (!m->keep_old_clusters && j != 0) { 9857ec5e6a4SKevin Wolf for (i = 0; i < j; i++) { 9866cfcb9b8SKevin Wolf qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1, 9876cfcb9b8SKevin Wolf QCOW2_DISCARD_NEVER); 9887ec5e6a4SKevin Wolf } 9897ec5e6a4SKevin Wolf } 99045aba42fSKevin Wolf 99145aba42fSKevin Wolf ret = 0; 99245aba42fSKevin Wolf err: 9937267c094SAnthony Liguori g_free(old_cluster); 99445aba42fSKevin Wolf return ret; 99545aba42fSKevin Wolf } 99645aba42fSKevin Wolf 99745aba42fSKevin Wolf /* 998bf319eceSKevin Wolf * Returns the number of contiguous clusters that can be used for an allocating 999bf319eceSKevin Wolf * write, but require COW to be performed (this includes yet unallocated space, 1000bf319eceSKevin Wolf * which must copy from the backing file) 1001bf319eceSKevin Wolf */ 1002ff99129aSKevin Wolf static int count_cow_clusters(BDRVQcow2State *s, int nb_clusters, 1003dd32c881SAlberto Garcia uint64_t *l2_slice, int l2_index) 1004bf319eceSKevin Wolf { 1005143550a8SKevin Wolf int i; 1006bf319eceSKevin Wolf 1007143550a8SKevin Wolf for (i = 0; i < nb_clusters; i++) { 1008dd32c881SAlberto Garcia uint64_t l2_entry = be64_to_cpu(l2_slice[l2_index + i]); 10093ef95218SEric Blake QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry); 1010143550a8SKevin Wolf 1011143550a8SKevin Wolf switch(cluster_type) { 1012143550a8SKevin Wolf case QCOW2_CLUSTER_NORMAL: 1013143550a8SKevin Wolf if (l2_entry & QCOW_OFLAG_COPIED) { 1014143550a8SKevin Wolf goto out; 1015143550a8SKevin Wolf } 1016bf319eceSKevin Wolf break; 1017143550a8SKevin Wolf case QCOW2_CLUSTER_UNALLOCATED: 1018143550a8SKevin Wolf case QCOW2_CLUSTER_COMPRESSED: 1019fdfab37dSEric Blake case QCOW2_CLUSTER_ZERO_PLAIN: 1020fdfab37dSEric Blake case 
QCOW2_CLUSTER_ZERO_ALLOC: 1021143550a8SKevin Wolf break; 1022143550a8SKevin Wolf default: 1023143550a8SKevin Wolf abort(); 1024143550a8SKevin Wolf } 1025bf319eceSKevin Wolf } 1026bf319eceSKevin Wolf 1027143550a8SKevin Wolf out: 1028bf319eceSKevin Wolf assert(i <= nb_clusters); 1029bf319eceSKevin Wolf return i; 1030bf319eceSKevin Wolf } 1031bf319eceSKevin Wolf 1032bf319eceSKevin Wolf /* 1033250196f1SKevin Wolf * Check if there already is an AIO write request in flight which allocates 1034250196f1SKevin Wolf * the same cluster. In this case we need to wait until the previous 1035250196f1SKevin Wolf * request has completed and updated the L2 table accordingly. 103665eb2e35SKevin Wolf * 103765eb2e35SKevin Wolf * Returns: 103865eb2e35SKevin Wolf * 0 if there was no dependency. *cur_bytes indicates the number of 103965eb2e35SKevin Wolf * bytes from guest_offset that can be read before the next 104065eb2e35SKevin Wolf * dependency must be processed (or the request is complete) 104165eb2e35SKevin Wolf * 104265eb2e35SKevin Wolf * -EAGAIN if we had to wait for another request, previously gathered 104365eb2e35SKevin Wolf * information on cluster allocation may be invalid now. The caller 104465eb2e35SKevin Wolf * must start over anyway, so consider *cur_bytes undefined. 
1045250196f1SKevin Wolf */ 1046226c3c26SKevin Wolf static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, 1047ecdd5333SKevin Wolf uint64_t *cur_bytes, QCowL2Meta **m) 1048226c3c26SKevin Wolf { 1049ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 1050226c3c26SKevin Wolf QCowL2Meta *old_alloc; 105165eb2e35SKevin Wolf uint64_t bytes = *cur_bytes; 1052226c3c26SKevin Wolf 1053250196f1SKevin Wolf QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) { 1054250196f1SKevin Wolf 105565eb2e35SKevin Wolf uint64_t start = guest_offset; 105665eb2e35SKevin Wolf uint64_t end = start + bytes; 105765eb2e35SKevin Wolf uint64_t old_start = l2meta_cow_start(old_alloc); 105865eb2e35SKevin Wolf uint64_t old_end = l2meta_cow_end(old_alloc); 1059250196f1SKevin Wolf 1060d9d74f41SKevin Wolf if (end <= old_start || start >= old_end) { 1061250196f1SKevin Wolf /* No intersection */ 1062250196f1SKevin Wolf } else { 1063250196f1SKevin Wolf if (start < old_start) { 1064250196f1SKevin Wolf /* Stop at the start of a running allocation */ 106565eb2e35SKevin Wolf bytes = old_start - start; 1066250196f1SKevin Wolf } else { 106765eb2e35SKevin Wolf bytes = 0; 1068250196f1SKevin Wolf } 1069250196f1SKevin Wolf 1070ecdd5333SKevin Wolf /* Stop if already an l2meta exists. After yielding, it wouldn't 1071ecdd5333SKevin Wolf * be valid any more, so we'd have to clean up the old L2Metas 1072ecdd5333SKevin Wolf * and deal with requests depending on them before starting to 1073ecdd5333SKevin Wolf * gather new ones. Not worth the trouble. */ 1074ecdd5333SKevin Wolf if (bytes == 0 && *m) { 1075ecdd5333SKevin Wolf *cur_bytes = 0; 1076ecdd5333SKevin Wolf return 0; 1077ecdd5333SKevin Wolf } 1078ecdd5333SKevin Wolf 107965eb2e35SKevin Wolf if (bytes == 0) { 1080250196f1SKevin Wolf /* Wait for the dependency to complete. We need to recheck 1081250196f1SKevin Wolf * the free/allocated clusters when we continue. 
*/ 10821ace7ceaSPaolo Bonzini qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock); 1083250196f1SKevin Wolf return -EAGAIN; 1084250196f1SKevin Wolf } 1085250196f1SKevin Wolf } 1086250196f1SKevin Wolf } 1087250196f1SKevin Wolf 108865eb2e35SKevin Wolf /* Make sure that existing clusters and new allocations are only used up to 108965eb2e35SKevin Wolf * the next dependency if we shortened the request above */ 109065eb2e35SKevin Wolf *cur_bytes = bytes; 1091250196f1SKevin Wolf 1092226c3c26SKevin Wolf return 0; 1093226c3c26SKevin Wolf } 1094226c3c26SKevin Wolf 1095226c3c26SKevin Wolf /* 10960af729ecSKevin Wolf * Checks how many already allocated clusters that don't require a copy on 10970af729ecSKevin Wolf * write there are at the given guest_offset (up to *bytes). If 10980af729ecSKevin Wolf * *host_offset is not zero, only physically contiguous clusters beginning at 10990af729ecSKevin Wolf * this host offset are counted. 11000af729ecSKevin Wolf * 1101411d62b0SKevin Wolf * Note that guest_offset may not be cluster aligned. In this case, the 1102411d62b0SKevin Wolf * returned *host_offset points to exact byte referenced by guest_offset and 1103411d62b0SKevin Wolf * therefore isn't cluster aligned as well. 11040af729ecSKevin Wolf * 11050af729ecSKevin Wolf * Returns: 11060af729ecSKevin Wolf * 0: if no allocated clusters are available at the given offset. 11070af729ecSKevin Wolf * *bytes is normally unchanged. It is set to 0 if the cluster 11080af729ecSKevin Wolf * is allocated and doesn't need COW, but doesn't have the right 11090af729ecSKevin Wolf * physical offset. 11100af729ecSKevin Wolf * 11110af729ecSKevin Wolf * 1: if allocated clusters that don't require a COW are available at 11120af729ecSKevin Wolf * the requested offset. *bytes may have decreased and describes 11130af729ecSKevin Wolf * the length of the area that can be written to. 
11140af729ecSKevin Wolf * 11150af729ecSKevin Wolf * -errno: in error cases 11160af729ecSKevin Wolf */ 11170af729ecSKevin Wolf static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, 1118c53ede9fSKevin Wolf uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) 11190af729ecSKevin Wolf { 1120ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 11210af729ecSKevin Wolf int l2_index; 11220af729ecSKevin Wolf uint64_t cluster_offset; 1123cde91766SAlberto Garcia uint64_t *l2_slice; 1124b6d36defSMax Reitz uint64_t nb_clusters; 1125c53ede9fSKevin Wolf unsigned int keep_clusters; 1126a3f1afb4SAlberto Garcia int ret; 11270af729ecSKevin Wolf 11280af729ecSKevin Wolf trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset, 11290af729ecSKevin Wolf *bytes); 11300af729ecSKevin Wolf 1131411d62b0SKevin Wolf assert(*host_offset == 0 || offset_into_cluster(s, guest_offset) 1132411d62b0SKevin Wolf == offset_into_cluster(s, *host_offset)); 1133411d62b0SKevin Wolf 1134acb0467fSKevin Wolf /* 1135cde91766SAlberto Garcia * Calculate the number of clusters to look for. We stop at L2 slice 1136acb0467fSKevin Wolf * boundaries to keep things simple. 
1137acb0467fSKevin Wolf */ 1138acb0467fSKevin Wolf nb_clusters = 1139acb0467fSKevin Wolf size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes); 1140acb0467fSKevin Wolf 1141cde91766SAlberto Garcia l2_index = offset_to_l2_slice_index(s, guest_offset); 1142cde91766SAlberto Garcia nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index); 1143b6d36defSMax Reitz assert(nb_clusters <= INT_MAX); 1144acb0467fSKevin Wolf 11450af729ecSKevin Wolf /* Find L2 entry for the first involved cluster */ 1146cde91766SAlberto Garcia ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index); 11470af729ecSKevin Wolf if (ret < 0) { 11480af729ecSKevin Wolf return ret; 11490af729ecSKevin Wolf } 11500af729ecSKevin Wolf 1151cde91766SAlberto Garcia cluster_offset = be64_to_cpu(l2_slice[l2_index]); 11520af729ecSKevin Wolf 11530af729ecSKevin Wolf /* Check how many clusters are already allocated and don't need COW */ 11540af729ecSKevin Wolf if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL 11550af729ecSKevin Wolf && (cluster_offset & QCOW_OFLAG_COPIED)) 11560af729ecSKevin Wolf { 1157e62daaf6SKevin Wolf /* If a specific host_offset is required, check it */ 1158e62daaf6SKevin Wolf bool offset_matches = 1159e62daaf6SKevin Wolf (cluster_offset & L2E_OFFSET_MASK) == *host_offset; 1160e62daaf6SKevin Wolf 1161a97c67eeSMax Reitz if (offset_into_cluster(s, cluster_offset & L2E_OFFSET_MASK)) { 1162a97c67eeSMax Reitz qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset " 1163a97c67eeSMax Reitz "%#llx unaligned (guest offset: %#" PRIx64 1164a97c67eeSMax Reitz ")", cluster_offset & L2E_OFFSET_MASK, 1165a97c67eeSMax Reitz guest_offset); 1166a97c67eeSMax Reitz ret = -EIO; 1167a97c67eeSMax Reitz goto out; 1168a97c67eeSMax Reitz } 1169a97c67eeSMax Reitz 1170e62daaf6SKevin Wolf if (*host_offset != 0 && !offset_matches) { 1171e62daaf6SKevin Wolf *bytes = 0; 1172e62daaf6SKevin Wolf ret = 0; 1173e62daaf6SKevin Wolf goto out; 1174e62daaf6SKevin Wolf } 1175e62daaf6SKevin 
Wolf 11760af729ecSKevin Wolf /* We keep all QCOW_OFLAG_COPIED clusters */ 1177c53ede9fSKevin Wolf keep_clusters = 1178acb0467fSKevin Wolf count_contiguous_clusters(nb_clusters, s->cluster_size, 1179cde91766SAlberto Garcia &l2_slice[l2_index], 11800af729ecSKevin Wolf QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO); 1181c53ede9fSKevin Wolf assert(keep_clusters <= nb_clusters); 1182c53ede9fSKevin Wolf 1183c53ede9fSKevin Wolf *bytes = MIN(*bytes, 1184c53ede9fSKevin Wolf keep_clusters * s->cluster_size 1185c53ede9fSKevin Wolf - offset_into_cluster(s, guest_offset)); 11860af729ecSKevin Wolf 11870af729ecSKevin Wolf ret = 1; 11880af729ecSKevin Wolf } else { 11890af729ecSKevin Wolf ret = 0; 11900af729ecSKevin Wolf } 11910af729ecSKevin Wolf 11920af729ecSKevin Wolf /* Cleanup */ 1193e62daaf6SKevin Wolf out: 1194cde91766SAlberto Garcia qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); 11950af729ecSKevin Wolf 1196e62daaf6SKevin Wolf /* Only return a host offset if we actually made progress. Otherwise we 1197e62daaf6SKevin Wolf * would make requirements for handle_alloc() that it can't fulfill */ 1198a97c67eeSMax Reitz if (ret > 0) { 1199411d62b0SKevin Wolf *host_offset = (cluster_offset & L2E_OFFSET_MASK) 1200411d62b0SKevin Wolf + offset_into_cluster(s, guest_offset); 1201e62daaf6SKevin Wolf } 1202e62daaf6SKevin Wolf 12030af729ecSKevin Wolf return ret; 12040af729ecSKevin Wolf } 12050af729ecSKevin Wolf 12060af729ecSKevin Wolf /* 1207226c3c26SKevin Wolf * Allocates new clusters for the given guest_offset. 1208226c3c26SKevin Wolf * 1209226c3c26SKevin Wolf * At most *nb_clusters are allocated, and on return *nb_clusters is updated to 1210226c3c26SKevin Wolf * contain the number of clusters that have been allocated and are contiguous 1211226c3c26SKevin Wolf * in the image file. 1212226c3c26SKevin Wolf * 1213226c3c26SKevin Wolf * If *host_offset is non-zero, it specifies the offset in the image file at 1214226c3c26SKevin Wolf * which the new clusters must start. 
*nb_clusters can be 0 on return in this 1215226c3c26SKevin Wolf * case if the cluster at host_offset is already in use. If *host_offset is 1216226c3c26SKevin Wolf * zero, the clusters can be allocated anywhere in the image file. 1217226c3c26SKevin Wolf * 1218226c3c26SKevin Wolf * *host_offset is updated to contain the offset into the image file at which 1219226c3c26SKevin Wolf * the first allocated cluster starts. 1220226c3c26SKevin Wolf * 1221226c3c26SKevin Wolf * Return 0 on success and -errno in error cases. -EAGAIN means that the 1222226c3c26SKevin Wolf * function has been waiting for another request and the allocation must be 1223226c3c26SKevin Wolf * restarted, but the whole request should not be failed. 1224226c3c26SKevin Wolf */ 1225226c3c26SKevin Wolf static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, 1226b6d36defSMax Reitz uint64_t *host_offset, uint64_t *nb_clusters) 1227226c3c26SKevin Wolf { 1228ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 1229226c3c26SKevin Wolf 1230226c3c26SKevin Wolf trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset, 1231226c3c26SKevin Wolf *host_offset, *nb_clusters); 1232226c3c26SKevin Wolf 1233250196f1SKevin Wolf /* Allocate new clusters */ 1234250196f1SKevin Wolf trace_qcow2_cluster_alloc_phys(qemu_coroutine_self()); 1235250196f1SKevin Wolf if (*host_offset == 0) { 1236df021791SKevin Wolf int64_t cluster_offset = 1237df021791SKevin Wolf qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size); 1238250196f1SKevin Wolf if (cluster_offset < 0) { 1239250196f1SKevin Wolf return cluster_offset; 1240250196f1SKevin Wolf } 1241250196f1SKevin Wolf *host_offset = cluster_offset; 1242250196f1SKevin Wolf return 0; 1243df021791SKevin Wolf } else { 1244b6d36defSMax Reitz int64_t ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters); 1245df021791SKevin Wolf if (ret < 0) { 1246df021791SKevin Wolf return ret; 1247df021791SKevin Wolf } 1248df021791SKevin Wolf *nb_clusters = ret; 
1249df021791SKevin Wolf return 0; 1250df021791SKevin Wolf } 1251250196f1SKevin Wolf } 1252250196f1SKevin Wolf 1253250196f1SKevin Wolf /* 125410f0ed8bSKevin Wolf * Allocates new clusters for an area that either is yet unallocated or needs a 125510f0ed8bSKevin Wolf * copy on write. If *host_offset is non-zero, clusters are only allocated if 125610f0ed8bSKevin Wolf * the new allocation can match the specified host offset. 125710f0ed8bSKevin Wolf * 1258411d62b0SKevin Wolf * Note that guest_offset may not be cluster aligned. In this case, the 1259411d62b0SKevin Wolf * returned *host_offset points to exact byte referenced by guest_offset and 1260411d62b0SKevin Wolf * therefore isn't cluster aligned as well. 126110f0ed8bSKevin Wolf * 126210f0ed8bSKevin Wolf * Returns: 126310f0ed8bSKevin Wolf * 0: if no clusters could be allocated. *bytes is set to 0, 126410f0ed8bSKevin Wolf * *host_offset is left unchanged. 126510f0ed8bSKevin Wolf * 126610f0ed8bSKevin Wolf * 1: if new clusters were allocated. *bytes may be decreased if the 126710f0ed8bSKevin Wolf * new allocation doesn't cover all of the requested area. 126810f0ed8bSKevin Wolf * *host_offset is updated to contain the host offset of the first 126910f0ed8bSKevin Wolf * newly allocated cluster. 
127010f0ed8bSKevin Wolf * 127110f0ed8bSKevin Wolf * -errno: in error cases 127210f0ed8bSKevin Wolf */ 127310f0ed8bSKevin Wolf static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, 1274c37f4cd7SKevin Wolf uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) 127510f0ed8bSKevin Wolf { 1276ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 127710f0ed8bSKevin Wolf int l2_index; 12786d99a344SAlberto Garcia uint64_t *l2_slice; 127910f0ed8bSKevin Wolf uint64_t entry; 1280b6d36defSMax Reitz uint64_t nb_clusters; 128110f0ed8bSKevin Wolf int ret; 1282564a6b69SMax Reitz bool keep_old_clusters = false; 128310f0ed8bSKevin Wolf 1284564a6b69SMax Reitz uint64_t alloc_cluster_offset = 0; 128510f0ed8bSKevin Wolf 128610f0ed8bSKevin Wolf trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset, 128710f0ed8bSKevin Wolf *bytes); 128810f0ed8bSKevin Wolf assert(*bytes > 0); 128910f0ed8bSKevin Wolf 1290f5bc6350SKevin Wolf /* 12916d99a344SAlberto Garcia * Calculate the number of clusters to look for. We stop at L2 slice 1292f5bc6350SKevin Wolf * boundaries to keep things simple. 
1293f5bc6350SKevin Wolf */ 1294c37f4cd7SKevin Wolf nb_clusters = 1295c37f4cd7SKevin Wolf size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes); 1296c37f4cd7SKevin Wolf 12976d99a344SAlberto Garcia l2_index = offset_to_l2_slice_index(s, guest_offset); 12986d99a344SAlberto Garcia nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index); 1299b6d36defSMax Reitz assert(nb_clusters <= INT_MAX); 1300f5bc6350SKevin Wolf 130110f0ed8bSKevin Wolf /* Find L2 entry for the first involved cluster */ 13026d99a344SAlberto Garcia ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index); 130310f0ed8bSKevin Wolf if (ret < 0) { 130410f0ed8bSKevin Wolf return ret; 130510f0ed8bSKevin Wolf } 130610f0ed8bSKevin Wolf 13076d99a344SAlberto Garcia entry = be64_to_cpu(l2_slice[l2_index]); 130810f0ed8bSKevin Wolf 130910f0ed8bSKevin Wolf /* For the moment, overwrite compressed clusters one by one */ 131010f0ed8bSKevin Wolf if (entry & QCOW_OFLAG_COMPRESSED) { 131110f0ed8bSKevin Wolf nb_clusters = 1; 131210f0ed8bSKevin Wolf } else { 13136d99a344SAlberto Garcia nb_clusters = count_cow_clusters(s, nb_clusters, l2_slice, l2_index); 131410f0ed8bSKevin Wolf } 131510f0ed8bSKevin Wolf 1316ecdd5333SKevin Wolf /* This function is only called when there were no non-COW clusters, so if 1317ecdd5333SKevin Wolf * we can't find any unallocated or COW clusters either, something is 1318ecdd5333SKevin Wolf * wrong with our code. 
*/ 1319ecdd5333SKevin Wolf assert(nb_clusters > 0); 1320ecdd5333SKevin Wolf 1321fdfab37dSEric Blake if (qcow2_get_cluster_type(entry) == QCOW2_CLUSTER_ZERO_ALLOC && 1322fdfab37dSEric Blake (entry & QCOW_OFLAG_COPIED) && 1323564a6b69SMax Reitz (!*host_offset || 1324564a6b69SMax Reitz start_of_cluster(s, *host_offset) == (entry & L2E_OFFSET_MASK))) 1325564a6b69SMax Reitz { 132693bbaf03SMax Reitz int preallocated_nb_clusters; 132793bbaf03SMax Reitz 132893bbaf03SMax Reitz if (offset_into_cluster(s, entry & L2E_OFFSET_MASK)) { 132993bbaf03SMax Reitz qcow2_signal_corruption(bs, true, -1, -1, "Preallocated zero " 133093bbaf03SMax Reitz "cluster offset %#llx unaligned (guest " 133193bbaf03SMax Reitz "offset: %#" PRIx64 ")", 133293bbaf03SMax Reitz entry & L2E_OFFSET_MASK, guest_offset); 133393bbaf03SMax Reitz ret = -EIO; 133493bbaf03SMax Reitz goto fail; 133593bbaf03SMax Reitz } 133693bbaf03SMax Reitz 1337564a6b69SMax Reitz /* Try to reuse preallocated zero clusters; contiguous normal clusters 1338564a6b69SMax Reitz * would be fine, too, but count_cow_clusters() above has limited 1339564a6b69SMax Reitz * nb_clusters already to a range of COW clusters */ 134093bbaf03SMax Reitz preallocated_nb_clusters = 1341564a6b69SMax Reitz count_contiguous_clusters(nb_clusters, s->cluster_size, 13426d99a344SAlberto Garcia &l2_slice[l2_index], QCOW_OFLAG_COPIED); 1343564a6b69SMax Reitz assert(preallocated_nb_clusters > 0); 1344564a6b69SMax Reitz 1345564a6b69SMax Reitz nb_clusters = preallocated_nb_clusters; 1346564a6b69SMax Reitz alloc_cluster_offset = entry & L2E_OFFSET_MASK; 1347564a6b69SMax Reitz 1348564a6b69SMax Reitz /* We want to reuse these clusters, so qcow2_alloc_cluster_link_l2() 1349564a6b69SMax Reitz * should not free them. 
*/ 1350564a6b69SMax Reitz keep_old_clusters = true; 1351564a6b69SMax Reitz } 1352564a6b69SMax Reitz 13536d99a344SAlberto Garcia qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); 135410f0ed8bSKevin Wolf 1355564a6b69SMax Reitz if (!alloc_cluster_offset) { 135610f0ed8bSKevin Wolf /* Allocate, if necessary at a given offset in the image file */ 1357411d62b0SKevin Wolf alloc_cluster_offset = start_of_cluster(s, *host_offset); 135883baa9a4SKevin Wolf ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset, 135910f0ed8bSKevin Wolf &nb_clusters); 136010f0ed8bSKevin Wolf if (ret < 0) { 136110f0ed8bSKevin Wolf goto fail; 136210f0ed8bSKevin Wolf } 136310f0ed8bSKevin Wolf 136483baa9a4SKevin Wolf /* Can't extend contiguous allocation */ 136583baa9a4SKevin Wolf if (nb_clusters == 0) { 136683baa9a4SKevin Wolf *bytes = 0; 136783baa9a4SKevin Wolf return 0; 136883baa9a4SKevin Wolf } 136983baa9a4SKevin Wolf 1370564a6b69SMax Reitz /* !*host_offset would overwrite the image header and is reserved for 1371564a6b69SMax Reitz * "no host offset preferred". If 0 was a valid host offset, it'd 1372564a6b69SMax Reitz * trigger the following overlap check; do that now to avoid having an 1373564a6b69SMax Reitz * invalid value in *host_offset. */ 1374ff52aab2SMax Reitz if (!alloc_cluster_offset) { 1375ff52aab2SMax Reitz ret = qcow2_pre_write_overlap_check(bs, 0, alloc_cluster_offset, 1376ff52aab2SMax Reitz nb_clusters * s->cluster_size); 1377ff52aab2SMax Reitz assert(ret < 0); 1378ff52aab2SMax Reitz goto fail; 1379ff52aab2SMax Reitz } 1380564a6b69SMax Reitz } 1381ff52aab2SMax Reitz 138210f0ed8bSKevin Wolf /* 138383baa9a4SKevin Wolf * Save info needed for meta data update. 138483baa9a4SKevin Wolf * 138585567393SKevin Wolf * requested_bytes: Number of bytes from the start of the first 138610f0ed8bSKevin Wolf * newly allocated cluster to the end of the (possibly shortened 138710f0ed8bSKevin Wolf * before) write request. 
138810f0ed8bSKevin Wolf * 138985567393SKevin Wolf * avail_bytes: Number of bytes from the start of the first 139010f0ed8bSKevin Wolf * newly allocated to the end of the last newly allocated cluster. 139110f0ed8bSKevin Wolf * 139285567393SKevin Wolf * nb_bytes: The number of bytes from the start of the first 139383baa9a4SKevin Wolf * newly allocated cluster to the end of the area that the write 139410f0ed8bSKevin Wolf * request actually writes to (excluding COW at the end) 139510f0ed8bSKevin Wolf */ 139685567393SKevin Wolf uint64_t requested_bytes = *bytes + offset_into_cluster(s, guest_offset); 139785567393SKevin Wolf int avail_bytes = MIN(INT_MAX, nb_clusters << s->cluster_bits); 139885567393SKevin Wolf int nb_bytes = MIN(requested_bytes, avail_bytes); 139988c6588cSKevin Wolf QCowL2Meta *old_m = *m; 140010f0ed8bSKevin Wolf 140110f0ed8bSKevin Wolf *m = g_malloc0(sizeof(**m)); 140210f0ed8bSKevin Wolf 140310f0ed8bSKevin Wolf **m = (QCowL2Meta) { 140488c6588cSKevin Wolf .next = old_m, 140588c6588cSKevin Wolf 1406411d62b0SKevin Wolf .alloc_offset = alloc_cluster_offset, 140783baa9a4SKevin Wolf .offset = start_of_cluster(s, guest_offset), 140810f0ed8bSKevin Wolf .nb_clusters = nb_clusters, 140910f0ed8bSKevin Wolf 1410564a6b69SMax Reitz .keep_old_clusters = keep_old_clusters, 1411564a6b69SMax Reitz 141210f0ed8bSKevin Wolf .cow_start = { 141310f0ed8bSKevin Wolf .offset = 0, 141485567393SKevin Wolf .nb_bytes = offset_into_cluster(s, guest_offset), 141510f0ed8bSKevin Wolf }, 141610f0ed8bSKevin Wolf .cow_end = { 141785567393SKevin Wolf .offset = nb_bytes, 141885567393SKevin Wolf .nb_bytes = avail_bytes - nb_bytes, 141910f0ed8bSKevin Wolf }, 142010f0ed8bSKevin Wolf }; 142110f0ed8bSKevin Wolf qemu_co_queue_init(&(*m)->dependent_requests); 142210f0ed8bSKevin Wolf QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight); 142310f0ed8bSKevin Wolf 1424411d62b0SKevin Wolf *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset); 142585567393SKevin Wolf *bytes = 
MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset)); 1426c37f4cd7SKevin Wolf assert(*bytes != 0); 142710f0ed8bSKevin Wolf 142810f0ed8bSKevin Wolf return 1; 142910f0ed8bSKevin Wolf 143010f0ed8bSKevin Wolf fail: 143110f0ed8bSKevin Wolf if (*m && (*m)->nb_clusters > 0) { 143210f0ed8bSKevin Wolf QLIST_REMOVE(*m, next_in_flight); 143310f0ed8bSKevin Wolf } 143410f0ed8bSKevin Wolf return ret; 143510f0ed8bSKevin Wolf } 143610f0ed8bSKevin Wolf 143710f0ed8bSKevin Wolf /* 143845aba42fSKevin Wolf * alloc_cluster_offset 143945aba42fSKevin Wolf * 1440250196f1SKevin Wolf * For a given offset on the virtual disk, find the cluster offset in qcow2 1441250196f1SKevin Wolf * file. If the offset is not found, allocate a new cluster. 144245aba42fSKevin Wolf * 1443250196f1SKevin Wolf * If the cluster was already allocated, m->nb_clusters is set to 0 and 1444a7912369SFrediano Ziglio * other fields in m are meaningless. 144545aba42fSKevin Wolf * 1446148da7eaSKevin Wolf * If the cluster is newly allocated, m->nb_clusters is set to the number of 144768d100e9SKevin Wolf * contiguous clusters that have been allocated. In this case, the other 144868d100e9SKevin Wolf * fields of m are valid and contain information about the first allocated 144968d100e9SKevin Wolf * cluster. 1450148da7eaSKevin Wolf * 145168d100e9SKevin Wolf * If the request conflicts with another write request in flight, the coroutine 145268d100e9SKevin Wolf * is queued and will be reentered when the dependency has completed. 
1453148da7eaSKevin Wolf * 1454148da7eaSKevin Wolf * Return 0 on success and -errno in error cases 145545aba42fSKevin Wolf */ 1456f4f0d391SKevin Wolf int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, 1457d46a0bb2SKevin Wolf unsigned int *bytes, uint64_t *host_offset, 1458d46a0bb2SKevin Wolf QCowL2Meta **m) 145945aba42fSKevin Wolf { 1460ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 1461710c2496SKevin Wolf uint64_t start, remaining; 1462250196f1SKevin Wolf uint64_t cluster_offset; 146365eb2e35SKevin Wolf uint64_t cur_bytes; 1464710c2496SKevin Wolf int ret; 146545aba42fSKevin Wolf 1466d46a0bb2SKevin Wolf trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *bytes); 1467710c2496SKevin Wolf 146872424114SKevin Wolf again: 146916f0587eSHu Tao start = offset; 1470d46a0bb2SKevin Wolf remaining = *bytes; 14710af729ecSKevin Wolf cluster_offset = 0; 14720af729ecSKevin Wolf *host_offset = 0; 1473ecdd5333SKevin Wolf cur_bytes = 0; 1474ecdd5333SKevin Wolf *m = NULL; 14750af729ecSKevin Wolf 14762c3b32d2SKevin Wolf while (true) { 1477ecdd5333SKevin Wolf 1478ecdd5333SKevin Wolf if (!*host_offset) { 1479ecdd5333SKevin Wolf *host_offset = start_of_cluster(s, cluster_offset); 1480ecdd5333SKevin Wolf } 1481ecdd5333SKevin Wolf 1482ecdd5333SKevin Wolf assert(remaining >= cur_bytes); 1483ecdd5333SKevin Wolf 1484ecdd5333SKevin Wolf start += cur_bytes; 1485ecdd5333SKevin Wolf remaining -= cur_bytes; 1486ecdd5333SKevin Wolf cluster_offset += cur_bytes; 1487ecdd5333SKevin Wolf 1488ecdd5333SKevin Wolf if (remaining == 0) { 1489ecdd5333SKevin Wolf break; 1490ecdd5333SKevin Wolf } 1491ecdd5333SKevin Wolf 1492ecdd5333SKevin Wolf cur_bytes = remaining; 1493ecdd5333SKevin Wolf 1494250196f1SKevin Wolf /* 149517a71e58SKevin Wolf * Now start gathering as many contiguous clusters as possible: 149617a71e58SKevin Wolf * 149717a71e58SKevin Wolf * 1. 
Check for overlaps with in-flight allocations 149817a71e58SKevin Wolf * 14992c3b32d2SKevin Wolf * a) Overlap not in the first cluster -> shorten this request and 15002c3b32d2SKevin Wolf * let the caller handle the rest in its next loop iteration. 150117a71e58SKevin Wolf * 15022c3b32d2SKevin Wolf * b) Real overlaps of two requests. Yield and restart the search 15032c3b32d2SKevin Wolf * for contiguous clusters (the situation could have changed 15042c3b32d2SKevin Wolf * while we were sleeping) 150517a71e58SKevin Wolf * 150617a71e58SKevin Wolf * c) TODO: Request starts in the same cluster as the in-flight 15072c3b32d2SKevin Wolf * allocation ends. Shorten the COW of the in-fight allocation, 15082c3b32d2SKevin Wolf * set cluster_offset to write to the same cluster and set up 15092c3b32d2SKevin Wolf * the right synchronisation between the in-flight request and 15102c3b32d2SKevin Wolf * the new one. 151117a71e58SKevin Wolf */ 1512ecdd5333SKevin Wolf ret = handle_dependencies(bs, start, &cur_bytes, m); 151317a71e58SKevin Wolf if (ret == -EAGAIN) { 1514ecdd5333SKevin Wolf /* Currently handle_dependencies() doesn't yield if we already had 1515ecdd5333SKevin Wolf * an allocation. If it did, we would have to clean up the L2Meta 1516ecdd5333SKevin Wolf * structs before starting over. */ 1517ecdd5333SKevin Wolf assert(*m == NULL); 151817a71e58SKevin Wolf goto again; 151917a71e58SKevin Wolf } else if (ret < 0) { 152017a71e58SKevin Wolf return ret; 1521ecdd5333SKevin Wolf } else if (cur_bytes == 0) { 1522ecdd5333SKevin Wolf break; 152317a71e58SKevin Wolf } else { 152417a71e58SKevin Wolf /* handle_dependencies() may have decreased cur_bytes (shortened 152517a71e58SKevin Wolf * the allocations below) so that the next dependency is processed 152617a71e58SKevin Wolf * correctly during the next loop iteration. */ 152717a71e58SKevin Wolf } 152817a71e58SKevin Wolf 152972424114SKevin Wolf /* 15300af729ecSKevin Wolf * 2. Count contiguous COPIED clusters. 
153172424114SKevin Wolf */ 1532710c2496SKevin Wolf ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m); 153372424114SKevin Wolf if (ret < 0) { 153472424114SKevin Wolf return ret; 15350af729ecSKevin Wolf } else if (ret) { 1536ecdd5333SKevin Wolf continue; 1537e62daaf6SKevin Wolf } else if (cur_bytes == 0) { 15382c3b32d2SKevin Wolf break; 153972424114SKevin Wolf } 154072424114SKevin Wolf 15410af729ecSKevin Wolf /* 15420af729ecSKevin Wolf * 3. If the request still hasn't completed, allocate new clusters, 15430af729ecSKevin Wolf * considering any cluster_offset of steps 1c or 2. 15440af729ecSKevin Wolf */ 1545710c2496SKevin Wolf ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m); 1546037689d8SKevin Wolf if (ret < 0) { 1547037689d8SKevin Wolf return ret; 1548710c2496SKevin Wolf } else if (ret) { 1549ecdd5333SKevin Wolf continue; 15502c3b32d2SKevin Wolf } else { 15512c3b32d2SKevin Wolf assert(cur_bytes == 0); 15522c3b32d2SKevin Wolf break; 15532c3b32d2SKevin Wolf } 1554710c2496SKevin Wolf } 1555250196f1SKevin Wolf 1556d46a0bb2SKevin Wolf *bytes -= remaining; 1557d46a0bb2SKevin Wolf assert(*bytes > 0); 1558710c2496SKevin Wolf assert(*host_offset != 0); 155945aba42fSKevin Wolf 1560148da7eaSKevin Wolf return 0; 156145aba42fSKevin Wolf } 156245aba42fSKevin Wolf 156345aba42fSKevin Wolf static int decompress_buffer(uint8_t *out_buf, int out_buf_size, 156445aba42fSKevin Wolf const uint8_t *buf, int buf_size) 156545aba42fSKevin Wolf { 156645aba42fSKevin Wolf z_stream strm1, *strm = &strm1; 156745aba42fSKevin Wolf int ret, out_len; 156845aba42fSKevin Wolf 156945aba42fSKevin Wolf memset(strm, 0, sizeof(*strm)); 157045aba42fSKevin Wolf 157145aba42fSKevin Wolf strm->next_in = (uint8_t *)buf; 157245aba42fSKevin Wolf strm->avail_in = buf_size; 157345aba42fSKevin Wolf strm->next_out = out_buf; 157445aba42fSKevin Wolf strm->avail_out = out_buf_size; 157545aba42fSKevin Wolf 157645aba42fSKevin Wolf ret = inflateInit2(strm, -12); 157745aba42fSKevin Wolf if (ret != 
Z_OK) 157845aba42fSKevin Wolf return -1; 157945aba42fSKevin Wolf ret = inflate(strm, Z_FINISH); 158045aba42fSKevin Wolf out_len = strm->next_out - out_buf; 158145aba42fSKevin Wolf if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || 158245aba42fSKevin Wolf out_len != out_buf_size) { 158345aba42fSKevin Wolf inflateEnd(strm); 158445aba42fSKevin Wolf return -1; 158545aba42fSKevin Wolf } 158645aba42fSKevin Wolf inflateEnd(strm); 158745aba42fSKevin Wolf return 0; 158845aba42fSKevin Wolf } 158945aba42fSKevin Wolf 159066f82ceeSKevin Wolf int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) 159145aba42fSKevin Wolf { 1592ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 159345aba42fSKevin Wolf int ret, csize, nb_csectors, sector_offset; 159445aba42fSKevin Wolf uint64_t coffset; 159545aba42fSKevin Wolf 159645aba42fSKevin Wolf coffset = cluster_offset & s->cluster_offset_mask; 159745aba42fSKevin Wolf if (s->cluster_cache_offset != coffset) { 159845aba42fSKevin Wolf nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1; 159945aba42fSKevin Wolf sector_offset = coffset & 511; 160045aba42fSKevin Wolf csize = nb_csectors * 512 - sector_offset; 16013e4c7052SStefan Hajnoczi 16023e4c7052SStefan Hajnoczi /* Allocate buffers on first decompress operation, most images are 16033e4c7052SStefan Hajnoczi * uncompressed and the memory overhead can be avoided. The buffers 16043e4c7052SStefan Hajnoczi * are freed in .bdrv_close(). 
16053e4c7052SStefan Hajnoczi */ 16063e4c7052SStefan Hajnoczi if (!s->cluster_data) { 16073e4c7052SStefan Hajnoczi /* one more sector for decompressed data alignment */ 16083e4c7052SStefan Hajnoczi s->cluster_data = qemu_try_blockalign(bs->file->bs, 16093e4c7052SStefan Hajnoczi QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size + 512); 16103e4c7052SStefan Hajnoczi if (!s->cluster_data) { 16113e4c7052SStefan Hajnoczi return -ENOMEM; 16123e4c7052SStefan Hajnoczi } 16133e4c7052SStefan Hajnoczi } 16143e4c7052SStefan Hajnoczi if (!s->cluster_cache) { 16153e4c7052SStefan Hajnoczi s->cluster_cache = g_malloc(s->cluster_size); 16163e4c7052SStefan Hajnoczi } 16173e4c7052SStefan Hajnoczi 161866f82ceeSKevin Wolf BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED); 1619fbcbbf4eSKevin Wolf ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data, 16209a4f4c31SKevin Wolf nb_csectors); 162145aba42fSKevin Wolf if (ret < 0) { 16228af36488SKevin Wolf return ret; 162345aba42fSKevin Wolf } 162445aba42fSKevin Wolf if (decompress_buffer(s->cluster_cache, s->cluster_size, 162545aba42fSKevin Wolf s->cluster_data + sector_offset, csize) < 0) { 16268af36488SKevin Wolf return -EIO; 162745aba42fSKevin Wolf } 162845aba42fSKevin Wolf s->cluster_cache_offset = coffset; 162945aba42fSKevin Wolf } 163045aba42fSKevin Wolf return 0; 163145aba42fSKevin Wolf } 16325ea929e3SKevin Wolf 16335ea929e3SKevin Wolf /* 16345ea929e3SKevin Wolf * This discards as many clusters of nb_clusters as possible at once (i.e. 163521ab3addSAlberto Garcia * all clusters in the same L2 slice) and returns the number of discarded 16365ea929e3SKevin Wolf * clusters. 
16375ea929e3SKevin Wolf */ 163821ab3addSAlberto Garcia static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset, 163921ab3addSAlberto Garcia uint64_t nb_clusters, 164021ab3addSAlberto Garcia enum qcow2_discard_type type, bool full_discard) 16415ea929e3SKevin Wolf { 1642ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 164321ab3addSAlberto Garcia uint64_t *l2_slice; 16445ea929e3SKevin Wolf int l2_index; 16455ea929e3SKevin Wolf int ret; 16465ea929e3SKevin Wolf int i; 16475ea929e3SKevin Wolf 164821ab3addSAlberto Garcia ret = get_cluster_table(bs, offset, &l2_slice, &l2_index); 16495ea929e3SKevin Wolf if (ret < 0) { 16505ea929e3SKevin Wolf return ret; 16515ea929e3SKevin Wolf } 16525ea929e3SKevin Wolf 165321ab3addSAlberto Garcia /* Limit nb_clusters to one L2 slice */ 165421ab3addSAlberto Garcia nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index); 1655b6d36defSMax Reitz assert(nb_clusters <= INT_MAX); 16565ea929e3SKevin Wolf 16575ea929e3SKevin Wolf for (i = 0; i < nb_clusters; i++) { 1658c883db0dSMax Reitz uint64_t old_l2_entry; 16595ea929e3SKevin Wolf 166021ab3addSAlberto Garcia old_l2_entry = be64_to_cpu(l2_slice[l2_index + i]); 1661a71835a0SKevin Wolf 1662a71835a0SKevin Wolf /* 1663808c4b6fSMax Reitz * If full_discard is false, make sure that a discarded area reads back 1664808c4b6fSMax Reitz * as zeroes for v3 images (we cannot do it for v2 without actually 1665808c4b6fSMax Reitz * writing a zero-filled buffer). We can skip the operation if the 1666808c4b6fSMax Reitz * cluster is already marked as zero, or if it's unallocated and we 1667808c4b6fSMax Reitz * don't have a backing file. 1668a71835a0SKevin Wolf * 1669237d78f8SEric Blake * TODO We might want to use bdrv_block_status(bs) here, but we're 1670a71835a0SKevin Wolf * holding s->lock, so that doesn't work today. 
1671808c4b6fSMax Reitz * 1672808c4b6fSMax Reitz * If full_discard is true, the sector should not read back as zeroes, 1673808c4b6fSMax Reitz * but rather fall through to the backing file. 1674a71835a0SKevin Wolf */ 1675c883db0dSMax Reitz switch (qcow2_get_cluster_type(old_l2_entry)) { 1676c883db0dSMax Reitz case QCOW2_CLUSTER_UNALLOCATED: 1677760e0063SKevin Wolf if (full_discard || !bs->backing) { 1678a71835a0SKevin Wolf continue; 1679a71835a0SKevin Wolf } 1680c883db0dSMax Reitz break; 1681a71835a0SKevin Wolf 1682fdfab37dSEric Blake case QCOW2_CLUSTER_ZERO_PLAIN: 1683fdfab37dSEric Blake if (!full_discard) { 16845ea929e3SKevin Wolf continue; 1685808c4b6fSMax Reitz } 1686808c4b6fSMax Reitz break; 1687c883db0dSMax Reitz 1688fdfab37dSEric Blake case QCOW2_CLUSTER_ZERO_ALLOC: 1689c883db0dSMax Reitz case QCOW2_CLUSTER_NORMAL: 1690c883db0dSMax Reitz case QCOW2_CLUSTER_COMPRESSED: 1691c883db0dSMax Reitz break; 1692c883db0dSMax Reitz 1693c883db0dSMax Reitz default: 1694c883db0dSMax Reitz abort(); 16955ea929e3SKevin Wolf } 16965ea929e3SKevin Wolf 16975ea929e3SKevin Wolf /* First remove L2 entries */ 169821ab3addSAlberto Garcia qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice); 1699808c4b6fSMax Reitz if (!full_discard && s->qcow_version >= 3) { 170021ab3addSAlberto Garcia l2_slice[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); 1701a71835a0SKevin Wolf } else { 170221ab3addSAlberto Garcia l2_slice[l2_index + i] = cpu_to_be64(0); 1703a71835a0SKevin Wolf } 17045ea929e3SKevin Wolf 17055ea929e3SKevin Wolf /* Then decrease the refcount */ 1706c883db0dSMax Reitz qcow2_free_any_clusters(bs, old_l2_entry, 1, type); 17075ea929e3SKevin Wolf } 17085ea929e3SKevin Wolf 170921ab3addSAlberto Garcia qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); 17105ea929e3SKevin Wolf 17115ea929e3SKevin Wolf return nb_clusters; 17125ea929e3SKevin Wolf } 17135ea929e3SKevin Wolf 1714d2cb36afSEric Blake int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset, 1715d2cb36afSEric Blake 
uint64_t bytes, enum qcow2_discard_type type, 1716d2cb36afSEric Blake bool full_discard) 17175ea929e3SKevin Wolf { 1718ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 1719d2cb36afSEric Blake uint64_t end_offset = offset + bytes; 1720b6d36defSMax Reitz uint64_t nb_clusters; 1721d2cb36afSEric Blake int64_t cleared; 17225ea929e3SKevin Wolf int ret; 17235ea929e3SKevin Wolf 1724f10ee139SEric Blake /* Caller must pass aligned values, except at image end */ 17250c1bd469SEric Blake assert(QEMU_IS_ALIGNED(offset, s->cluster_size)); 1726f10ee139SEric Blake assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) || 1727f10ee139SEric Blake end_offset == bs->total_sectors << BDRV_SECTOR_BITS); 17285ea929e3SKevin Wolf 1729d2cb36afSEric Blake nb_clusters = size_to_clusters(s, bytes); 17305ea929e3SKevin Wolf 17310b919faeSKevin Wolf s->cache_discards = true; 17320b919faeSKevin Wolf 173321ab3addSAlberto Garcia /* Each L2 slice is handled by its own loop iteration */ 17345ea929e3SKevin Wolf while (nb_clusters > 0) { 173521ab3addSAlberto Garcia cleared = discard_in_l2_slice(bs, offset, nb_clusters, type, 1736d2cb36afSEric Blake full_discard); 1737d2cb36afSEric Blake if (cleared < 0) { 1738d2cb36afSEric Blake ret = cleared; 17390b919faeSKevin Wolf goto fail; 17405ea929e3SKevin Wolf } 17415ea929e3SKevin Wolf 1742d2cb36afSEric Blake nb_clusters -= cleared; 1743d2cb36afSEric Blake offset += (cleared * s->cluster_size); 17445ea929e3SKevin Wolf } 17455ea929e3SKevin Wolf 17460b919faeSKevin Wolf ret = 0; 17470b919faeSKevin Wolf fail: 17480b919faeSKevin Wolf s->cache_discards = false; 17490b919faeSKevin Wolf qcow2_process_discards(bs, ret); 17500b919faeSKevin Wolf 17510b919faeSKevin Wolf return ret; 17525ea929e3SKevin Wolf } 1753621f0589SKevin Wolf 1754621f0589SKevin Wolf /* 1755621f0589SKevin Wolf * This zeroes as many clusters of nb_clusters as possible at once (i.e. 
1756a9a9f8f0SAlberto Garcia * all clusters in the same L2 slice) and returns the number of zeroed 1757621f0589SKevin Wolf * clusters. 1758621f0589SKevin Wolf */ 1759a9a9f8f0SAlberto Garcia static int zero_in_l2_slice(BlockDriverState *bs, uint64_t offset, 1760170f4b2eSFam Zheng uint64_t nb_clusters, int flags) 1761621f0589SKevin Wolf { 1762ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 1763a9a9f8f0SAlberto Garcia uint64_t *l2_slice; 1764621f0589SKevin Wolf int l2_index; 1765621f0589SKevin Wolf int ret; 1766621f0589SKevin Wolf int i; 176706cc5e2bSEric Blake bool unmap = !!(flags & BDRV_REQ_MAY_UNMAP); 1768621f0589SKevin Wolf 1769a9a9f8f0SAlberto Garcia ret = get_cluster_table(bs, offset, &l2_slice, &l2_index); 1770621f0589SKevin Wolf if (ret < 0) { 1771621f0589SKevin Wolf return ret; 1772621f0589SKevin Wolf } 1773621f0589SKevin Wolf 1774a9a9f8f0SAlberto Garcia /* Limit nb_clusters to one L2 slice */ 1775a9a9f8f0SAlberto Garcia nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index); 1776b6d36defSMax Reitz assert(nb_clusters <= INT_MAX); 1777621f0589SKevin Wolf 1778621f0589SKevin Wolf for (i = 0; i < nb_clusters; i++) { 1779621f0589SKevin Wolf uint64_t old_offset; 178006cc5e2bSEric Blake QCow2ClusterType cluster_type; 1781621f0589SKevin Wolf 1782a9a9f8f0SAlberto Garcia old_offset = be64_to_cpu(l2_slice[l2_index + i]); 1783621f0589SKevin Wolf 178406cc5e2bSEric Blake /* 178506cc5e2bSEric Blake * Minimize L2 changes if the cluster already reads back as 178606cc5e2bSEric Blake * zeroes with correct allocation. 
178706cc5e2bSEric Blake */ 178806cc5e2bSEric Blake cluster_type = qcow2_get_cluster_type(old_offset); 178906cc5e2bSEric Blake if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN || 179006cc5e2bSEric Blake (cluster_type == QCOW2_CLUSTER_ZERO_ALLOC && !unmap)) { 179106cc5e2bSEric Blake continue; 179206cc5e2bSEric Blake } 179306cc5e2bSEric Blake 1794a9a9f8f0SAlberto Garcia qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice); 179506cc5e2bSEric Blake if (cluster_type == QCOW2_CLUSTER_COMPRESSED || unmap) { 1796a9a9f8f0SAlberto Garcia l2_slice[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); 17976cfcb9b8SKevin Wolf qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST); 1798621f0589SKevin Wolf } else { 1799a9a9f8f0SAlberto Garcia l2_slice[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO); 1800621f0589SKevin Wolf } 1801621f0589SKevin Wolf } 1802621f0589SKevin Wolf 1803a9a9f8f0SAlberto Garcia qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); 1804621f0589SKevin Wolf 1805621f0589SKevin Wolf return nb_clusters; 1806621f0589SKevin Wolf } 1807621f0589SKevin Wolf 1808d2cb36afSEric Blake int qcow2_cluster_zeroize(BlockDriverState *bs, uint64_t offset, 1809d2cb36afSEric Blake uint64_t bytes, int flags) 1810621f0589SKevin Wolf { 1811ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 1812d2cb36afSEric Blake uint64_t end_offset = offset + bytes; 1813b6d36defSMax Reitz uint64_t nb_clusters; 1814d2cb36afSEric Blake int64_t cleared; 1815621f0589SKevin Wolf int ret; 1816621f0589SKevin Wolf 1817f10ee139SEric Blake /* Caller must pass aligned values, except at image end */ 1818f10ee139SEric Blake assert(QEMU_IS_ALIGNED(offset, s->cluster_size)); 1819f10ee139SEric Blake assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) || 1820f10ee139SEric Blake end_offset == bs->total_sectors << BDRV_SECTOR_BITS); 1821f10ee139SEric Blake 1822621f0589SKevin Wolf /* The zero flag is only supported by version 3 and newer */ 1823621f0589SKevin Wolf if (s->qcow_version < 3) { 1824621f0589SKevin 
Wolf return -ENOTSUP; 1825621f0589SKevin Wolf } 1826621f0589SKevin Wolf 1827a9a9f8f0SAlberto Garcia /* Each L2 slice is handled by its own loop iteration */ 1828d2cb36afSEric Blake nb_clusters = size_to_clusters(s, bytes); 1829621f0589SKevin Wolf 18300b919faeSKevin Wolf s->cache_discards = true; 18310b919faeSKevin Wolf 1832621f0589SKevin Wolf while (nb_clusters > 0) { 1833a9a9f8f0SAlberto Garcia cleared = zero_in_l2_slice(bs, offset, nb_clusters, flags); 1834d2cb36afSEric Blake if (cleared < 0) { 1835d2cb36afSEric Blake ret = cleared; 18360b919faeSKevin Wolf goto fail; 1837621f0589SKevin Wolf } 1838621f0589SKevin Wolf 1839d2cb36afSEric Blake nb_clusters -= cleared; 1840d2cb36afSEric Blake offset += (cleared * s->cluster_size); 1841621f0589SKevin Wolf } 1842621f0589SKevin Wolf 18430b919faeSKevin Wolf ret = 0; 18440b919faeSKevin Wolf fail: 18450b919faeSKevin Wolf s->cache_discards = false; 18460b919faeSKevin Wolf qcow2_process_discards(bs, ret); 18470b919faeSKevin Wolf 18480b919faeSKevin Wolf return ret; 1849621f0589SKevin Wolf } 185032b6444dSMax Reitz 185132b6444dSMax Reitz /* 185232b6444dSMax Reitz * Expands all zero clusters in a specific L1 table (or deallocates them, for 185332b6444dSMax Reitz * non-backed non-pre-allocated zero clusters). 185432b6444dSMax Reitz * 18554057a2b2SMax Reitz * l1_entries and *visited_l1_entries are used to keep track of progress for 18564057a2b2SMax Reitz * status_cb(). l1_entries contains the total number of L1 entries and 18574057a2b2SMax Reitz * *visited_l1_entries counts all visited L1 entries. 
185832b6444dSMax Reitz */ 185932b6444dSMax Reitz static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, 1860ecf58777SMax Reitz int l1_size, int64_t *visited_l1_entries, 18614057a2b2SMax Reitz int64_t l1_entries, 18628b13976dSMax Reitz BlockDriverAmendStatusCB *status_cb, 18638b13976dSMax Reitz void *cb_opaque) 186432b6444dSMax Reitz { 1865ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 186632b6444dSMax Reitz bool is_active_l1 = (l1_table == s->l1_table); 1867415184f5SAlberto Garcia uint64_t *l2_slice = NULL; 1868415184f5SAlberto Garcia unsigned slice, slice_size2, n_slices; 186932b6444dSMax Reitz int ret; 187032b6444dSMax Reitz int i, j; 187132b6444dSMax Reitz 1872415184f5SAlberto Garcia slice_size2 = s->l2_slice_size * sizeof(uint64_t); 1873415184f5SAlberto Garcia n_slices = s->cluster_size / slice_size2; 1874415184f5SAlberto Garcia 187532b6444dSMax Reitz if (!is_active_l1) { 187632b6444dSMax Reitz /* inactive L2 tables require a buffer to be stored in when loading 187732b6444dSMax Reitz * them from disk */ 1878415184f5SAlberto Garcia l2_slice = qemu_try_blockalign(bs->file->bs, slice_size2); 1879415184f5SAlberto Garcia if (l2_slice == NULL) { 1880de82815dSKevin Wolf return -ENOMEM; 1881de82815dSKevin Wolf } 188232b6444dSMax Reitz } 188332b6444dSMax Reitz 188432b6444dSMax Reitz for (i = 0; i < l1_size; i++) { 188532b6444dSMax Reitz uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK; 18860e06528eSMax Reitz uint64_t l2_refcount; 188732b6444dSMax Reitz 188832b6444dSMax Reitz if (!l2_offset) { 188932b6444dSMax Reitz /* unallocated */ 18904057a2b2SMax Reitz (*visited_l1_entries)++; 18914057a2b2SMax Reitz if (status_cb) { 18928b13976dSMax Reitz status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque); 18934057a2b2SMax Reitz } 189432b6444dSMax Reitz continue; 189532b6444dSMax Reitz } 189632b6444dSMax Reitz 18978dd93d93SMax Reitz if (offset_into_cluster(s, l2_offset)) { 18988dd93d93SMax Reitz qcow2_signal_corruption(bs, true, -1, -1, "L2 
table offset %#" 18998dd93d93SMax Reitz PRIx64 " unaligned (L1 index: %#x)", 19008dd93d93SMax Reitz l2_offset, i); 19018dd93d93SMax Reitz ret = -EIO; 19028dd93d93SMax Reitz goto fail; 19038dd93d93SMax Reitz } 19048dd93d93SMax Reitz 19059b765486SAlberto Garcia ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits, 19069b765486SAlberto Garcia &l2_refcount); 19079b765486SAlberto Garcia if (ret < 0) { 19089b765486SAlberto Garcia goto fail; 19099b765486SAlberto Garcia } 19109b765486SAlberto Garcia 1911415184f5SAlberto Garcia for (slice = 0; slice < n_slices; slice++) { 1912415184f5SAlberto Garcia uint64_t slice_offset = l2_offset + slice * slice_size2; 1913415184f5SAlberto Garcia bool l2_dirty = false; 191432b6444dSMax Reitz if (is_active_l1) { 191532b6444dSMax Reitz /* get active L2 tables from cache */ 1916415184f5SAlberto Garcia ret = qcow2_cache_get(bs, s->l2_table_cache, slice_offset, 1917415184f5SAlberto Garcia (void **)&l2_slice); 191832b6444dSMax Reitz } else { 191932b6444dSMax Reitz /* load inactive L2 tables from disk */ 1920415184f5SAlberto Garcia ret = bdrv_pread(bs->file, slice_offset, l2_slice, slice_size2); 192132b6444dSMax Reitz } 192232b6444dSMax Reitz if (ret < 0) { 192332b6444dSMax Reitz goto fail; 192432b6444dSMax Reitz } 192532b6444dSMax Reitz 1926415184f5SAlberto Garcia for (j = 0; j < s->l2_slice_size; j++) { 1927415184f5SAlberto Garcia uint64_t l2_entry = be64_to_cpu(l2_slice[j]); 1928ecf58777SMax Reitz int64_t offset = l2_entry & L2E_OFFSET_MASK; 1929226494ffSAlberto Garcia QCow2ClusterType cluster_type = 1930226494ffSAlberto Garcia qcow2_get_cluster_type(l2_entry); 193132b6444dSMax Reitz 1932fdfab37dSEric Blake if (cluster_type != QCOW2_CLUSTER_ZERO_PLAIN && 1933fdfab37dSEric Blake cluster_type != QCOW2_CLUSTER_ZERO_ALLOC) { 193432b6444dSMax Reitz continue; 193532b6444dSMax Reitz } 193632b6444dSMax Reitz 1937fdfab37dSEric Blake if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) { 1938760e0063SKevin Wolf if (!bs->backing) { 193932b6444dSMax Reitz 
/* not backed; therefore we can simply deallocate the 194032b6444dSMax Reitz * cluster */ 1941415184f5SAlberto Garcia l2_slice[j] = 0; 194232b6444dSMax Reitz l2_dirty = true; 194332b6444dSMax Reitz continue; 194432b6444dSMax Reitz } 194532b6444dSMax Reitz 194632b6444dSMax Reitz offset = qcow2_alloc_clusters(bs, s->cluster_size); 194732b6444dSMax Reitz if (offset < 0) { 194832b6444dSMax Reitz ret = offset; 194932b6444dSMax Reitz goto fail; 195032b6444dSMax Reitz } 1951ecf58777SMax Reitz 1952ecf58777SMax Reitz if (l2_refcount > 1) { 1953226494ffSAlberto Garcia /* For shared L2 tables, set the refcount accordingly 1954226494ffSAlberto Garcia * (it is already 1 and needs to be l2_refcount) */ 1955226494ffSAlberto Garcia ret = qcow2_update_cluster_refcount( 1956226494ffSAlberto Garcia bs, offset >> s->cluster_bits, 19572aabe7c7SMax Reitz refcount_diff(1, l2_refcount), false, 1958ecf58777SMax Reitz QCOW2_DISCARD_OTHER); 1959ecf58777SMax Reitz if (ret < 0) { 1960ecf58777SMax Reitz qcow2_free_clusters(bs, offset, s->cluster_size, 1961ecf58777SMax Reitz QCOW2_DISCARD_OTHER); 1962ecf58777SMax Reitz goto fail; 1963ecf58777SMax Reitz } 1964ecf58777SMax Reitz } 196532b6444dSMax Reitz } 196632b6444dSMax Reitz 19678dd93d93SMax Reitz if (offset_into_cluster(s, offset)) { 1968415184f5SAlberto Garcia int l2_index = slice * s->l2_slice_size + j; 1969226494ffSAlberto Garcia qcow2_signal_corruption( 1970226494ffSAlberto Garcia bs, true, -1, -1, 1971bcb07dbaSEric Blake "Cluster allocation offset " 19728dd93d93SMax Reitz "%#" PRIx64 " unaligned (L2 offset: %#" 19738dd93d93SMax Reitz PRIx64 ", L2 index: %#x)", offset, 1974415184f5SAlberto Garcia l2_offset, l2_index); 1975fdfab37dSEric Blake if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) { 19768dd93d93SMax Reitz qcow2_free_clusters(bs, offset, s->cluster_size, 19778dd93d93SMax Reitz QCOW2_DISCARD_ALWAYS); 19788dd93d93SMax Reitz } 19798dd93d93SMax Reitz ret = -EIO; 19808dd93d93SMax Reitz goto fail; 19818dd93d93SMax Reitz } 19828dd93d93SMax 
Reitz 1983226494ffSAlberto Garcia ret = qcow2_pre_write_overlap_check(bs, 0, offset, 1984226494ffSAlberto Garcia s->cluster_size); 198532b6444dSMax Reitz if (ret < 0) { 1986fdfab37dSEric Blake if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) { 198732b6444dSMax Reitz qcow2_free_clusters(bs, offset, s->cluster_size, 198832b6444dSMax Reitz QCOW2_DISCARD_ALWAYS); 1989320c7066SMax Reitz } 199032b6444dSMax Reitz goto fail; 199132b6444dSMax Reitz } 199232b6444dSMax Reitz 1993720ff280SKevin Wolf ret = bdrv_pwrite_zeroes(bs->file, offset, s->cluster_size, 0); 199432b6444dSMax Reitz if (ret < 0) { 1995fdfab37dSEric Blake if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) { 199632b6444dSMax Reitz qcow2_free_clusters(bs, offset, s->cluster_size, 199732b6444dSMax Reitz QCOW2_DISCARD_ALWAYS); 1998320c7066SMax Reitz } 199932b6444dSMax Reitz goto fail; 200032b6444dSMax Reitz } 200132b6444dSMax Reitz 2002ecf58777SMax Reitz if (l2_refcount == 1) { 2003415184f5SAlberto Garcia l2_slice[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED); 2004ecf58777SMax Reitz } else { 2005415184f5SAlberto Garcia l2_slice[j] = cpu_to_be64(offset); 2006e390cf5aSMax Reitz } 2007ecf58777SMax Reitz l2_dirty = true; 200832b6444dSMax Reitz } 200932b6444dSMax Reitz 201032b6444dSMax Reitz if (is_active_l1) { 201132b6444dSMax Reitz if (l2_dirty) { 2012415184f5SAlberto Garcia qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice); 201332b6444dSMax Reitz qcow2_cache_depends_on_flush(s->l2_table_cache); 201432b6444dSMax Reitz } 2015415184f5SAlberto Garcia qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); 201632b6444dSMax Reitz } else { 201732b6444dSMax Reitz if (l2_dirty) { 2018226494ffSAlberto Garcia ret = qcow2_pre_write_overlap_check( 2019226494ffSAlberto Garcia bs, QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2, 2020415184f5SAlberto Garcia slice_offset, slice_size2); 202132b6444dSMax Reitz if (ret < 0) { 202232b6444dSMax Reitz goto fail; 202332b6444dSMax Reitz } 202432b6444dSMax Reitz 2025415184f5SAlberto Garcia ret = 
bdrv_pwrite(bs->file, slice_offset, 2026415184f5SAlberto Garcia l2_slice, slice_size2); 202732b6444dSMax Reitz if (ret < 0) { 202832b6444dSMax Reitz goto fail; 202932b6444dSMax Reitz } 203032b6444dSMax Reitz } 203132b6444dSMax Reitz } 2032226494ffSAlberto Garcia } 20334057a2b2SMax Reitz 20344057a2b2SMax Reitz (*visited_l1_entries)++; 20354057a2b2SMax Reitz if (status_cb) { 20368b13976dSMax Reitz status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque); 20374057a2b2SMax Reitz } 203832b6444dSMax Reitz } 203932b6444dSMax Reitz 204032b6444dSMax Reitz ret = 0; 204132b6444dSMax Reitz 204232b6444dSMax Reitz fail: 2043415184f5SAlberto Garcia if (l2_slice) { 204432b6444dSMax Reitz if (!is_active_l1) { 2045415184f5SAlberto Garcia qemu_vfree(l2_slice); 204632b6444dSMax Reitz } else { 2047415184f5SAlberto Garcia qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); 204832b6444dSMax Reitz } 204932b6444dSMax Reitz } 205032b6444dSMax Reitz return ret; 205132b6444dSMax Reitz } 205232b6444dSMax Reitz 205332b6444dSMax Reitz /* 205432b6444dSMax Reitz * For backed images, expands all zero clusters on the image. For non-backed 205532b6444dSMax Reitz * images, deallocates all non-pre-allocated zero clusters (and claims the 205632b6444dSMax Reitz * allocation for pre-allocated ones). This is important for downgrading to a 205732b6444dSMax Reitz * qcow2 version which doesn't yet support metadata zero clusters. 
205832b6444dSMax Reitz */ 20594057a2b2SMax Reitz int qcow2_expand_zero_clusters(BlockDriverState *bs, 20608b13976dSMax Reitz BlockDriverAmendStatusCB *status_cb, 20618b13976dSMax Reitz void *cb_opaque) 206232b6444dSMax Reitz { 2063ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 206432b6444dSMax Reitz uint64_t *l1_table = NULL; 20654057a2b2SMax Reitz int64_t l1_entries = 0, visited_l1_entries = 0; 206632b6444dSMax Reitz int ret; 206732b6444dSMax Reitz int i, j; 206832b6444dSMax Reitz 20694057a2b2SMax Reitz if (status_cb) { 20704057a2b2SMax Reitz l1_entries = s->l1_size; 20714057a2b2SMax Reitz for (i = 0; i < s->nb_snapshots; i++) { 20724057a2b2SMax Reitz l1_entries += s->snapshots[i].l1_size; 20734057a2b2SMax Reitz } 20744057a2b2SMax Reitz } 20754057a2b2SMax Reitz 207632b6444dSMax Reitz ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size, 20774057a2b2SMax Reitz &visited_l1_entries, l1_entries, 20788b13976dSMax Reitz status_cb, cb_opaque); 207932b6444dSMax Reitz if (ret < 0) { 208032b6444dSMax Reitz goto fail; 208132b6444dSMax Reitz } 208232b6444dSMax Reitz 208332b6444dSMax Reitz /* Inactive L1 tables may point to active L2 tables - therefore it is 208432b6444dSMax Reitz * necessary to flush the L2 table cache before trying to access the L2 208532b6444dSMax Reitz * tables pointed to by inactive L1 entries (else we might try to expand 208632b6444dSMax Reitz * zero clusters that have already been expanded); furthermore, it is also 208732b6444dSMax Reitz * necessary to empty the L2 table cache, since it may contain tables which 208832b6444dSMax Reitz * are now going to be modified directly on disk, bypassing the cache. 208932b6444dSMax Reitz * qcow2_cache_empty() does both for us. 
*/ 209032b6444dSMax Reitz ret = qcow2_cache_empty(bs, s->l2_table_cache); 209132b6444dSMax Reitz if (ret < 0) { 209232b6444dSMax Reitz goto fail; 209332b6444dSMax Reitz } 209432b6444dSMax Reitz 209532b6444dSMax Reitz for (i = 0; i < s->nb_snapshots; i++) { 2096c9a442e4SAlberto Garcia int l1_size2; 2097c9a442e4SAlberto Garcia uint64_t *new_l1_table; 2098c9a442e4SAlberto Garcia Error *local_err = NULL; 209932b6444dSMax Reitz 2100c9a442e4SAlberto Garcia ret = qcow2_validate_table(bs, s->snapshots[i].l1_table_offset, 2101c9a442e4SAlberto Garcia s->snapshots[i].l1_size, sizeof(uint64_t), 2102c9a442e4SAlberto Garcia QCOW_MAX_L1_SIZE, "Snapshot L1 table", 2103c9a442e4SAlberto Garcia &local_err); 2104c9a442e4SAlberto Garcia if (ret < 0) { 2105c9a442e4SAlberto Garcia error_report_err(local_err); 2106c9a442e4SAlberto Garcia goto fail; 2107c9a442e4SAlberto Garcia } 2108c9a442e4SAlberto Garcia 2109c9a442e4SAlberto Garcia l1_size2 = s->snapshots[i].l1_size * sizeof(uint64_t); 2110c9a442e4SAlberto Garcia new_l1_table = g_try_realloc(l1_table, l1_size2); 2111de7269d2SAlberto Garcia 2112de7269d2SAlberto Garcia if (!new_l1_table) { 2113de7269d2SAlberto Garcia ret = -ENOMEM; 2114de7269d2SAlberto Garcia goto fail; 2115de7269d2SAlberto Garcia } 2116de7269d2SAlberto Garcia 2117de7269d2SAlberto Garcia l1_table = new_l1_table; 211832b6444dSMax Reitz 2119c9a442e4SAlberto Garcia ret = bdrv_pread(bs->file, s->snapshots[i].l1_table_offset, 2120c9a442e4SAlberto Garcia l1_table, l1_size2); 212132b6444dSMax Reitz if (ret < 0) { 212232b6444dSMax Reitz goto fail; 212332b6444dSMax Reitz } 212432b6444dSMax Reitz 212532b6444dSMax Reitz for (j = 0; j < s->snapshots[i].l1_size; j++) { 212632b6444dSMax Reitz be64_to_cpus(&l1_table[j]); 212732b6444dSMax Reitz } 212832b6444dSMax Reitz 212932b6444dSMax Reitz ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size, 21304057a2b2SMax Reitz &visited_l1_entries, l1_entries, 21318b13976dSMax Reitz status_cb, cb_opaque); 213232b6444dSMax Reitz if 
(ret < 0) { 213332b6444dSMax Reitz goto fail; 213432b6444dSMax Reitz } 213532b6444dSMax Reitz } 213632b6444dSMax Reitz 213732b6444dSMax Reitz ret = 0; 213832b6444dSMax Reitz 213932b6444dSMax Reitz fail: 214032b6444dSMax Reitz g_free(l1_table); 214132b6444dSMax Reitz return ret; 214232b6444dSMax Reitz } 2143