145aba42fSKevin Wolf /* 245aba42fSKevin Wolf * Block driver for the QCOW version 2 format 345aba42fSKevin Wolf * 445aba42fSKevin Wolf * Copyright (c) 2004-2006 Fabrice Bellard 545aba42fSKevin Wolf * 645aba42fSKevin Wolf * Permission is hereby granted, free of charge, to any person obtaining a copy 745aba42fSKevin Wolf * of this software and associated documentation files (the "Software"), to deal 845aba42fSKevin Wolf * in the Software without restriction, including without limitation the rights 945aba42fSKevin Wolf * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 1045aba42fSKevin Wolf * copies of the Software, and to permit persons to whom the Software is 1145aba42fSKevin Wolf * furnished to do so, subject to the following conditions: 1245aba42fSKevin Wolf * 1345aba42fSKevin Wolf * The above copyright notice and this permission notice shall be included in 1445aba42fSKevin Wolf * all copies or substantial portions of the Software. 1545aba42fSKevin Wolf * 1645aba42fSKevin Wolf * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1745aba42fSKevin Wolf * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1845aba42fSKevin Wolf * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 1945aba42fSKevin Wolf * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 2045aba42fSKevin Wolf * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 2145aba42fSKevin Wolf * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 2245aba42fSKevin Wolf * THE SOFTWARE. 
2345aba42fSKevin Wolf */ 2445aba42fSKevin Wolf 2580c71a24SPeter Maydell #include "qemu/osdep.h" 2645aba42fSKevin Wolf #include <zlib.h> 2745aba42fSKevin Wolf 28da34e65cSMarkus Armbruster #include "qapi/error.h" 2945aba42fSKevin Wolf #include "qemu-common.h" 30737e150eSPaolo Bonzini #include "block/block_int.h" 3145aba42fSKevin Wolf #include "block/qcow2.h" 3258369e22SPaolo Bonzini #include "qemu/bswap.h" 333cce16f4SKevin Wolf #include "trace.h" 3445aba42fSKevin Wolf 352cf7cfa1SKevin Wolf int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, 362cf7cfa1SKevin Wolf bool exact_size) 3745aba42fSKevin Wolf { 38ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 392cf7cfa1SKevin Wolf int new_l1_size2, ret, i; 4045aba42fSKevin Wolf uint64_t *new_l1_table; 41fda74f82SMax Reitz int64_t old_l1_table_offset, old_l1_size; 422cf7cfa1SKevin Wolf int64_t new_l1_table_offset, new_l1_size; 4345aba42fSKevin Wolf uint8_t data[12]; 4445aba42fSKevin Wolf 4572893756SStefan Hajnoczi if (min_size <= s->l1_size) 4645aba42fSKevin Wolf return 0; 4772893756SStefan Hajnoczi 48b93f9950SMax Reitz /* Do a sanity check on min_size before trying to calculate new_l1_size 49b93f9950SMax Reitz * (this prevents overflows during the while loop for the calculation of 50b93f9950SMax Reitz * new_l1_size) */ 51b93f9950SMax Reitz if (min_size > INT_MAX / sizeof(uint64_t)) { 52b93f9950SMax Reitz return -EFBIG; 53b93f9950SMax Reitz } 54b93f9950SMax Reitz 5572893756SStefan Hajnoczi if (exact_size) { 5672893756SStefan Hajnoczi new_l1_size = min_size; 5772893756SStefan Hajnoczi } else { 5872893756SStefan Hajnoczi /* Bump size up to reduce the number of times we have to grow */ 5972893756SStefan Hajnoczi new_l1_size = s->l1_size; 60d191d12dSStefan Weil if (new_l1_size == 0) { 61d191d12dSStefan Weil new_l1_size = 1; 62d191d12dSStefan Weil } 6345aba42fSKevin Wolf while (min_size > new_l1_size) { 6445aba42fSKevin Wolf new_l1_size = (new_l1_size * 3 + 1) / 2; 6545aba42fSKevin Wolf } 6672893756SStefan 
Hajnoczi } 6772893756SStefan Hajnoczi 6884c26520SMax Reitz QEMU_BUILD_BUG_ON(QCOW_MAX_L1_SIZE > INT_MAX); 6984c26520SMax Reitz if (new_l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) { 702cf7cfa1SKevin Wolf return -EFBIG; 712cf7cfa1SKevin Wolf } 722cf7cfa1SKevin Wolf 7345aba42fSKevin Wolf #ifdef DEBUG_ALLOC2 742cf7cfa1SKevin Wolf fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n", 752cf7cfa1SKevin Wolf s->l1_size, new_l1_size); 7645aba42fSKevin Wolf #endif 7745aba42fSKevin Wolf 7845aba42fSKevin Wolf new_l1_size2 = sizeof(uint64_t) * new_l1_size; 799a4f4c31SKevin Wolf new_l1_table = qemu_try_blockalign(bs->file->bs, 80de82815dSKevin Wolf align_offset(new_l1_size2, 512)); 81de82815dSKevin Wolf if (new_l1_table == NULL) { 82de82815dSKevin Wolf return -ENOMEM; 83de82815dSKevin Wolf } 84de82815dSKevin Wolf memset(new_l1_table, 0, align_offset(new_l1_size2, 512)); 85de82815dSKevin Wolf 860647d47cSStefan Hajnoczi if (s->l1_size) { 8745aba42fSKevin Wolf memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t)); 880647d47cSStefan Hajnoczi } 8945aba42fSKevin Wolf 9045aba42fSKevin Wolf /* write new table (align to cluster) */ 9166f82ceeSKevin Wolf BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE); 92ed6ccf0fSKevin Wolf new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2); 935d757b56SKevin Wolf if (new_l1_table_offset < 0) { 94de82815dSKevin Wolf qemu_vfree(new_l1_table); 955d757b56SKevin Wolf return new_l1_table_offset; 965d757b56SKevin Wolf } 9729c1a730SKevin Wolf 9829c1a730SKevin Wolf ret = qcow2_cache_flush(bs, s->refcount_block_cache); 9929c1a730SKevin Wolf if (ret < 0) { 10080fa3341SKevin Wolf goto fail; 10129c1a730SKevin Wolf } 10245aba42fSKevin Wolf 103cf93980eSMax Reitz /* the L1 position has not yet been updated, so these clusters must 104cf93980eSMax Reitz * indeed be completely free */ 105231bb267SMax Reitz ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset, 106231bb267SMax Reitz new_l1_size2); 107cf93980eSMax Reitz if (ret < 0) { 
108cf93980eSMax Reitz goto fail; 109cf93980eSMax Reitz } 110cf93980eSMax Reitz 11166f82ceeSKevin Wolf BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE); 11245aba42fSKevin Wolf for(i = 0; i < s->l1_size; i++) 11345aba42fSKevin Wolf new_l1_table[i] = cpu_to_be64(new_l1_table[i]); 114d9ca2ea2SKevin Wolf ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset, 1159a4f4c31SKevin Wolf new_l1_table, new_l1_size2); 1168b3b7206SKevin Wolf if (ret < 0) 11745aba42fSKevin Wolf goto fail; 11845aba42fSKevin Wolf for(i = 0; i < s->l1_size; i++) 11945aba42fSKevin Wolf new_l1_table[i] = be64_to_cpu(new_l1_table[i]); 12045aba42fSKevin Wolf 12145aba42fSKevin Wolf /* set new table */ 12266f82ceeSKevin Wolf BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE); 123f1f7a1ddSPeter Maydell stl_be_p(data, new_l1_size); 124e4ef9f46SPeter Maydell stq_be_p(data + 4, new_l1_table_offset); 125d9ca2ea2SKevin Wolf ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), 1269a4f4c31SKevin Wolf data, sizeof(data)); 1278b3b7206SKevin Wolf if (ret < 0) { 12845aba42fSKevin Wolf goto fail; 129fb8fa77cSKevin Wolf } 130de82815dSKevin Wolf qemu_vfree(s->l1_table); 131fda74f82SMax Reitz old_l1_table_offset = s->l1_table_offset; 13245aba42fSKevin Wolf s->l1_table_offset = new_l1_table_offset; 13345aba42fSKevin Wolf s->l1_table = new_l1_table; 134fda74f82SMax Reitz old_l1_size = s->l1_size; 13545aba42fSKevin Wolf s->l1_size = new_l1_size; 136fda74f82SMax Reitz qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * sizeof(uint64_t), 137fda74f82SMax Reitz QCOW2_DISCARD_OTHER); 13845aba42fSKevin Wolf return 0; 13945aba42fSKevin Wolf fail: 140de82815dSKevin Wolf qemu_vfree(new_l1_table); 1416cfcb9b8SKevin Wolf qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2, 1426cfcb9b8SKevin Wolf QCOW2_DISCARD_OTHER); 1438b3b7206SKevin Wolf return ret; 14445aba42fSKevin Wolf } 14545aba42fSKevin Wolf 14645aba42fSKevin Wolf /* 14745aba42fSKevin Wolf * l2_load 14845aba42fSKevin Wolf * 14945aba42fSKevin Wolf * 
Loads a L2 table into memory. If the table is in the cache, the cache 15045aba42fSKevin Wolf * is used; otherwise the L2 table is loaded from the image file. 15145aba42fSKevin Wolf * 15245aba42fSKevin Wolf * Returns a pointer to the L2 table on success, or NULL if the read from 15345aba42fSKevin Wolf * the image file failed. 15445aba42fSKevin Wolf */ 15545aba42fSKevin Wolf 15655c17e98SKevin Wolf static int l2_load(BlockDriverState *bs, uint64_t l2_offset, 15755c17e98SKevin Wolf uint64_t **l2_table) 15845aba42fSKevin Wolf { 159ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 16045aba42fSKevin Wolf 1619be38598SEduardo Habkost return qcow2_cache_get(bs, s->l2_table_cache, l2_offset, 1629be38598SEduardo Habkost (void **)l2_table); 16355c17e98SKevin Wolf } 16455c17e98SKevin Wolf 16545aba42fSKevin Wolf /* 1666583e3c7SKevin Wolf * Writes one sector of the L1 table to the disk (can't update single entries 1676583e3c7SKevin Wolf * and we really don't want bdrv_pread to perform a read-modify-write) 1686583e3c7SKevin Wolf */ 1696583e3c7SKevin Wolf #define L1_ENTRIES_PER_SECTOR (512 / 8) 170e23e400eSMax Reitz int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index) 1716583e3c7SKevin Wolf { 172ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 173a1391444SMax Reitz uint64_t buf[L1_ENTRIES_PER_SECTOR] = { 0 }; 1746583e3c7SKevin Wolf int l1_start_index; 175f7defcb6SKevin Wolf int i, ret; 1766583e3c7SKevin Wolf 1776583e3c7SKevin Wolf l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1); 178a1391444SMax Reitz for (i = 0; i < L1_ENTRIES_PER_SECTOR && l1_start_index + i < s->l1_size; 179a1391444SMax Reitz i++) 180a1391444SMax Reitz { 1816583e3c7SKevin Wolf buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]); 1826583e3c7SKevin Wolf } 1836583e3c7SKevin Wolf 184231bb267SMax Reitz ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1, 185cf93980eSMax Reitz s->l1_table_offset + 8 * l1_start_index, sizeof(buf)); 186cf93980eSMax Reitz if (ret < 0) { 187cf93980eSMax Reitz 
return ret; 188cf93980eSMax Reitz } 189cf93980eSMax Reitz 19066f82ceeSKevin Wolf BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); 191d9ca2ea2SKevin Wolf ret = bdrv_pwrite_sync(bs->file, 1929a4f4c31SKevin Wolf s->l1_table_offset + 8 * l1_start_index, 193f7defcb6SKevin Wolf buf, sizeof(buf)); 194f7defcb6SKevin Wolf if (ret < 0) { 195f7defcb6SKevin Wolf return ret; 1966583e3c7SKevin Wolf } 1976583e3c7SKevin Wolf 1986583e3c7SKevin Wolf return 0; 1996583e3c7SKevin Wolf } 2006583e3c7SKevin Wolf 2016583e3c7SKevin Wolf /* 20245aba42fSKevin Wolf * l2_allocate 20345aba42fSKevin Wolf * 20445aba42fSKevin Wolf * Allocate a new l2 entry in the file. If l1_index points to an already 20545aba42fSKevin Wolf * used entry in the L2 table (i.e. we are doing a copy on write for the L2 20645aba42fSKevin Wolf * table) copy the contents of the old L2 table into the newly allocated one. 20745aba42fSKevin Wolf * Otherwise the new table is initialized with zeros. 20845aba42fSKevin Wolf * 20945aba42fSKevin Wolf */ 21045aba42fSKevin Wolf 211c46e1167SKevin Wolf static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) 21245aba42fSKevin Wolf { 213ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 2146583e3c7SKevin Wolf uint64_t old_l2_offset; 2158585afd8SMax Reitz uint64_t *l2_table = NULL; 216f4f0d391SKevin Wolf int64_t l2_offset; 217c46e1167SKevin Wolf int ret; 21845aba42fSKevin Wolf 21945aba42fSKevin Wolf old_l2_offset = s->l1_table[l1_index]; 22045aba42fSKevin Wolf 2213cce16f4SKevin Wolf trace_qcow2_l2_allocate(bs, l1_index); 2223cce16f4SKevin Wolf 22345aba42fSKevin Wolf /* allocate a new l2 entry */ 22445aba42fSKevin Wolf 225ed6ccf0fSKevin Wolf l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t)); 2265d757b56SKevin Wolf if (l2_offset < 0) { 227be0b742eSMax Reitz ret = l2_offset; 228be0b742eSMax Reitz goto fail; 2295d757b56SKevin Wolf } 23029c1a730SKevin Wolf 23129c1a730SKevin Wolf ret = qcow2_cache_flush(bs, s->refcount_block_cache); 23229c1a730SKevin Wolf if 
(ret < 0) { 23329c1a730SKevin Wolf goto fail; 23429c1a730SKevin Wolf } 23545aba42fSKevin Wolf 23645aba42fSKevin Wolf /* allocate a new entry in the l2 cache */ 23745aba42fSKevin Wolf 2383cce16f4SKevin Wolf trace_qcow2_l2_allocate_get_empty(bs, l1_index); 23929c1a730SKevin Wolf ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table); 24029c1a730SKevin Wolf if (ret < 0) { 241be0b742eSMax Reitz goto fail; 24229c1a730SKevin Wolf } 24329c1a730SKevin Wolf 24429c1a730SKevin Wolf l2_table = *table; 24545aba42fSKevin Wolf 2468e37f681SKevin Wolf if ((old_l2_offset & L1E_OFFSET_MASK) == 0) { 24745aba42fSKevin Wolf /* if there was no old l2 table, clear the new table */ 24845aba42fSKevin Wolf memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); 24945aba42fSKevin Wolf } else { 25029c1a730SKevin Wolf uint64_t* old_table; 25129c1a730SKevin Wolf 25245aba42fSKevin Wolf /* if there was an old l2 table, read it from the disk */ 25366f82ceeSKevin Wolf BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ); 2548e37f681SKevin Wolf ret = qcow2_cache_get(bs, s->l2_table_cache, 2558e37f681SKevin Wolf old_l2_offset & L1E_OFFSET_MASK, 25629c1a730SKevin Wolf (void**) &old_table); 25729c1a730SKevin Wolf if (ret < 0) { 25829c1a730SKevin Wolf goto fail; 25929c1a730SKevin Wolf } 26029c1a730SKevin Wolf 26129c1a730SKevin Wolf memcpy(l2_table, old_table, s->cluster_size); 26229c1a730SKevin Wolf 263a3f1afb4SAlberto Garcia qcow2_cache_put(bs, s->l2_table_cache, (void **) &old_table); 26445aba42fSKevin Wolf } 26529c1a730SKevin Wolf 26645aba42fSKevin Wolf /* write the l2 table to the file */ 26766f82ceeSKevin Wolf BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE); 26829c1a730SKevin Wolf 2693cce16f4SKevin Wolf trace_qcow2_l2_allocate_write_l2(bs, l1_index); 27072e80b89SAlberto Garcia qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); 27129c1a730SKevin Wolf ret = qcow2_cache_flush(bs, s->l2_table_cache); 272c46e1167SKevin Wolf if (ret < 0) { 273175e1152SKevin Wolf goto fail; 
274175e1152SKevin Wolf } 275175e1152SKevin Wolf 276175e1152SKevin Wolf /* update the L1 entry */ 2773cce16f4SKevin Wolf trace_qcow2_l2_allocate_write_l1(bs, l1_index); 278175e1152SKevin Wolf s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED; 279e23e400eSMax Reitz ret = qcow2_write_l1_entry(bs, l1_index); 280175e1152SKevin Wolf if (ret < 0) { 281175e1152SKevin Wolf goto fail; 282c46e1167SKevin Wolf } 28345aba42fSKevin Wolf 284c46e1167SKevin Wolf *table = l2_table; 2853cce16f4SKevin Wolf trace_qcow2_l2_allocate_done(bs, l1_index, 0); 286c46e1167SKevin Wolf return 0; 287175e1152SKevin Wolf 288175e1152SKevin Wolf fail: 2893cce16f4SKevin Wolf trace_qcow2_l2_allocate_done(bs, l1_index, ret); 2908585afd8SMax Reitz if (l2_table != NULL) { 29129c1a730SKevin Wolf qcow2_cache_put(bs, s->l2_table_cache, (void**) table); 2928585afd8SMax Reitz } 29368dba0bfSKevin Wolf s->l1_table[l1_index] = old_l2_offset; 294e3b21ef9SMax Reitz if (l2_offset > 0) { 295e3b21ef9SMax Reitz qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t), 296e3b21ef9SMax Reitz QCOW2_DISCARD_ALWAYS); 297e3b21ef9SMax Reitz } 298175e1152SKevin Wolf return ret; 29945aba42fSKevin Wolf } 30045aba42fSKevin Wolf 3012bfcc4a0SKevin Wolf /* 3022bfcc4a0SKevin Wolf * Checks how many clusters in a given L2 table are contiguous in the image 3032bfcc4a0SKevin Wolf * file. As soon as one of the flags in the bitmask stop_flags changes compared 3042bfcc4a0SKevin Wolf * to the first cluster, the search is stopped and the cluster is not counted 3052bfcc4a0SKevin Wolf * as contiguous. 
(This allows it, for example, to stop at the first compressed 3062bfcc4a0SKevin Wolf * cluster which may require a different handling) 3072bfcc4a0SKevin Wolf */ 308b6d36defSMax Reitz static int count_contiguous_clusters(int nb_clusters, int cluster_size, 30961653008SKevin Wolf uint64_t *l2_table, uint64_t stop_flags) 31045aba42fSKevin Wolf { 31145aba42fSKevin Wolf int i; 312564a6b69SMax Reitz int first_cluster_type; 31378a52ad5SPeter Lieven uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED; 31415684a47SMax Reitz uint64_t first_entry = be64_to_cpu(l2_table[0]); 31515684a47SMax Reitz uint64_t offset = first_entry & mask; 31645aba42fSKevin Wolf 317564a6b69SMax Reitz if (!offset) { 31845aba42fSKevin Wolf return 0; 319564a6b69SMax Reitz } 32045aba42fSKevin Wolf 321564a6b69SMax Reitz /* must be allocated */ 322564a6b69SMax Reitz first_cluster_type = qcow2_get_cluster_type(first_entry); 323564a6b69SMax Reitz assert(first_cluster_type == QCOW2_CLUSTER_NORMAL || 324564a6b69SMax Reitz (first_cluster_type == QCOW2_CLUSTER_ZERO && 325564a6b69SMax Reitz (first_entry & L2E_OFFSET_MASK) != 0)); 32615684a47SMax Reitz 32761653008SKevin Wolf for (i = 0; i < nb_clusters; i++) { 3282bfcc4a0SKevin Wolf uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask; 3292bfcc4a0SKevin Wolf if (offset + (uint64_t) i * cluster_size != l2_entry) { 33045aba42fSKevin Wolf break; 3312bfcc4a0SKevin Wolf } 3322bfcc4a0SKevin Wolf } 33345aba42fSKevin Wolf 33461653008SKevin Wolf return i; 33545aba42fSKevin Wolf } 33645aba42fSKevin Wolf 337*4341df8aSEric Blake /* 338*4341df8aSEric Blake * Checks how many consecutive unallocated clusters in a given L2 339*4341df8aSEric Blake * table have the same cluster type. 
340*4341df8aSEric Blake */ 341*4341df8aSEric Blake static int count_contiguous_clusters_unallocated(int nb_clusters, 342a99dfb45SKevin Wolf uint64_t *l2_table, 343a99dfb45SKevin Wolf int wanted_type) 34445aba42fSKevin Wolf { 3452bfcc4a0SKevin Wolf int i; 34645aba42fSKevin Wolf 347*4341df8aSEric Blake assert(wanted_type == QCOW2_CLUSTER_ZERO || 348*4341df8aSEric Blake wanted_type == QCOW2_CLUSTER_UNALLOCATED); 3492bfcc4a0SKevin Wolf for (i = 0; i < nb_clusters; i++) { 350*4341df8aSEric Blake uint64_t entry = be64_to_cpu(l2_table[i]); 351*4341df8aSEric Blake int type = qcow2_get_cluster_type(entry); 3522bfcc4a0SKevin Wolf 353*4341df8aSEric Blake if (type != wanted_type || entry & L2E_OFFSET_MASK) { 3542bfcc4a0SKevin Wolf break; 3552bfcc4a0SKevin Wolf } 3562bfcc4a0SKevin Wolf } 35745aba42fSKevin Wolf 35845aba42fSKevin Wolf return i; 35945aba42fSKevin Wolf } 36045aba42fSKevin Wolf 36145aba42fSKevin Wolf /* The crypt function is compatible with the linux cryptoloop 36245aba42fSKevin Wolf algorithm for < 4 GB images. NOTE: out_buf == in_buf is 36345aba42fSKevin Wolf supported */ 364ff99129aSKevin Wolf int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num, 36545aba42fSKevin Wolf uint8_t *out_buf, const uint8_t *in_buf, 366f6fa64f6SDaniel P. Berrange int nb_sectors, bool enc, 367f6fa64f6SDaniel P. Berrange Error **errp) 36845aba42fSKevin Wolf { 36945aba42fSKevin Wolf union { 37045aba42fSKevin Wolf uint64_t ll[2]; 37145aba42fSKevin Wolf uint8_t b[16]; 37245aba42fSKevin Wolf } ivec; 37345aba42fSKevin Wolf int i; 374f6fa64f6SDaniel P. Berrange int ret; 37545aba42fSKevin Wolf 37645aba42fSKevin Wolf for(i = 0; i < nb_sectors; i++) { 37745aba42fSKevin Wolf ivec.ll[0] = cpu_to_le64(sector_num); 37845aba42fSKevin Wolf ivec.ll[1] = 0; 379f6fa64f6SDaniel P. Berrange if (qcrypto_cipher_setiv(s->cipher, 380f6fa64f6SDaniel P. Berrange ivec.b, G_N_ELEMENTS(ivec.b), 381f6fa64f6SDaniel P. Berrange errp) < 0) { 382f6fa64f6SDaniel P. Berrange return -1; 383f6fa64f6SDaniel P. 
Berrange } 384f6fa64f6SDaniel P. Berrange if (enc) { 385f6fa64f6SDaniel P. Berrange ret = qcrypto_cipher_encrypt(s->cipher, 386f6fa64f6SDaniel P. Berrange in_buf, 387f6fa64f6SDaniel P. Berrange out_buf, 388f6fa64f6SDaniel P. Berrange 512, 389f6fa64f6SDaniel P. Berrange errp); 390f6fa64f6SDaniel P. Berrange } else { 391f6fa64f6SDaniel P. Berrange ret = qcrypto_cipher_decrypt(s->cipher, 392f6fa64f6SDaniel P. Berrange in_buf, 393f6fa64f6SDaniel P. Berrange out_buf, 394f6fa64f6SDaniel P. Berrange 512, 395f6fa64f6SDaniel P. Berrange errp); 396f6fa64f6SDaniel P. Berrange } 397f6fa64f6SDaniel P. Berrange if (ret < 0) { 398f6fa64f6SDaniel P. Berrange return -1; 399f6fa64f6SDaniel P. Berrange } 40045aba42fSKevin Wolf sector_num++; 40145aba42fSKevin Wolf in_buf += 512; 40245aba42fSKevin Wolf out_buf += 512; 40345aba42fSKevin Wolf } 404f6fa64f6SDaniel P. Berrange return 0; 40545aba42fSKevin Wolf } 40645aba42fSKevin Wolf 407aaa4d20bSKevin Wolf static int coroutine_fn do_perform_cow(BlockDriverState *bs, 408aaa4d20bSKevin Wolf uint64_t src_cluster_offset, 409aef4acb6SStefan Hajnoczi uint64_t cluster_offset, 410aaa4d20bSKevin Wolf int offset_in_cluster, 411aaa4d20bSKevin Wolf int bytes) 41245aba42fSKevin Wolf { 413ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 414aef4acb6SStefan Hajnoczi QEMUIOVector qiov; 415aef4acb6SStefan Hajnoczi struct iovec iov; 416aaa4d20bSKevin Wolf int ret; 4171b9f1491SKevin Wolf 418aaa4d20bSKevin Wolf iov.iov_len = bytes; 419de82815dSKevin Wolf iov.iov_base = qemu_try_blockalign(bs, iov.iov_len); 420de82815dSKevin Wolf if (iov.iov_base == NULL) { 421de82815dSKevin Wolf return -ENOMEM; 422de82815dSKevin Wolf } 423aef4acb6SStefan Hajnoczi 424aef4acb6SStefan Hajnoczi qemu_iovec_init_external(&qiov, &iov, 1); 4251b9f1491SKevin Wolf 42666f82ceeSKevin Wolf BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); 427aef4acb6SStefan Hajnoczi 428dba28555SMax Reitz if (!bs->drv) { 429bd604369SKevin Wolf ret = -ENOMEDIUM; 430bd604369SKevin Wolf goto out; 431dba28555SMax 
Reitz } 432dba28555SMax Reitz 433aef4acb6SStefan Hajnoczi /* Call .bdrv_co_readv() directly instead of using the public block-layer 434aef4acb6SStefan Hajnoczi * interface. This avoids double I/O throttling and request tracking, 435aef4acb6SStefan Hajnoczi * which can lead to deadlock when block layer copy-on-read is enabled. 436aef4acb6SStefan Hajnoczi */ 437aaa4d20bSKevin Wolf ret = bs->drv->bdrv_co_preadv(bs, src_cluster_offset + offset_in_cluster, 438aaa4d20bSKevin Wolf bytes, &qiov, 0); 4391b9f1491SKevin Wolf if (ret < 0) { 4401b9f1491SKevin Wolf goto out; 4411b9f1491SKevin Wolf } 4421b9f1491SKevin Wolf 4438336aafaSDaniel P. Berrange if (bs->encrypted) { 444f6fa64f6SDaniel P. Berrange Error *err = NULL; 445bb9f8dd0SDaniel P. Berrange int64_t sector = (src_cluster_offset + offset_in_cluster) 446aaa4d20bSKevin Wolf >> BDRV_SECTOR_BITS; 447f6fa64f6SDaniel P. Berrange assert(s->cipher); 448aaa4d20bSKevin Wolf assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0); 449aaa4d20bSKevin Wolf assert((bytes & ~BDRV_SECTOR_MASK) == 0); 450aaa4d20bSKevin Wolf if (qcow2_encrypt_sectors(s, sector, iov.iov_base, iov.iov_base, 451aaa4d20bSKevin Wolf bytes >> BDRV_SECTOR_BITS, true, &err) < 0) { 452f6fa64f6SDaniel P. Berrange ret = -EIO; 453f6fa64f6SDaniel P. Berrange error_free(err); 454f6fa64f6SDaniel P. Berrange goto out; 455f6fa64f6SDaniel P. 
Berrange } 45645aba42fSKevin Wolf } 4571b9f1491SKevin Wolf 458231bb267SMax Reitz ret = qcow2_pre_write_overlap_check(bs, 0, 459aaa4d20bSKevin Wolf cluster_offset + offset_in_cluster, bytes); 460cf93980eSMax Reitz if (ret < 0) { 461cf93980eSMax Reitz goto out; 462cf93980eSMax Reitz } 463cf93980eSMax Reitz 46466f82ceeSKevin Wolf BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); 465a03ef88fSKevin Wolf ret = bdrv_co_pwritev(bs->file, cluster_offset + offset_in_cluster, 466aaa4d20bSKevin Wolf bytes, &qiov, 0); 4671b9f1491SKevin Wolf if (ret < 0) { 4681b9f1491SKevin Wolf goto out; 4691b9f1491SKevin Wolf } 4701b9f1491SKevin Wolf 4711b9f1491SKevin Wolf ret = 0; 4721b9f1491SKevin Wolf out: 473aef4acb6SStefan Hajnoczi qemu_vfree(iov.iov_base); 47445aba42fSKevin Wolf return ret; 47545aba42fSKevin Wolf } 47645aba42fSKevin Wolf 47745aba42fSKevin Wolf 47845aba42fSKevin Wolf /* 47945aba42fSKevin Wolf * get_cluster_offset 48045aba42fSKevin Wolf * 481ecfe1863SKevin Wolf * For a given offset of the virtual disk, find the cluster type and offset in 482ecfe1863SKevin Wolf * the qcow2 file. The offset is stored in *cluster_offset. 48345aba42fSKevin Wolf * 484ecfe1863SKevin Wolf * On entry, *bytes is the maximum number of contiguous bytes starting at 485ecfe1863SKevin Wolf * offset that we are interested in. 48645aba42fSKevin Wolf * 487ecfe1863SKevin Wolf * On exit, *bytes is the number of bytes starting at offset that have the same 488ecfe1863SKevin Wolf * cluster type and (if applicable) are stored contiguously in the image file. 489ecfe1863SKevin Wolf * Compressed clusters are always returned one by one. 49045aba42fSKevin Wolf * 49168d000a3SKevin Wolf * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error 49268d000a3SKevin Wolf * cases. 
49345aba42fSKevin Wolf */ 4941c46efaaSKevin Wolf int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, 495ecfe1863SKevin Wolf unsigned int *bytes, uint64_t *cluster_offset) 49645aba42fSKevin Wolf { 497ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 4982cf7cfa1SKevin Wolf unsigned int l2_index; 4992cf7cfa1SKevin Wolf uint64_t l1_index, l2_offset, *l2_table; 50045aba42fSKevin Wolf int l1_bits, c; 501c834cba9SMax Reitz unsigned int offset_in_cluster; 502c834cba9SMax Reitz uint64_t bytes_available, bytes_needed, nb_clusters; 50355c17e98SKevin Wolf int ret; 504b2f65d6bSKevin Wolf 505b2f65d6bSKevin Wolf offset_in_cluster = offset_into_cluster(s, offset); 506ecfe1863SKevin Wolf bytes_needed = (uint64_t) *bytes + offset_in_cluster; 50745aba42fSKevin Wolf 50845aba42fSKevin Wolf l1_bits = s->l2_bits + s->cluster_bits; 50945aba42fSKevin Wolf 510b2f65d6bSKevin Wolf /* compute how many bytes there are between the start of the cluster 511b2f65d6bSKevin Wolf * containing offset and the end of the l1 entry */ 512b2f65d6bSKevin Wolf bytes_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1)) 513b2f65d6bSKevin Wolf + offset_in_cluster; 51445aba42fSKevin Wolf 515b2f65d6bSKevin Wolf if (bytes_needed > bytes_available) { 516b2f65d6bSKevin Wolf bytes_needed = bytes_available; 51745aba42fSKevin Wolf } 51845aba42fSKevin Wolf 5191c46efaaSKevin Wolf *cluster_offset = 0; 52045aba42fSKevin Wolf 521b6af0975SDaniel P. 
Berrange /* seek to the l2 offset in the l1 table */ 52245aba42fSKevin Wolf 52345aba42fSKevin Wolf l1_index = offset >> l1_bits; 52468d000a3SKevin Wolf if (l1_index >= s->l1_size) { 52568d000a3SKevin Wolf ret = QCOW2_CLUSTER_UNALLOCATED; 52645aba42fSKevin Wolf goto out; 52768d000a3SKevin Wolf } 52845aba42fSKevin Wolf 52968d000a3SKevin Wolf l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; 53068d000a3SKevin Wolf if (!l2_offset) { 53168d000a3SKevin Wolf ret = QCOW2_CLUSTER_UNALLOCATED; 53245aba42fSKevin Wolf goto out; 53368d000a3SKevin Wolf } 53445aba42fSKevin Wolf 535a97c67eeSMax Reitz if (offset_into_cluster(s, l2_offset)) { 536a97c67eeSMax Reitz qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64 537a97c67eeSMax Reitz " unaligned (L1 index: %#" PRIx64 ")", 538a97c67eeSMax Reitz l2_offset, l1_index); 539a97c67eeSMax Reitz return -EIO; 540a97c67eeSMax Reitz } 541a97c67eeSMax Reitz 54245aba42fSKevin Wolf /* load the l2 table in memory */ 54345aba42fSKevin Wolf 54455c17e98SKevin Wolf ret = l2_load(bs, l2_offset, &l2_table); 54555c17e98SKevin Wolf if (ret < 0) { 54655c17e98SKevin Wolf return ret; 5471c46efaaSKevin Wolf } 54845aba42fSKevin Wolf 54945aba42fSKevin Wolf /* find the cluster offset for the given disk offset */ 55045aba42fSKevin Wolf 55145aba42fSKevin Wolf l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); 5521c46efaaSKevin Wolf *cluster_offset = be64_to_cpu(l2_table[l2_index]); 553b6d36defSMax Reitz 554b2f65d6bSKevin Wolf nb_clusters = size_to_clusters(s, bytes_needed); 555c834cba9SMax Reitz /* bytes_needed <= *bytes + offset_in_cluster, both of which are unsigned 556c834cba9SMax Reitz * integers; the minimum cluster size is 512, so this assertion is always 557c834cba9SMax Reitz * true */ 558c834cba9SMax Reitz assert(nb_clusters <= INT_MAX); 55945aba42fSKevin Wolf 56068d000a3SKevin Wolf ret = qcow2_get_cluster_type(*cluster_offset); 56168d000a3SKevin Wolf switch (ret) { 56268d000a3SKevin Wolf case QCOW2_CLUSTER_COMPRESSED: 
56368d000a3SKevin Wolf /* Compressed clusters can only be processed one by one */ 56468d000a3SKevin Wolf c = 1; 56568d000a3SKevin Wolf *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK; 56668d000a3SKevin Wolf break; 5676377af48SKevin Wolf case QCOW2_CLUSTER_ZERO: 568381b487dSPaolo Bonzini if (s->qcow_version < 3) { 569a97c67eeSMax Reitz qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found" 570a97c67eeSMax Reitz " in pre-v3 image (L2 offset: %#" PRIx64 571a97c67eeSMax Reitz ", L2 index: %#x)", l2_offset, l2_index); 572a97c67eeSMax Reitz ret = -EIO; 573a97c67eeSMax Reitz goto fail; 574381b487dSPaolo Bonzini } 575*4341df8aSEric Blake /* Distinguish between pure zero clusters and pre-allocated ones */ 576*4341df8aSEric Blake if (*cluster_offset & L2E_OFFSET_MASK) { 577*4341df8aSEric Blake c = count_contiguous_clusters(nb_clusters, s->cluster_size, 578*4341df8aSEric Blake &l2_table[l2_index], QCOW_OFLAG_ZERO); 579*4341df8aSEric Blake *cluster_offset &= L2E_OFFSET_MASK; 580*4341df8aSEric Blake if (offset_into_cluster(s, *cluster_offset)) { 581*4341df8aSEric Blake qcow2_signal_corruption(bs, true, -1, -1, 582*4341df8aSEric Blake "Preallocated zero cluster offset %#" 583*4341df8aSEric Blake PRIx64 " unaligned (L2 offset: %#" 584*4341df8aSEric Blake PRIx64 ", L2 index: %#x)", 585*4341df8aSEric Blake *cluster_offset, l2_offset, l2_index); 586*4341df8aSEric Blake ret = -EIO; 587*4341df8aSEric Blake goto fail; 588*4341df8aSEric Blake } 589*4341df8aSEric Blake } else { 590*4341df8aSEric Blake c = count_contiguous_clusters_unallocated(nb_clusters, 591*4341df8aSEric Blake &l2_table[l2_index], 592a99dfb45SKevin Wolf QCOW2_CLUSTER_ZERO); 5936377af48SKevin Wolf *cluster_offset = 0; 594*4341df8aSEric Blake } 5956377af48SKevin Wolf break; 59668d000a3SKevin Wolf case QCOW2_CLUSTER_UNALLOCATED: 59745aba42fSKevin Wolf /* how many empty clusters ? 
*/ 598*4341df8aSEric Blake c = count_contiguous_clusters_unallocated(nb_clusters, 599*4341df8aSEric Blake &l2_table[l2_index], 600a99dfb45SKevin Wolf QCOW2_CLUSTER_UNALLOCATED); 60168d000a3SKevin Wolf *cluster_offset = 0; 60268d000a3SKevin Wolf break; 60368d000a3SKevin Wolf case QCOW2_CLUSTER_NORMAL: 60445aba42fSKevin Wolf /* how many allocated clusters ? */ 60545aba42fSKevin Wolf c = count_contiguous_clusters(nb_clusters, s->cluster_size, 60661653008SKevin Wolf &l2_table[l2_index], QCOW_OFLAG_ZERO); 60768d000a3SKevin Wolf *cluster_offset &= L2E_OFFSET_MASK; 608a97c67eeSMax Reitz if (offset_into_cluster(s, *cluster_offset)) { 609a97c67eeSMax Reitz qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset %#" 610a97c67eeSMax Reitz PRIx64 " unaligned (L2 offset: %#" PRIx64 611a97c67eeSMax Reitz ", L2 index: %#x)", *cluster_offset, 612a97c67eeSMax Reitz l2_offset, l2_index); 613a97c67eeSMax Reitz ret = -EIO; 614a97c67eeSMax Reitz goto fail; 615a97c67eeSMax Reitz } 61668d000a3SKevin Wolf break; 6171417d7e4SKevin Wolf default: 6181417d7e4SKevin Wolf abort(); 61945aba42fSKevin Wolf } 62045aba42fSKevin Wolf 62129c1a730SKevin Wolf qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); 62229c1a730SKevin Wolf 623c834cba9SMax Reitz bytes_available = (int64_t)c * s->cluster_size; 62468d000a3SKevin Wolf 62545aba42fSKevin Wolf out: 626b2f65d6bSKevin Wolf if (bytes_available > bytes_needed) { 627b2f65d6bSKevin Wolf bytes_available = bytes_needed; 628b2f65d6bSKevin Wolf } 62945aba42fSKevin Wolf 630c834cba9SMax Reitz /* bytes_available <= bytes_needed <= *bytes + offset_in_cluster; 631c834cba9SMax Reitz * subtracting offset_in_cluster will therefore definitely yield something 632c834cba9SMax Reitz * not exceeding UINT_MAX */ 633c834cba9SMax Reitz assert(bytes_available - offset_in_cluster <= UINT_MAX); 634ecfe1863SKevin Wolf *bytes = bytes_available - offset_in_cluster; 63545aba42fSKevin Wolf 63668d000a3SKevin Wolf return ret; 637a97c67eeSMax Reitz 638a97c67eeSMax Reitz 
fail:
    qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
    return ret;
}

/*
 * get_cluster_table
 *
 * For a given disk offset, load (and allocate if needed) the L2 table.
 *
 * The L2 table (as a cached, writable mapping) and the cluster index within
 * that table are returned to the caller through *new_l2_table and
 * *new_l2_index.  The caller holds a reference on the cached table and must
 * release it with qcow2_cache_put() when done (all callers in this file do).
 *
 * Returns 0 on success, -errno in failure case
 */
static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
                             uint64_t **new_l2_table,
                             int *new_l2_index)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int l2_index;
    uint64_t l1_index, l2_offset;
    uint64_t *l2_table = NULL;
    int ret;

    /* seek to the l2 offset in the l1 table */

    l1_index = offset >> (s->l2_bits + s->cluster_bits);
    if (l1_index >= s->l1_size) {
        /* Grow the L1 table so that it covers this guest offset */
        ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
        if (ret < 0) {
            return ret;
        }
    }

    assert(l1_index < s->l1_size);
    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
    /* An unaligned L2 table offset means on-disk metadata corruption */
    if (offset_into_cluster(s, l2_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
                                " unaligned (L1 index: %#" PRIx64 ")",
                                l2_offset, l1_index);
        return -EIO;
    }

    /* seek the l2 table of the given l2 offset */

    if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) {
        /* QCOW_OFLAG_COPIED set: the existing L2 table may be modified in
         * place, so just load the l2 table in memory */
        ret = l2_load(bs, l2_offset, &l2_table);
        if (ret < 0) {
            return ret;
        }
    } else {
        /* First allocate a new L2 table (and do COW if needed) */
        ret = l2_allocate(bs, l1_index, &l2_table);
        if (ret < 0) {
            return ret;
        }

        /* Then decrease the refcount of the old table */
        if (l2_offset) {
            qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
                                QCOW2_DISCARD_OTHER);
        }
    }

    /* find the cluster offset for the given disk offset */

    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);

    *new_l2_table = l2_table;
    *new_l2_index = l2_index;

    return 0;
}

/*
 * alloc_compressed_cluster_offset
 *
 * For a given offset of the disk image, return cluster offset in
 * qcow2 file.
 *
 * If the offset is not found, allocate a new compressed cluster.
 *
 * Return the cluster offset if successful,
 * Return 0, otherwise.
 *
 */

uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
                                               uint64_t offset,
                                               int compressed_size)
{
    BDRVQcow2State *s = bs->opaque;
    int l2_index, ret;
    uint64_t *l2_table;
    int64_t cluster_offset;
    int nb_csectors;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
    if (ret < 0) {
        return 0;
    }

    /* Compression can't overwrite anything. Fail if the cluster was already
     * allocated.
*/
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
    if (cluster_offset & L2E_OFFSET_MASK) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
        return 0;
    }

    cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
    if (cluster_offset < 0) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
        return 0;
    }

    /* Size of the compressed payload in 512-byte sectors, counted from the
     * sector containing cluster_offset (the allocation is byte-granular) */
    nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
                  (cluster_offset >> 9);

    cluster_offset |= QCOW_OFLAG_COMPRESSED |
                      ((uint64_t)nb_csectors << s->csize_shift);

    /* update L2 table */

    /* compressed clusters never have the copied flag */

    BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
    l2_table[l2_index] = cpu_to_be64(cluster_offset);
    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    return cluster_offset;
}

/*
 * Perform the data copy for one COW region (r) of the in-flight allocation
 * described by m.  A region with nb_bytes == 0 is a no-op.
 *
 * The qcow2 state lock is dropped around do_perform_cow() so that other
 * coroutines can run while the copy is in progress, and re-acquired
 * afterwards.
 *
 * Returns 0 on success, -errno on failure.
 */
static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    if (r->nb_bytes == 0) {
        return 0;
    }

    qemu_co_mutex_unlock(&s->lock);
    ret = do_perform_cow(bs, m->offset, m->alloc_offset, r->offset, r->nb_bytes);
    qemu_co_mutex_lock(&s->lock);

    if (ret < 0) {
        return ret;
    }

    /*
     * Before we update the L2 table to actually point to the new cluster, we
     * need to be sure that the refcounts have been increased and COW was
     * handled.
     */
    qcow2_cache_depends_on_flush(s->l2_table_cache);

    return 0;
}

/*
 * qcow2_alloc_cluster_link_l2
 *
 * Finish an allocating write: perform the head and tail COW for the
 * allocation described by m and link the newly allocated clusters into the
 * L2 table.  Replaced L2 entries are collected and freed afterwards, unless
 * m->keep_old_clusters is set.
 *
 * Returns 0 on success, -errno on failure.
 */
int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
{
    BDRVQcow2State *s = bs->opaque;
    int i, j = 0, l2_index, ret;
    uint64_t *old_cluster, *l2_table;
    uint64_t cluster_offset = m->alloc_offset;

    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
    assert(m->nb_clusters > 0);

    old_cluster = g_try_new(uint64_t, m->nb_clusters);
    if (old_cluster == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    /* copy content of unmodified sectors */
    ret = perform_cow(bs, m, &m->cow_start);
    if (ret < 0) {
        goto err;
    }

    ret = perform_cow(bs, m, &m->cow_end);
    if (ret < 0) {
        goto err;
    }

    /* Update L2 table.
*/
    /* With lazy refcounts the image must be marked dirty before the L2
     * update is written out */
    if (s->use_lazy_refcounts) {
        qcow2_mark_dirty(bs);
    }
    if (qcow2_need_accurate_refcounts(s)) {
        qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                   s->refcount_block_cache);
    }

    ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
    if (ret < 0) {
        goto err;
    }
    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);

    assert(l2_index + m->nb_clusters <= s->l2_size);
    for (i = 0; i < m->nb_clusters; i++) {
        /* if two concurrent writes happen to the same unallocated cluster
         * each write allocates separate cluster and writes data concurrently.
         * The first one to complete updates l2 table with pointer to its
         * cluster the second one has to do RMW (which is done above by
         * perform_cow()), update l2 table with its cluster pointer and free
         * old cluster. This is what this loop does */
        if (l2_table[l2_index + i] != 0) {
            old_cluster[j++] = l2_table[l2_index + i];
        }

        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
                    (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
    }


    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    /*
     * If this was a COW, we need to decrease the refcount of the old cluster.
        }
    }

out:
    assert(i <= nb_clusters);
    return i;
}

/*
 * Check if there already is an AIO write request in flight which allocates
 * the same cluster. In this case we need to wait until the previous
 * request has completed and updated the L2 table accordingly.
 *
 * Returns:
 *   0       if there was no dependency. *cur_bytes indicates the number of
 *           bytes from guest_offset that can be read before the next
 *           dependency must be processed (or the request is complete)
 *
 *   -EAGAIN if we had to wait for another request, previously gathered
 *           information on cluster allocation may be invalid now. The caller
 *           must start over anyway, so consider *cur_bytes undefined.
 */
static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *cur_bytes, QCowL2Meta **m)
{
    BDRVQcow2State *s = bs->opaque;
    QCowL2Meta *old_alloc;
    uint64_t bytes = *cur_bytes;

    QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {

        uint64_t start = guest_offset;
        uint64_t end = start + bytes;
        uint64_t old_start = l2meta_cow_start(old_alloc);
        uint64_t old_end = l2meta_cow_end(old_alloc);

        if (end <= old_start || start >= old_end) {
            /* No intersection */
        } else {
            if (start < old_start) {
                /* Stop at the start of a running allocation */
                bytes = old_start - start;
            } else {
                /* Our request begins inside the conflicting one: nothing can
                 * be gathered before it completes */
                bytes = 0;
            }

            /* Stop if already an l2meta exists. After yielding, it wouldn't
             * be valid any more, so we'd have to clean up the old L2Metas
             * and deal with requests depending on them before starting to
             * gather new ones. Not worth the trouble. */
            if (bytes == 0 && *m) {
                *cur_bytes = 0;
                return 0;
            }

            if (bytes == 0) {
                /* Wait for the dependency to complete. We need to recheck
                 * the free/allocated clusters when we continue.
                 */
                qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock);
                return -EAGAIN;
            }
        }
    }

    /* Make sure that existing clusters and new allocations are only used up to
     * the next dependency if we shortened the request above */
    *cur_bytes = bytes;

    return 0;
}

/*
 * Checks how many already allocated clusters that don't require a copy on
 * write there are at the given guest_offset (up to *bytes). If
 * *host_offset is not zero, only physically contiguous clusters beginning at
 * this host offset are counted.
 *
 * Note that guest_offset may not be cluster aligned. In this case, the
 * returned *host_offset points to exact byte referenced by guest_offset and
 * therefore isn't cluster aligned as well.
 *
 * Returns:
 *   0:     if no allocated clusters are available at the given offset.
 *          *bytes is normally unchanged. It is set to 0 if the cluster
 *          is allocated and doesn't need COW, but doesn't have the right
 *          physical offset.
 *
 *   1:     if allocated clusters that don't require a COW are available at
 *          the requested offset. *bytes may have decreased and describes
 *          the length of the area that can be written to.
 *
 *  -errno: in error cases
 */
static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
{
    /* NOTE(review): m is not read or written in this function; presumably the
     * parameter exists to keep the signature parallel to handle_alloc() —
     * confirm against callers. */
    BDRVQcow2State *s = bs->opaque;
    int l2_index;
    uint64_t cluster_offset;
    uint64_t *l2_table;
    uint64_t nb_clusters;
    unsigned int keep_clusters;
    int ret;

    trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
                              *bytes);

    /* A requested host offset must have the same within-cluster alignment as
     * the guest offset */
    assert(*host_offset == 0 || offset_into_cluster(s, guest_offset)
                                == offset_into_cluster(s, *host_offset));

    /*
     * Calculate the number of clusters to look for. We stop at L2 table
     * boundaries to keep things simple.
     */
    nb_clusters =
        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);

    l2_index = offset_to_l2_index(s, guest_offset);
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    /* Find L2 entry for the first involved cluster */
    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    cluster_offset = be64_to_cpu(l2_table[l2_index]);

    /* Check how many clusters are already allocated and don't need COW */
    if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
        && (cluster_offset & QCOW_OFLAG_COPIED))
    {
        /* If a specific host_offset is required, check it */
        bool offset_matches =
            (cluster_offset & L2E_OFFSET_MASK) == *host_offset;

        if (offset_into_cluster(s, cluster_offset & L2E_OFFSET_MASK)) {
            qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset "
                                    "%#llx unaligned (guest offset: %#" PRIx64
                                    ")", cluster_offset & L2E_OFFSET_MASK,
                                    guest_offset);
            ret = -EIO;
            goto out;
        }

        if (*host_offset != 0 && !offset_matches) {
            *bytes = 0;
            ret = 0;
            goto out;
        }

        /* We keep all QCOW_OFLAG_COPIED clusters */
        keep_clusters =
            count_contiguous_clusters(nb_clusters, s->cluster_size,
                                      &l2_table[l2_index],
                                      QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
        assert(keep_clusters <= nb_clusters);

        *bytes = MIN(*bytes,
                 keep_clusters * s->cluster_size
                 - offset_into_cluster(s, guest_offset));

        ret = 1;
    } else {
        ret = 0;
    }

    /* Cleanup */
out:
    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    /* Only return a host offset if we actually made progress. Otherwise we
     * would make requirements for handle_alloc() that it can't fulfill */
    if (ret > 0) {
        *host_offset = (cluster_offset & L2E_OFFSET_MASK)
                     + offset_into_cluster(s, guest_offset);
    }

    return ret;
}

/*
 * Allocates new clusters for the given guest_offset.
 *
 * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
 * contain the number of clusters that have been allocated and are contiguous
 * in the image file.
 *
 * If *host_offset is non-zero, it specifies the offset in the image file at
 * which the new clusters must start.
*nb_clusters can be 0 on return in this
 * case if the cluster at host_offset is already in use. If *host_offset is
 * zero, the clusters can be allocated anywhere in the image file.
 *
 * *host_offset is updated to contain the offset into the image file at which
 * the first allocated cluster starts.
 *
 * Return 0 on success and -errno in error cases. -EAGAIN means that the
 * function has been waiting for another request and the allocation must be
 * restarted, but the whole request should not be failed.
 */
static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
                                   uint64_t *host_offset, uint64_t *nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;

    trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
                                         *host_offset, *nb_clusters);

    /* Allocate new clusters */
    trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
    if (*host_offset == 0) {
        /* No position requested: let the allocator pick a free range */
        int64_t cluster_offset =
            qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size);
        if (cluster_offset < 0) {
            return cluster_offset;
        }
        *host_offset = cluster_offset;
        return 0;
    } else {
        /* Try to extend the allocation at the requested host offset; the
         * returned value is the number of clusters actually obtained there */
        int64_t ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
        if (ret < 0) {
            return ret;
        }
        *nb_clusters = ret;
        return 0;
    }
}

/*
 * Allocates new clusters for an area that either is yet unallocated or needs a
 * copy on write. If *host_offset is non-zero, clusters are only allocated if
 * the new allocation can match the specified host offset.
 *
 * Note that guest_offset may not be cluster aligned. In this case, the
 * returned *host_offset points to exact byte referenced by guest_offset and
 * therefore isn't cluster aligned as well.
 *
 * Returns:
 *   0:     if no clusters could be allocated. *bytes is set to 0,
 *          *host_offset is left unchanged.
 *
 *   1:     if new clusters were allocated. *bytes may be decreased if the
 *          new allocation doesn't cover all of the requested area.
 *          *host_offset is updated to contain the host offset of the first
 *          newly allocated cluster.
 *
 *  -errno: in error cases
 */
static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
{
    BDRVQcow2State *s = bs->opaque;
    int l2_index;
    uint64_t *l2_table;
    uint64_t entry;
    uint64_t nb_clusters;
    int ret;
    bool keep_old_clusters = false;

    uint64_t alloc_cluster_offset = 0;

    trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
                             *bytes);
    assert(*bytes > 0);

    /*
     * Calculate the number of clusters to look for. We stop at L2 table
     * boundaries to keep things simple.
     */
    nb_clusters =
        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);

    l2_index = offset_to_l2_index(s, guest_offset);
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    /* Find L2 entry for the first involved cluster */
    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    entry = be64_to_cpu(l2_table[l2_index]);

    /* For the moment, overwrite compressed clusters one by one */
    if (entry & QCOW_OFLAG_COMPRESSED) {
        nb_clusters = 1;
    } else {
        nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index);
    }

    /* This function is only called when there were no non-COW clusters, so if
     * we can't find any unallocated or COW clusters either, something is
     * wrong with our code. */
    assert(nb_clusters > 0);

    if (qcow2_get_cluster_type(entry) == QCOW2_CLUSTER_ZERO &&
        (entry & L2E_OFFSET_MASK) != 0 && (entry & QCOW_OFLAG_COPIED) &&
        (!*host_offset ||
         start_of_cluster(s, *host_offset) == (entry & L2E_OFFSET_MASK)))
    {
        /* Try to reuse preallocated zero clusters; contiguous normal clusters
         * would be fine, too, but count_cow_clusters() above has limited
         * nb_clusters already to a range of COW clusters */
        int preallocated_nb_clusters =
            count_contiguous_clusters(nb_clusters, s->cluster_size,
                                      &l2_table[l2_index], QCOW_OFLAG_COPIED);
        assert(preallocated_nb_clusters > 0);

        nb_clusters = preallocated_nb_clusters;
        alloc_cluster_offset = entry & L2E_OFFSET_MASK;

        /* We want to reuse these clusters, so qcow2_alloc_cluster_link_l2()
         * should not free them. */
        keep_old_clusters = true;
    }

    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    if (!alloc_cluster_offset) {
        /* Allocate, if necessary at a given offset in the image file */
        alloc_cluster_offset = start_of_cluster(s, *host_offset);
        ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
                                      &nb_clusters);
        if (ret < 0) {
            goto fail;
        }

        /* Can't extend contiguous allocation */
        if (nb_clusters == 0) {
            *bytes = 0;
            return 0;
        }

        /* !*host_offset would overwrite the image header and is reserved for
         * "no host offset preferred". If 0 was a valid host offset, it'd
         * trigger the following overlap check; do that now to avoid having an
         * invalid value in *host_offset. */
        if (!alloc_cluster_offset) {
            ret = qcow2_pre_write_overlap_check(bs, 0, alloc_cluster_offset,
                                                nb_clusters * s->cluster_size);
            assert(ret < 0);
            goto fail;
        }
    }

    /*
     * Save info needed for meta data update.
     *
     * requested_bytes: Number of bytes from the start of the first
     * newly allocated cluster to the end of the (possibly shortened
     * before) write request.
     *
     * avail_bytes: Number of bytes from the start of the first
     * newly allocated to the end of the last newly allocated cluster.
     *
     * nb_bytes: The number of bytes from the start of the first
     * newly allocated cluster to the end of the area that the write
     * request actually writes to (excluding COW at the end)
     */
    uint64_t requested_bytes = *bytes + offset_into_cluster(s, guest_offset);
    int avail_bytes = MIN(INT_MAX, nb_clusters << s->cluster_bits);
    int nb_bytes = MIN(requested_bytes, avail_bytes);
    QCowL2Meta *old_m = *m;

    /* Prepend a new L2Meta describing this allocation to the caller's list */
    *m = g_malloc0(sizeof(**m));

    **m = (QCowL2Meta) {
        .next           = old_m,

        .alloc_offset   = alloc_cluster_offset,
        .offset         = start_of_cluster(s, guest_offset),
        .nb_clusters    = nb_clusters,

        .keep_old_clusters  = keep_old_clusters,

        .cow_start = {
            .offset     = 0,
            .nb_bytes   = offset_into_cluster(s, guest_offset),
        },
        .cow_end = {
            .offset     = nb_bytes,
            .nb_bytes   = avail_bytes - nb_bytes,
        },
    };
    qemu_co_queue_init(&(*m)->dependent_requests);
    QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);

    *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
    *bytes = MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset));
    assert(*bytes != 0);

    return 1;

fail:
    /* Only unlink the L2Meta if this call inserted one (nb_clusters > 0) */
    if (*m && (*m)->nb_clusters > 0) {
        QLIST_REMOVE(*m, next_in_flight);
    }
    return ret;
}

/*
 * alloc_cluster_offset
 *
 * For a given offset on the virtual disk, find the cluster offset in qcow2
 * file. If the offset is not found, allocate a new cluster.
 *
 * If the cluster was already allocated, m->nb_clusters is set to 0 and
 * other fields in m are meaningless.
 *
 * If the cluster is newly allocated, m->nb_clusters is set to the number of
 * contiguous clusters that have been allocated. In this case, the other
 * fields of m are valid and contain information about the first allocated
 * cluster.
 *
 * If the request conflicts with another write request in flight, the coroutine
 * is queued and will be reentered when the dependency has completed.
 *
 * Return 0 on success and -errno in error cases
 */
int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
                               unsigned int *bytes, uint64_t *host_offset,
                               QCowL2Meta **m)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t start, remaining;
    uint64_t cluster_offset;
    uint64_t cur_bytes;
    int ret;

    trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *bytes);

again:
    start = offset;
    remaining = *bytes;
    cluster_offset = 0;
    *host_offset = 0;
    cur_bytes = 0;
    *m = NULL;

    while (true) {

        /* Publish the host offset of the first chunk once it is known;
         * cluster_offset is still 0 on the first iteration, so this only
         * takes effect after step 2 or 3 below has produced a mapping. */
        if (!*host_offset) {
            *host_offset = start_of_cluster(s, cluster_offset);
        }

        assert(remaining >= cur_bytes);

        /* Account for the chunk handled by the previous iteration */
        start += cur_bytes;
        remaining -= cur_bytes;
        cluster_offset += cur_bytes;

        if (remaining == 0) {
            break;
        }

        cur_bytes = remaining;

        /*
         * Now start gathering as many contiguous clusters as possible:
         *
         * 1. Check for overlaps with in-flight allocations
         *
         *    a) Overlap not in the first cluster -> shorten this request and
         *       let the caller handle the rest in its next loop iteration.
         *
         *    b) Real overlaps of two requests. Yield and restart the search
         *       for contiguous clusters (the situation could have changed
         *       while we were sleeping)
         *
         *    c) TODO: Request starts in the same cluster as the in-flight
         *       allocation ends. Shorten the COW of the in-flight allocation,
         *       set cluster_offset to write to the same cluster and set up
         *       the right synchronisation between the in-flight request and
         *       the new one.
         */
        ret = handle_dependencies(bs, start, &cur_bytes, m);
        if (ret == -EAGAIN) {
            /* Currently handle_dependencies() doesn't yield if we already had
             * an allocation. If it did, we would have to clean up the L2Meta
             * structs before starting over. */
            assert(*m == NULL);
            goto again;
        } else if (ret < 0) {
            return ret;
        } else if (cur_bytes == 0) {
            break;
        } else {
            /* handle_dependencies() may have decreased cur_bytes (shortened
             * the allocations below) so that the next dependency is processed
             * correctly during the next loop iteration. */
        }

        /*
         * 2. Count contiguous COPIED clusters.
         */
        ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            continue;
        } else if (cur_bytes == 0) {
            break;
        }

        /*
         * 3. If the request still hasn't completed, allocate new clusters,
         *    considering any cluster_offset of steps 1c or 2.
         */
        ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            continue;
        } else {
            assert(cur_bytes == 0);
            break;
        }
    }

    /* Report how much of the request could actually be mapped/allocated */
    *bytes -= remaining;
    assert(*bytes > 0);
    assert(*host_offset != 0);

    return 0;
}

/*
 * Decompress a qcow2 compressed cluster payload.
 *
 * @out_buf/@out_buf_size: destination; must match the uncompressed size
 *                         exactly, otherwise the call fails
 * @buf/@buf_size:         compressed input data
 *
 * Returns 0 on success, -1 on any zlib error or short output.
 */
static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
                             const uint8_t *buf, int buf_size)
{
    z_stream strm1, *strm = &strm1;
    int ret, out_len;

    memset(strm, 0, sizeof(*strm));

    strm->next_in = (uint8_t *)buf;
    strm->avail_in = buf_size;
    strm->next_out = out_buf;
    strm->avail_out = out_buf_size;

    /* Negative windowBits selects a raw deflate stream (no zlib header),
     * which is what the qcow2 format stores for compressed clusters. */
    ret = inflateInit2(strm, -12);
    if (ret != Z_OK)
        return -1;
    ret = inflate(strm, Z_FINISH);
    out_len = strm->next_out - out_buf;
    /* Z_BUF_ERROR is tolerated because the input may contain trailing
     * padding sectors; the output length check below is what matters. */
    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
        out_len != out_buf_size) {
        inflateEnd(strm);
        return -1;
    }
    inflateEnd(strm);
    return 0;
}

/*
 * Read and decompress the compressed cluster described by the L2 entry
 * @cluster_offset into s->cluster_cache. s->cluster_cache_offset remembers
 * the last decompressed cluster so a repeated request is a no-op.
 *
 * Returns 0 on success, -errno on read failure, -EIO on corrupt data.
 */
int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
{
    BDRVQcow2State *s = bs->opaque;
    int ret, csize, nb_csectors, sector_offset;
    uint64_t coffset;

    coffset = cluster_offset & s->cluster_offset_mask;
    if (s->cluster_cache_offset != coffset) {
        /* The compressed size (in 512-byte sectors) is encoded in the upper
         * bits of the L2 entry; the payload may start mid-sector. */
        nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
        sector_offset = coffset & 511;
        csize = nb_csectors * 512 - sector_offset;
        BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
        ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data,
                        nb_csectors);
        if (ret < 0) {
            return ret;
        }
        if (decompress_buffer(s->cluster_cache, s->cluster_size,
                              s->cluster_data + sector_offset, csize) < 0) {
            return -EIO;
        }
        s->cluster_cache_offset = coffset;
    }
    return 0;
}

/*
 * This discards as many clusters of nb_clusters as possible at once (i.e.
 * all clusters in the same L2 table) and returns the number of discarded
 * clusters. Returns a negative errno on failure.
 */
static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
                             uint64_t nb_clusters, enum qcow2_discard_type type,
                             bool full_discard)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table;
    int l2_index;
    int ret;
    int i;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    /* Limit nb_clusters to one L2 table */
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t old_l2_entry;

        old_l2_entry = be64_to_cpu(l2_table[l2_index + i]);

        /*
         * If full_discard is false, make sure that a discarded area reads back
         * as zeroes for v3 images (we cannot do it for v2 without actually
         * writing a zero-filled buffer). We can skip the operation if the
         * cluster is already marked as zero, or if it's unallocated and we
         * don't have a backing file.
         *
         * TODO We might want to use bdrv_get_block_status(bs) here, but we're
         * holding s->lock, so that doesn't work today.
         *
         * If full_discard is true, the sector should not read back as zeroes,
         * but rather fall through to the backing file.
         */
        switch (qcow2_get_cluster_type(old_l2_entry)) {
        case QCOW2_CLUSTER_UNALLOCATED:
            if (full_discard || !bs->backing) {
                continue;
            }
            break;

        case QCOW2_CLUSTER_ZERO:
            /* Preallocated zero clusters should be discarded in any case */
            if (!full_discard && (old_l2_entry & L2E_OFFSET_MASK) == 0) {
                continue;
            }
            break;

        case QCOW2_CLUSTER_NORMAL:
        case QCOW2_CLUSTER_COMPRESSED:
            break;

        default:
            abort();
        }

        /* First remove L2 entries */
        qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
        if (!full_discard && s->qcow_version >= 3) {
            /* v3: keep a zero marker so the area reads back as zeroes */
            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
        } else {
            l2_table[l2_index + i] = cpu_to_be64(0);
        }

        /* Then decrease the refcount */
        qcow2_free_any_clusters(bs, old_l2_entry, 1, type);
    }

    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    return nb_clusters;
}

/*
 * Discard the byte range [offset, offset + nb_sectors * 512). The start must
 * be cluster-aligned; a misaligned end is rounded down unless it coincides
 * with the end of the image. Returns 0 on success, -errno on failure.
 */
int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
    int nb_sectors, enum qcow2_discard_type type, bool full_discard)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t end_offset;
    uint64_t nb_clusters;
    int ret;

    end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS);

    /* The caller must cluster-align start; round end down except at EOF */
    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    if (end_offset != bs->total_sectors * BDRV_SECTOR_SIZE) {
        end_offset = start_of_cluster(s, end_offset);
    }

    nb_clusters = size_to_clusters(s, end_offset - offset);

    /* Batch the refcount updates; qcow2_process_discards() flushes them */
    s->cache_discards = true;

    /* Each L2 table is handled by its own loop iteration */
    while (nb_clusters > 0) {
        ret = discard_single_l2(bs, offset, nb_clusters, type, full_discard);
        if (ret < 0) {
            goto fail;
        }

        nb_clusters -= ret;
        offset += (ret * s->cluster_size);
    }

    ret = 0;
fail:
    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    return ret;
}

/*
 * This zeroes as many clusters of nb_clusters as possible at once (i.e.
 * all clusters in the same L2 table) and returns the number of zeroed
 * clusters. Returns a negative errno on failure.
 */
static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
                          uint64_t nb_clusters, int flags)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table;
    int l2_index;
    int ret;
    int i;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    /* Limit nb_clusters to one L2 table */
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t old_offset;

        old_offset = be64_to_cpu(l2_table[l2_index + i]);

        /* Update L2 entries */
        qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
        if (old_offset & QCOW_OFLAG_COMPRESSED || flags & BDRV_REQ_MAY_UNMAP) {
            /* Compressed clusters cannot be kept, and with MAY_UNMAP the
             * caller allows us to drop the allocation entirely. */
            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
            qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
        } else {
            /* Keep the allocation, just flag the cluster as reading zero */
            l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO);
        }
    }

    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    return nb_clusters;
}

/*
 * Mark the cluster-aligned range [offset, offset + nb_sectors * 512) as
 * reading back zeroes by setting the zero flag in the L2 entries.
 * Only supported for qcow2 version 3 images (returns -ENOTSUP otherwise).
 * Returns 0 on success, -errno on failure.
 */
int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors,
                        int flags)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t nb_clusters;
    int ret;

    /* The zero flag is only supported by version 3 and newer */
    if (s->qcow_version < 3) {
        return -ENOTSUP;
    }

    /* Each L2 table is handled by its own loop iteration */
    nb_clusters = size_to_clusters(s, nb_sectors << BDRV_SECTOR_BITS);

    /* Batch the refcount updates; qcow2_process_discards() flushes them */
    s->cache_discards = true;

    while (nb_clusters > 0) {
        ret = zero_single_l2(bs, offset, nb_clusters, flags);
        if (ret < 0) {
            goto fail;
        }

        nb_clusters -= ret;
        offset += (ret * s->cluster_size);
    }

    ret = 0;
fail:
    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    return ret;
}

/*
 * Expands all zero clusters in a specific L1 table (or deallocates them, for
 * non-backed non-pre-allocated zero clusters).
 *
 * l1_entries and *visited_l1_entries are used to keep track of progress for
 * status_cb().
 * l1_entries contains the total number of L1 entries and
 * *visited_l1_entries counts all visited L1 entries.
 *
 * Returns 0 on success, -errno on failure.
 */
static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                                      int l1_size, int64_t *visited_l1_entries,
                                      int64_t l1_entries,
                                      BlockDriverAmendStatusCB *status_cb,
                                      void *cb_opaque)
{
    BDRVQcow2State *s = bs->opaque;
    /* The active L1 table's L2 tables go through the cache; inactive
     * (snapshot) L2 tables are read/written directly on disk. */
    bool is_active_l1 = (l1_table == s->l1_table);
    uint64_t *l2_table = NULL;
    int ret;
    int i, j;

    if (!is_active_l1) {
        /* inactive L2 tables require a buffer to be stored in when loading
         * them from disk */
        l2_table = qemu_try_blockalign(bs->file->bs, s->cluster_size);
        if (l2_table == NULL) {
            return -ENOMEM;
        }
    }

    for (i = 0; i < l1_size; i++) {
        uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
        bool l2_dirty = false;
        uint64_t l2_refcount;

        if (!l2_offset) {
            /* unallocated */
            (*visited_l1_entries)++;
            if (status_cb) {
                status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
            }
            continue;
        }

        if (offset_into_cluster(s, l2_offset)) {
            qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                    PRIx64 " unaligned (L1 index: %#x)",
                                    l2_offset, i);
            ret = -EIO;
            goto fail;
        }

        if (is_active_l1) {
            /* get active L2 tables from cache */
            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
                                  (void **)&l2_table);
        } else {
            /* load inactive L2 tables from disk */
            ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE,
                            (void *)l2_table, s->cluster_sectors);
        }
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                 &l2_refcount);
        if (ret < 0) {
            goto fail;
        }

        for (j = 0; j < s->l2_size; j++) {
            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
            int64_t offset = l2_entry & L2E_OFFSET_MASK;
            int cluster_type = qcow2_get_cluster_type(l2_entry);
            /* A zero cluster with a non-zero offset is preallocated */
            bool preallocated = offset != 0;

            if (cluster_type != QCOW2_CLUSTER_ZERO) {
                continue;
            }

            if (!preallocated) {
                if (!bs->backing) {
                    /* not backed; therefore we can simply deallocate the
                     * cluster */
                    l2_table[j] = 0;
                    l2_dirty = true;
                    continue;
                }

                offset = qcow2_alloc_clusters(bs, s->cluster_size);
                if (offset < 0) {
                    ret = offset;
                    goto fail;
                }

                if (l2_refcount > 1) {
                    /* For shared L2 tables, set the refcount accordingly (it
                     * is already 1 and needs to be l2_refcount) */
                    ret = qcow2_update_cluster_refcount(bs,
                            offset >> s->cluster_bits,
                            refcount_diff(1, l2_refcount), false,
                            QCOW2_DISCARD_OTHER);
                    if (ret < 0) {
                        qcow2_free_clusters(bs, offset, s->cluster_size,
                                            QCOW2_DISCARD_OTHER);
                        goto fail;
                    }
                }
            }

            if (offset_into_cluster(s, offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset "
                                        "%#" PRIx64 " unaligned (L2 offset: %#"
                                        PRIx64 ", L2 index: %#x)", offset,
                                        l2_offset, j);
                /* Only free what we allocated ourselves in this iteration */
                if (!preallocated) {
                    qcow2_free_clusters(bs, offset, s->cluster_size,
                                        QCOW2_DISCARD_ALWAYS);
                }
                ret = -EIO;
                goto fail;
            }

            ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
            if (ret < 0) {
                if (!preallocated) {
                    qcow2_free_clusters(bs, offset, s->cluster_size,
                                        QCOW2_DISCARD_ALWAYS);
                }
                goto fail;
            }

            /* Materialize the zero cluster as real zero-filled data */
            ret = bdrv_pwrite_zeroes(bs->file, offset, s->cluster_size, 0);
            if (ret < 0) {
                if (!preallocated) {
                    qcow2_free_clusters(bs, offset, s->cluster_size,
                                        QCOW2_DISCARD_ALWAYS);
                }
                goto fail;
            }

            if (l2_refcount == 1) {
                l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
            } else {
                /* Shared table: the cluster is not exclusively owned, so the
                 * COPIED flag must stay clear */
                l2_table[j] = cpu_to_be64(offset);
            }
            l2_dirty = true;
        }

        if (is_active_l1) {
            if (l2_dirty) {
                qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
                qcow2_cache_depends_on_flush(s->l2_table_cache);
            }
            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
        } else {
            if (l2_dirty) {
                ret = qcow2_pre_write_overlap_check(bs,
                        QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2, l2_offset,
                        s->cluster_size);
                if (ret < 0) {
                    goto fail;
                }

                ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE,
                                 (void *)l2_table, s->cluster_sectors);
                if (ret < 0) {
                    goto fail;
                }
            }
        }

        (*visited_l1_entries)++;
        if (status_cb) {
            status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
        }
    }

    ret = 0;

fail:
    if (l2_table) {
        if (!is_active_l1) {
            qemu_vfree(l2_table);
        } else {
            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
        }
    }
    return ret;
}

/*
 * For backed images, expands all zero clusters on the image. For non-backed
 * images, deallocates all non-pre-allocated zero clusters (and claims the
 * allocation for pre-allocated ones). This is important for downgrading to a
 * qcow2 version which doesn't yet support metadata zero clusters.
 *
 * Processes the active L1 table first, then every snapshot's L1 table.
 * Returns 0 on success, -errno on failure.
 */
int qcow2_expand_zero_clusters(BlockDriverState *bs,
                               BlockDriverAmendStatusCB *status_cb,
                               void *cb_opaque)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table = NULL;
    int64_t l1_entries = 0, visited_l1_entries = 0;
    int ret;
    int i, j;

    /* Pre-count all L1 entries so status_cb() can report total progress */
    if (status_cb) {
        l1_entries = s->l1_size;
        for (i = 0; i < s->nb_snapshots; i++) {
            l1_entries += s->snapshots[i].l1_size;
        }
    }

    ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
                                     &visited_l1_entries, l1_entries,
                                     status_cb, cb_opaque);
    if (ret < 0) {
        goto fail;
    }

    /* Inactive L1 tables may point to active L2 tables - therefore it is
     * necessary to flush the L2 table cache before trying to access the L2
     * tables pointed to by inactive L1 entries (else we might try to expand
     * zero clusters that have already been expanded); furthermore, it is also
     * necessary to empty the L2 table cache, since it may contain tables which
     * are now going to be modified directly on disk, bypassing the cache.
     * qcow2_cache_empty() does both for us. */
    ret = qcow2_cache_empty(bs, s->l2_table_cache);
    if (ret < 0) {
        goto fail;
    }

    for (i = 0; i < s->nb_snapshots; i++) {
        int l1_sectors = DIV_ROUND_UP(s->snapshots[i].l1_size *
                                      sizeof(uint64_t), BDRV_SECTOR_SIZE);

        l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE);

        ret = bdrv_read(bs->file,
                        s->snapshots[i].l1_table_offset / BDRV_SECTOR_SIZE,
                        (void *)l1_table, l1_sectors);
        if (ret < 0) {
            goto fail;
        }

        /* On-disk L1 entries are big-endian; convert in place */
        for (j = 0; j < s->snapshots[i].l1_size; j++) {
            be64_to_cpus(&l1_table[j]);
        }

        ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
                                         &visited_l1_entries, l1_entries,
                                         status_cb, cb_opaque);
        if (ret < 0) {
            goto fail;
        }
    }

    ret = 0;

fail:
    g_free(l1_table);
    return ret;
}