145aba42fSKevin Wolf /* 245aba42fSKevin Wolf * Block driver for the QCOW version 2 format 345aba42fSKevin Wolf * 445aba42fSKevin Wolf * Copyright (c) 2004-2006 Fabrice Bellard 545aba42fSKevin Wolf * 645aba42fSKevin Wolf * Permission is hereby granted, free of charge, to any person obtaining a copy 745aba42fSKevin Wolf * of this software and associated documentation files (the "Software"), to deal 845aba42fSKevin Wolf * in the Software without restriction, including without limitation the rights 945aba42fSKevin Wolf * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 1045aba42fSKevin Wolf * copies of the Software, and to permit persons to whom the Software is 1145aba42fSKevin Wolf * furnished to do so, subject to the following conditions: 1245aba42fSKevin Wolf * 1345aba42fSKevin Wolf * The above copyright notice and this permission notice shall be included in 1445aba42fSKevin Wolf * all copies or substantial portions of the Software. 1545aba42fSKevin Wolf * 1645aba42fSKevin Wolf * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1745aba42fSKevin Wolf * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1845aba42fSKevin Wolf * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 1945aba42fSKevin Wolf * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 2045aba42fSKevin Wolf * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 2145aba42fSKevin Wolf * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 2245aba42fSKevin Wolf * THE SOFTWARE. 
2345aba42fSKevin Wolf */ 2445aba42fSKevin Wolf 2580c71a24SPeter Maydell #include "qemu/osdep.h" 2645aba42fSKevin Wolf #include <zlib.h> 2745aba42fSKevin Wolf 28da34e65cSMarkus Armbruster #include "qapi/error.h" 2945aba42fSKevin Wolf #include "qemu-common.h" 30737e150eSPaolo Bonzini #include "block/block_int.h" 3145aba42fSKevin Wolf #include "block/qcow2.h" 3258369e22SPaolo Bonzini #include "qemu/bswap.h" 333cce16f4SKevin Wolf #include "trace.h" 3445aba42fSKevin Wolf 352cf7cfa1SKevin Wolf int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, 362cf7cfa1SKevin Wolf bool exact_size) 3745aba42fSKevin Wolf { 38ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 392cf7cfa1SKevin Wolf int new_l1_size2, ret, i; 4045aba42fSKevin Wolf uint64_t *new_l1_table; 41fda74f82SMax Reitz int64_t old_l1_table_offset, old_l1_size; 422cf7cfa1SKevin Wolf int64_t new_l1_table_offset, new_l1_size; 4345aba42fSKevin Wolf uint8_t data[12]; 4445aba42fSKevin Wolf 4572893756SStefan Hajnoczi if (min_size <= s->l1_size) 4645aba42fSKevin Wolf return 0; 4772893756SStefan Hajnoczi 48b93f9950SMax Reitz /* Do a sanity check on min_size before trying to calculate new_l1_size 49b93f9950SMax Reitz * (this prevents overflows during the while loop for the calculation of 50b93f9950SMax Reitz * new_l1_size) */ 51b93f9950SMax Reitz if (min_size > INT_MAX / sizeof(uint64_t)) { 52b93f9950SMax Reitz return -EFBIG; 53b93f9950SMax Reitz } 54b93f9950SMax Reitz 5572893756SStefan Hajnoczi if (exact_size) { 5672893756SStefan Hajnoczi new_l1_size = min_size; 5772893756SStefan Hajnoczi } else { 5872893756SStefan Hajnoczi /* Bump size up to reduce the number of times we have to grow */ 5972893756SStefan Hajnoczi new_l1_size = s->l1_size; 60d191d12dSStefan Weil if (new_l1_size == 0) { 61d191d12dSStefan Weil new_l1_size = 1; 62d191d12dSStefan Weil } 6345aba42fSKevin Wolf while (min_size > new_l1_size) { 6445aba42fSKevin Wolf new_l1_size = (new_l1_size * 3 + 1) / 2; 6545aba42fSKevin Wolf } 6672893756SStefan 
Hajnoczi } 6772893756SStefan Hajnoczi 6884c26520SMax Reitz QEMU_BUILD_BUG_ON(QCOW_MAX_L1_SIZE > INT_MAX); 6984c26520SMax Reitz if (new_l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) { 702cf7cfa1SKevin Wolf return -EFBIG; 712cf7cfa1SKevin Wolf } 722cf7cfa1SKevin Wolf 7345aba42fSKevin Wolf #ifdef DEBUG_ALLOC2 742cf7cfa1SKevin Wolf fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n", 752cf7cfa1SKevin Wolf s->l1_size, new_l1_size); 7645aba42fSKevin Wolf #endif 7745aba42fSKevin Wolf 7845aba42fSKevin Wolf new_l1_size2 = sizeof(uint64_t) * new_l1_size; 799a4f4c31SKevin Wolf new_l1_table = qemu_try_blockalign(bs->file->bs, 80de82815dSKevin Wolf align_offset(new_l1_size2, 512)); 81de82815dSKevin Wolf if (new_l1_table == NULL) { 82de82815dSKevin Wolf return -ENOMEM; 83de82815dSKevin Wolf } 84de82815dSKevin Wolf memset(new_l1_table, 0, align_offset(new_l1_size2, 512)); 85de82815dSKevin Wolf 860647d47cSStefan Hajnoczi if (s->l1_size) { 8745aba42fSKevin Wolf memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t)); 880647d47cSStefan Hajnoczi } 8945aba42fSKevin Wolf 9045aba42fSKevin Wolf /* write new table (align to cluster) */ 9166f82ceeSKevin Wolf BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE); 92ed6ccf0fSKevin Wolf new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2); 935d757b56SKevin Wolf if (new_l1_table_offset < 0) { 94de82815dSKevin Wolf qemu_vfree(new_l1_table); 955d757b56SKevin Wolf return new_l1_table_offset; 965d757b56SKevin Wolf } 9729c1a730SKevin Wolf 9829c1a730SKevin Wolf ret = qcow2_cache_flush(bs, s->refcount_block_cache); 9929c1a730SKevin Wolf if (ret < 0) { 10080fa3341SKevin Wolf goto fail; 10129c1a730SKevin Wolf } 10245aba42fSKevin Wolf 103cf93980eSMax Reitz /* the L1 position has not yet been updated, so these clusters must 104cf93980eSMax Reitz * indeed be completely free */ 105231bb267SMax Reitz ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset, 106231bb267SMax Reitz new_l1_size2); 107cf93980eSMax Reitz if (ret < 0) { 
108cf93980eSMax Reitz goto fail; 109cf93980eSMax Reitz } 110cf93980eSMax Reitz 11166f82ceeSKevin Wolf BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE); 11245aba42fSKevin Wolf for(i = 0; i < s->l1_size; i++) 11345aba42fSKevin Wolf new_l1_table[i] = cpu_to_be64(new_l1_table[i]); 114d9ca2ea2SKevin Wolf ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset, 1159a4f4c31SKevin Wolf new_l1_table, new_l1_size2); 1168b3b7206SKevin Wolf if (ret < 0) 11745aba42fSKevin Wolf goto fail; 11845aba42fSKevin Wolf for(i = 0; i < s->l1_size; i++) 11945aba42fSKevin Wolf new_l1_table[i] = be64_to_cpu(new_l1_table[i]); 12045aba42fSKevin Wolf 12145aba42fSKevin Wolf /* set new table */ 12266f82ceeSKevin Wolf BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE); 123f1f7a1ddSPeter Maydell stl_be_p(data, new_l1_size); 124e4ef9f46SPeter Maydell stq_be_p(data + 4, new_l1_table_offset); 125d9ca2ea2SKevin Wolf ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), 1269a4f4c31SKevin Wolf data, sizeof(data)); 1278b3b7206SKevin Wolf if (ret < 0) { 12845aba42fSKevin Wolf goto fail; 129fb8fa77cSKevin Wolf } 130de82815dSKevin Wolf qemu_vfree(s->l1_table); 131fda74f82SMax Reitz old_l1_table_offset = s->l1_table_offset; 13245aba42fSKevin Wolf s->l1_table_offset = new_l1_table_offset; 13345aba42fSKevin Wolf s->l1_table = new_l1_table; 134fda74f82SMax Reitz old_l1_size = s->l1_size; 13545aba42fSKevin Wolf s->l1_size = new_l1_size; 136fda74f82SMax Reitz qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * sizeof(uint64_t), 137fda74f82SMax Reitz QCOW2_DISCARD_OTHER); 13845aba42fSKevin Wolf return 0; 13945aba42fSKevin Wolf fail: 140de82815dSKevin Wolf qemu_vfree(new_l1_table); 1416cfcb9b8SKevin Wolf qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2, 1426cfcb9b8SKevin Wolf QCOW2_DISCARD_OTHER); 1438b3b7206SKevin Wolf return ret; 14445aba42fSKevin Wolf } 14545aba42fSKevin Wolf 14645aba42fSKevin Wolf /* 14745aba42fSKevin Wolf * l2_load 14845aba42fSKevin Wolf * 14945aba42fSKevin Wolf * 
Loads a L2 table into memory. If the table is in the cache, the cache 15045aba42fSKevin Wolf * is used; otherwise the L2 table is loaded from the image file. 15145aba42fSKevin Wolf * 15245aba42fSKevin Wolf * Returns a pointer to the L2 table on success, or NULL if the read from 15345aba42fSKevin Wolf * the image file failed. 15445aba42fSKevin Wolf */ 15545aba42fSKevin Wolf 15655c17e98SKevin Wolf static int l2_load(BlockDriverState *bs, uint64_t l2_offset, 15755c17e98SKevin Wolf uint64_t **l2_table) 15845aba42fSKevin Wolf { 159ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 16045aba42fSKevin Wolf 1619be38598SEduardo Habkost return qcow2_cache_get(bs, s->l2_table_cache, l2_offset, 1629be38598SEduardo Habkost (void **)l2_table); 16355c17e98SKevin Wolf } 16455c17e98SKevin Wolf 16545aba42fSKevin Wolf /* 1666583e3c7SKevin Wolf * Writes one sector of the L1 table to the disk (can't update single entries 1676583e3c7SKevin Wolf * and we really don't want bdrv_pread to perform a read-modify-write) 1686583e3c7SKevin Wolf */ 1696583e3c7SKevin Wolf #define L1_ENTRIES_PER_SECTOR (512 / 8) 170e23e400eSMax Reitz int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index) 1716583e3c7SKevin Wolf { 172ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 173a1391444SMax Reitz uint64_t buf[L1_ENTRIES_PER_SECTOR] = { 0 }; 1746583e3c7SKevin Wolf int l1_start_index; 175f7defcb6SKevin Wolf int i, ret; 1766583e3c7SKevin Wolf 1776583e3c7SKevin Wolf l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1); 178a1391444SMax Reitz for (i = 0; i < L1_ENTRIES_PER_SECTOR && l1_start_index + i < s->l1_size; 179a1391444SMax Reitz i++) 180a1391444SMax Reitz { 1816583e3c7SKevin Wolf buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]); 1826583e3c7SKevin Wolf } 1836583e3c7SKevin Wolf 184231bb267SMax Reitz ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1, 185cf93980eSMax Reitz s->l1_table_offset + 8 * l1_start_index, sizeof(buf)); 186cf93980eSMax Reitz if (ret < 0) { 187cf93980eSMax Reitz 
return ret; 188cf93980eSMax Reitz } 189cf93980eSMax Reitz 19066f82ceeSKevin Wolf BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); 191d9ca2ea2SKevin Wolf ret = bdrv_pwrite_sync(bs->file, 1929a4f4c31SKevin Wolf s->l1_table_offset + 8 * l1_start_index, 193f7defcb6SKevin Wolf buf, sizeof(buf)); 194f7defcb6SKevin Wolf if (ret < 0) { 195f7defcb6SKevin Wolf return ret; 1966583e3c7SKevin Wolf } 1976583e3c7SKevin Wolf 1986583e3c7SKevin Wolf return 0; 1996583e3c7SKevin Wolf } 2006583e3c7SKevin Wolf 2016583e3c7SKevin Wolf /* 20245aba42fSKevin Wolf * l2_allocate 20345aba42fSKevin Wolf * 20445aba42fSKevin Wolf * Allocate a new l2 entry in the file. If l1_index points to an already 20545aba42fSKevin Wolf * used entry in the L2 table (i.e. we are doing a copy on write for the L2 20645aba42fSKevin Wolf * table) copy the contents of the old L2 table into the newly allocated one. 20745aba42fSKevin Wolf * Otherwise the new table is initialized with zeros. 20845aba42fSKevin Wolf * 20945aba42fSKevin Wolf */ 21045aba42fSKevin Wolf 211c46e1167SKevin Wolf static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) 21245aba42fSKevin Wolf { 213ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 2146583e3c7SKevin Wolf uint64_t old_l2_offset; 2158585afd8SMax Reitz uint64_t *l2_table = NULL; 216f4f0d391SKevin Wolf int64_t l2_offset; 217c46e1167SKevin Wolf int ret; 21845aba42fSKevin Wolf 21945aba42fSKevin Wolf old_l2_offset = s->l1_table[l1_index]; 22045aba42fSKevin Wolf 2213cce16f4SKevin Wolf trace_qcow2_l2_allocate(bs, l1_index); 2223cce16f4SKevin Wolf 22345aba42fSKevin Wolf /* allocate a new l2 entry */ 22445aba42fSKevin Wolf 225ed6ccf0fSKevin Wolf l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t)); 2265d757b56SKevin Wolf if (l2_offset < 0) { 227be0b742eSMax Reitz ret = l2_offset; 228be0b742eSMax Reitz goto fail; 2295d757b56SKevin Wolf } 23029c1a730SKevin Wolf 23129c1a730SKevin Wolf ret = qcow2_cache_flush(bs, s->refcount_block_cache); 23229c1a730SKevin Wolf if 
(ret < 0) { 23329c1a730SKevin Wolf goto fail; 23429c1a730SKevin Wolf } 23545aba42fSKevin Wolf 23645aba42fSKevin Wolf /* allocate a new entry in the l2 cache */ 23745aba42fSKevin Wolf 2383cce16f4SKevin Wolf trace_qcow2_l2_allocate_get_empty(bs, l1_index); 23929c1a730SKevin Wolf ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table); 24029c1a730SKevin Wolf if (ret < 0) { 241be0b742eSMax Reitz goto fail; 24229c1a730SKevin Wolf } 24329c1a730SKevin Wolf 24429c1a730SKevin Wolf l2_table = *table; 24545aba42fSKevin Wolf 2468e37f681SKevin Wolf if ((old_l2_offset & L1E_OFFSET_MASK) == 0) { 24745aba42fSKevin Wolf /* if there was no old l2 table, clear the new table */ 24845aba42fSKevin Wolf memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); 24945aba42fSKevin Wolf } else { 25029c1a730SKevin Wolf uint64_t* old_table; 25129c1a730SKevin Wolf 25245aba42fSKevin Wolf /* if there was an old l2 table, read it from the disk */ 25366f82ceeSKevin Wolf BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ); 2548e37f681SKevin Wolf ret = qcow2_cache_get(bs, s->l2_table_cache, 2558e37f681SKevin Wolf old_l2_offset & L1E_OFFSET_MASK, 25629c1a730SKevin Wolf (void**) &old_table); 25729c1a730SKevin Wolf if (ret < 0) { 25829c1a730SKevin Wolf goto fail; 25929c1a730SKevin Wolf } 26029c1a730SKevin Wolf 26129c1a730SKevin Wolf memcpy(l2_table, old_table, s->cluster_size); 26229c1a730SKevin Wolf 263a3f1afb4SAlberto Garcia qcow2_cache_put(bs, s->l2_table_cache, (void **) &old_table); 26445aba42fSKevin Wolf } 26529c1a730SKevin Wolf 26645aba42fSKevin Wolf /* write the l2 table to the file */ 26766f82ceeSKevin Wolf BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE); 26829c1a730SKevin Wolf 2693cce16f4SKevin Wolf trace_qcow2_l2_allocate_write_l2(bs, l1_index); 27072e80b89SAlberto Garcia qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); 27129c1a730SKevin Wolf ret = qcow2_cache_flush(bs, s->l2_table_cache); 272c46e1167SKevin Wolf if (ret < 0) { 273175e1152SKevin Wolf goto fail; 
274175e1152SKevin Wolf } 275175e1152SKevin Wolf 276175e1152SKevin Wolf /* update the L1 entry */ 2773cce16f4SKevin Wolf trace_qcow2_l2_allocate_write_l1(bs, l1_index); 278175e1152SKevin Wolf s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED; 279e23e400eSMax Reitz ret = qcow2_write_l1_entry(bs, l1_index); 280175e1152SKevin Wolf if (ret < 0) { 281175e1152SKevin Wolf goto fail; 282c46e1167SKevin Wolf } 28345aba42fSKevin Wolf 284c46e1167SKevin Wolf *table = l2_table; 2853cce16f4SKevin Wolf trace_qcow2_l2_allocate_done(bs, l1_index, 0); 286c46e1167SKevin Wolf return 0; 287175e1152SKevin Wolf 288175e1152SKevin Wolf fail: 2893cce16f4SKevin Wolf trace_qcow2_l2_allocate_done(bs, l1_index, ret); 2908585afd8SMax Reitz if (l2_table != NULL) { 29129c1a730SKevin Wolf qcow2_cache_put(bs, s->l2_table_cache, (void**) table); 2928585afd8SMax Reitz } 29368dba0bfSKevin Wolf s->l1_table[l1_index] = old_l2_offset; 294e3b21ef9SMax Reitz if (l2_offset > 0) { 295e3b21ef9SMax Reitz qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t), 296e3b21ef9SMax Reitz QCOW2_DISCARD_ALWAYS); 297e3b21ef9SMax Reitz } 298175e1152SKevin Wolf return ret; 29945aba42fSKevin Wolf } 30045aba42fSKevin Wolf 3012bfcc4a0SKevin Wolf /* 3022bfcc4a0SKevin Wolf * Checks how many clusters in a given L2 table are contiguous in the image 3032bfcc4a0SKevin Wolf * file. As soon as one of the flags in the bitmask stop_flags changes compared 3042bfcc4a0SKevin Wolf * to the first cluster, the search is stopped and the cluster is not counted 3052bfcc4a0SKevin Wolf * as contiguous. 
(This allows it, for example, to stop at the first compressed 3062bfcc4a0SKevin Wolf * cluster which may require a different handling) 3072bfcc4a0SKevin Wolf */ 308b6d36defSMax Reitz static int count_contiguous_clusters(int nb_clusters, int cluster_size, 30961653008SKevin Wolf uint64_t *l2_table, uint64_t stop_flags) 31045aba42fSKevin Wolf { 31145aba42fSKevin Wolf int i; 3123ef95218SEric Blake QCow2ClusterType first_cluster_type; 31378a52ad5SPeter Lieven uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED; 31415684a47SMax Reitz uint64_t first_entry = be64_to_cpu(l2_table[0]); 31515684a47SMax Reitz uint64_t offset = first_entry & mask; 31645aba42fSKevin Wolf 317564a6b69SMax Reitz if (!offset) { 31845aba42fSKevin Wolf return 0; 319564a6b69SMax Reitz } 32045aba42fSKevin Wolf 321564a6b69SMax Reitz /* must be allocated */ 322564a6b69SMax Reitz first_cluster_type = qcow2_get_cluster_type(first_entry); 323564a6b69SMax Reitz assert(first_cluster_type == QCOW2_CLUSTER_NORMAL || 324fdfab37dSEric Blake first_cluster_type == QCOW2_CLUSTER_ZERO_ALLOC); 32515684a47SMax Reitz 32661653008SKevin Wolf for (i = 0; i < nb_clusters; i++) { 3272bfcc4a0SKevin Wolf uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask; 3282bfcc4a0SKevin Wolf if (offset + (uint64_t) i * cluster_size != l2_entry) { 32945aba42fSKevin Wolf break; 3302bfcc4a0SKevin Wolf } 3312bfcc4a0SKevin Wolf } 33245aba42fSKevin Wolf 33361653008SKevin Wolf return i; 33445aba42fSKevin Wolf } 33545aba42fSKevin Wolf 3364341df8aSEric Blake /* 3374341df8aSEric Blake * Checks how many consecutive unallocated clusters in a given L2 3384341df8aSEric Blake * table have the same cluster type. 
3394341df8aSEric Blake */ 3404341df8aSEric Blake static int count_contiguous_clusters_unallocated(int nb_clusters, 341a99dfb45SKevin Wolf uint64_t *l2_table, 3423ef95218SEric Blake QCow2ClusterType wanted_type) 34345aba42fSKevin Wolf { 3442bfcc4a0SKevin Wolf int i; 34545aba42fSKevin Wolf 346fdfab37dSEric Blake assert(wanted_type == QCOW2_CLUSTER_ZERO_PLAIN || 3474341df8aSEric Blake wanted_type == QCOW2_CLUSTER_UNALLOCATED); 3482bfcc4a0SKevin Wolf for (i = 0; i < nb_clusters; i++) { 3494341df8aSEric Blake uint64_t entry = be64_to_cpu(l2_table[i]); 3503ef95218SEric Blake QCow2ClusterType type = qcow2_get_cluster_type(entry); 3512bfcc4a0SKevin Wolf 352fdfab37dSEric Blake if (type != wanted_type) { 3532bfcc4a0SKevin Wolf break; 3542bfcc4a0SKevin Wolf } 3552bfcc4a0SKevin Wolf } 35645aba42fSKevin Wolf 35745aba42fSKevin Wolf return i; 35845aba42fSKevin Wolf } 35945aba42fSKevin Wolf 36045aba42fSKevin Wolf /* The crypt function is compatible with the linux cryptoloop 36145aba42fSKevin Wolf algorithm for < 4 GB images. NOTE: out_buf == in_buf is 36245aba42fSKevin Wolf supported */ 363ff99129aSKevin Wolf int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num, 36445aba42fSKevin Wolf uint8_t *out_buf, const uint8_t *in_buf, 365f6fa64f6SDaniel P. Berrange int nb_sectors, bool enc, 366f6fa64f6SDaniel P. Berrange Error **errp) 36745aba42fSKevin Wolf { 36845aba42fSKevin Wolf union { 36945aba42fSKevin Wolf uint64_t ll[2]; 37045aba42fSKevin Wolf uint8_t b[16]; 37145aba42fSKevin Wolf } ivec; 37245aba42fSKevin Wolf int i; 373f6fa64f6SDaniel P. Berrange int ret; 37445aba42fSKevin Wolf 37545aba42fSKevin Wolf for(i = 0; i < nb_sectors; i++) { 37645aba42fSKevin Wolf ivec.ll[0] = cpu_to_le64(sector_num); 37745aba42fSKevin Wolf ivec.ll[1] = 0; 378f6fa64f6SDaniel P. Berrange if (qcrypto_cipher_setiv(s->cipher, 379f6fa64f6SDaniel P. Berrange ivec.b, G_N_ELEMENTS(ivec.b), 380f6fa64f6SDaniel P. Berrange errp) < 0) { 381f6fa64f6SDaniel P. Berrange return -1; 382f6fa64f6SDaniel P. 
Berrange } 383f6fa64f6SDaniel P. Berrange if (enc) { 384f6fa64f6SDaniel P. Berrange ret = qcrypto_cipher_encrypt(s->cipher, 385f6fa64f6SDaniel P. Berrange in_buf, 386f6fa64f6SDaniel P. Berrange out_buf, 387f6fa64f6SDaniel P. Berrange 512, 388f6fa64f6SDaniel P. Berrange errp); 389f6fa64f6SDaniel P. Berrange } else { 390f6fa64f6SDaniel P. Berrange ret = qcrypto_cipher_decrypt(s->cipher, 391f6fa64f6SDaniel P. Berrange in_buf, 392f6fa64f6SDaniel P. Berrange out_buf, 393f6fa64f6SDaniel P. Berrange 512, 394f6fa64f6SDaniel P. Berrange errp); 395f6fa64f6SDaniel P. Berrange } 396f6fa64f6SDaniel P. Berrange if (ret < 0) { 397f6fa64f6SDaniel P. Berrange return -1; 398f6fa64f6SDaniel P. Berrange } 39945aba42fSKevin Wolf sector_num++; 40045aba42fSKevin Wolf in_buf += 512; 40145aba42fSKevin Wolf out_buf += 512; 40245aba42fSKevin Wolf } 403f6fa64f6SDaniel P. Berrange return 0; 40445aba42fSKevin Wolf } 40545aba42fSKevin Wolf 406*672f0f2cSAlberto Garcia static int coroutine_fn do_perform_cow_read(BlockDriverState *bs, 407aaa4d20bSKevin Wolf uint64_t src_cluster_offset, 408e034f5bcSAlberto Garcia unsigned offset_in_cluster, 409*672f0f2cSAlberto Garcia uint8_t *buffer, 410e034f5bcSAlberto Garcia unsigned bytes) 41145aba42fSKevin Wolf { 412aef4acb6SStefan Hajnoczi QEMUIOVector qiov; 413*672f0f2cSAlberto Garcia struct iovec iov = { .iov_base = buffer, .iov_len = bytes }; 414aaa4d20bSKevin Wolf int ret; 4151b9f1491SKevin Wolf 41699450c6fSAlberto Garcia if (bytes == 0) { 41799450c6fSAlberto Garcia return 0; 41899450c6fSAlberto Garcia } 41999450c6fSAlberto Garcia 420aef4acb6SStefan Hajnoczi qemu_iovec_init_external(&qiov, &iov, 1); 4211b9f1491SKevin Wolf 42266f82ceeSKevin Wolf BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); 423aef4acb6SStefan Hajnoczi 424dba28555SMax Reitz if (!bs->drv) { 425*672f0f2cSAlberto Garcia return -ENOMEDIUM; 426dba28555SMax Reitz } 427dba28555SMax Reitz 428aef4acb6SStefan Hajnoczi /* Call .bdrv_co_readv() directly instead of using the public block-layer 
429aef4acb6SStefan Hajnoczi * interface. This avoids double I/O throttling and request tracking, 430aef4acb6SStefan Hajnoczi * which can lead to deadlock when block layer copy-on-read is enabled. 431aef4acb6SStefan Hajnoczi */ 432aaa4d20bSKevin Wolf ret = bs->drv->bdrv_co_preadv(bs, src_cluster_offset + offset_in_cluster, 433aaa4d20bSKevin Wolf bytes, &qiov, 0); 4341b9f1491SKevin Wolf if (ret < 0) { 435*672f0f2cSAlberto Garcia return ret; 4361b9f1491SKevin Wolf } 4371b9f1491SKevin Wolf 438*672f0f2cSAlberto Garcia return 0; 439*672f0f2cSAlberto Garcia } 440*672f0f2cSAlberto Garcia 441*672f0f2cSAlberto Garcia static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs, 442*672f0f2cSAlberto Garcia uint64_t src_cluster_offset, 443*672f0f2cSAlberto Garcia unsigned offset_in_cluster, 444*672f0f2cSAlberto Garcia uint8_t *buffer, 445*672f0f2cSAlberto Garcia unsigned bytes) 446*672f0f2cSAlberto Garcia { 447*672f0f2cSAlberto Garcia if (bytes && bs->encrypted) { 448*672f0f2cSAlberto Garcia BDRVQcow2State *s = bs->opaque; 449bb9f8dd0SDaniel P. Berrange int64_t sector = (src_cluster_offset + offset_in_cluster) 450aaa4d20bSKevin Wolf >> BDRV_SECTOR_BITS; 451f6fa64f6SDaniel P. Berrange assert(s->cipher); 452aaa4d20bSKevin Wolf assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0); 453aaa4d20bSKevin Wolf assert((bytes & ~BDRV_SECTOR_MASK) == 0); 454*672f0f2cSAlberto Garcia if (qcow2_encrypt_sectors(s, sector, buffer, buffer, 455026ac158SAlberto Garcia bytes >> BDRV_SECTOR_BITS, true, NULL) < 0) { 456*672f0f2cSAlberto Garcia return false; 457f6fa64f6SDaniel P. 
Berrange } 45845aba42fSKevin Wolf } 459*672f0f2cSAlberto Garcia return true; 460*672f0f2cSAlberto Garcia } 461*672f0f2cSAlberto Garcia 462*672f0f2cSAlberto Garcia static int coroutine_fn do_perform_cow_write(BlockDriverState *bs, 463*672f0f2cSAlberto Garcia uint64_t cluster_offset, 464*672f0f2cSAlberto Garcia unsigned offset_in_cluster, 465*672f0f2cSAlberto Garcia uint8_t *buffer, 466*672f0f2cSAlberto Garcia unsigned bytes) 467*672f0f2cSAlberto Garcia { 468*672f0f2cSAlberto Garcia QEMUIOVector qiov; 469*672f0f2cSAlberto Garcia struct iovec iov = { .iov_base = buffer, .iov_len = bytes }; 470*672f0f2cSAlberto Garcia int ret; 471*672f0f2cSAlberto Garcia 472*672f0f2cSAlberto Garcia if (bytes == 0) { 473*672f0f2cSAlberto Garcia return 0; 474*672f0f2cSAlberto Garcia } 475*672f0f2cSAlberto Garcia 476*672f0f2cSAlberto Garcia qemu_iovec_init_external(&qiov, &iov, 1); 4771b9f1491SKevin Wolf 478231bb267SMax Reitz ret = qcow2_pre_write_overlap_check(bs, 0, 479aaa4d20bSKevin Wolf cluster_offset + offset_in_cluster, bytes); 480cf93980eSMax Reitz if (ret < 0) { 481*672f0f2cSAlberto Garcia return ret; 482cf93980eSMax Reitz } 483cf93980eSMax Reitz 48466f82ceeSKevin Wolf BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); 485a03ef88fSKevin Wolf ret = bdrv_co_pwritev(bs->file, cluster_offset + offset_in_cluster, 486aaa4d20bSKevin Wolf bytes, &qiov, 0); 4871b9f1491SKevin Wolf if (ret < 0) { 488*672f0f2cSAlberto Garcia return ret; 4891b9f1491SKevin Wolf } 4901b9f1491SKevin Wolf 491*672f0f2cSAlberto Garcia return 0; 49245aba42fSKevin Wolf } 49345aba42fSKevin Wolf 49445aba42fSKevin Wolf 49545aba42fSKevin Wolf /* 49645aba42fSKevin Wolf * get_cluster_offset 49745aba42fSKevin Wolf * 498ecfe1863SKevin Wolf * For a given offset of the virtual disk, find the cluster type and offset in 499ecfe1863SKevin Wolf * the qcow2 file. The offset is stored in *cluster_offset. 
50045aba42fSKevin Wolf * 501ecfe1863SKevin Wolf * On entry, *bytes is the maximum number of contiguous bytes starting at 502ecfe1863SKevin Wolf * offset that we are interested in. 50345aba42fSKevin Wolf * 504ecfe1863SKevin Wolf * On exit, *bytes is the number of bytes starting at offset that have the same 505ecfe1863SKevin Wolf * cluster type and (if applicable) are stored contiguously in the image file. 506ecfe1863SKevin Wolf * Compressed clusters are always returned one by one. 50745aba42fSKevin Wolf * 50868d000a3SKevin Wolf * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error 50968d000a3SKevin Wolf * cases. 51045aba42fSKevin Wolf */ 5111c46efaaSKevin Wolf int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, 512ecfe1863SKevin Wolf unsigned int *bytes, uint64_t *cluster_offset) 51345aba42fSKevin Wolf { 514ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 5152cf7cfa1SKevin Wolf unsigned int l2_index; 5162cf7cfa1SKevin Wolf uint64_t l1_index, l2_offset, *l2_table; 51745aba42fSKevin Wolf int l1_bits, c; 518c834cba9SMax Reitz unsigned int offset_in_cluster; 519c834cba9SMax Reitz uint64_t bytes_available, bytes_needed, nb_clusters; 5203ef95218SEric Blake QCow2ClusterType type; 52155c17e98SKevin Wolf int ret; 522b2f65d6bSKevin Wolf 523b2f65d6bSKevin Wolf offset_in_cluster = offset_into_cluster(s, offset); 524ecfe1863SKevin Wolf bytes_needed = (uint64_t) *bytes + offset_in_cluster; 52545aba42fSKevin Wolf 52645aba42fSKevin Wolf l1_bits = s->l2_bits + s->cluster_bits; 52745aba42fSKevin Wolf 528b2f65d6bSKevin Wolf /* compute how many bytes there are between the start of the cluster 529b2f65d6bSKevin Wolf * containing offset and the end of the l1 entry */ 530b2f65d6bSKevin Wolf bytes_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1)) 531b2f65d6bSKevin Wolf + offset_in_cluster; 53245aba42fSKevin Wolf 533b2f65d6bSKevin Wolf if (bytes_needed > bytes_available) { 534b2f65d6bSKevin Wolf bytes_needed = bytes_available; 
53545aba42fSKevin Wolf } 53645aba42fSKevin Wolf 5371c46efaaSKevin Wolf *cluster_offset = 0; 53845aba42fSKevin Wolf 539b6af0975SDaniel P. Berrange /* seek to the l2 offset in the l1 table */ 54045aba42fSKevin Wolf 54145aba42fSKevin Wolf l1_index = offset >> l1_bits; 54268d000a3SKevin Wolf if (l1_index >= s->l1_size) { 5433ef95218SEric Blake type = QCOW2_CLUSTER_UNALLOCATED; 54445aba42fSKevin Wolf goto out; 54568d000a3SKevin Wolf } 54645aba42fSKevin Wolf 54768d000a3SKevin Wolf l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; 54868d000a3SKevin Wolf if (!l2_offset) { 5493ef95218SEric Blake type = QCOW2_CLUSTER_UNALLOCATED; 55045aba42fSKevin Wolf goto out; 55168d000a3SKevin Wolf } 55245aba42fSKevin Wolf 553a97c67eeSMax Reitz if (offset_into_cluster(s, l2_offset)) { 554a97c67eeSMax Reitz qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64 555a97c67eeSMax Reitz " unaligned (L1 index: %#" PRIx64 ")", 556a97c67eeSMax Reitz l2_offset, l1_index); 557a97c67eeSMax Reitz return -EIO; 558a97c67eeSMax Reitz } 559a97c67eeSMax Reitz 56045aba42fSKevin Wolf /* load the l2 table in memory */ 56145aba42fSKevin Wolf 56255c17e98SKevin Wolf ret = l2_load(bs, l2_offset, &l2_table); 56355c17e98SKevin Wolf if (ret < 0) { 56455c17e98SKevin Wolf return ret; 5651c46efaaSKevin Wolf } 56645aba42fSKevin Wolf 56745aba42fSKevin Wolf /* find the cluster offset for the given disk offset */ 56845aba42fSKevin Wolf 56945aba42fSKevin Wolf l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); 5701c46efaaSKevin Wolf *cluster_offset = be64_to_cpu(l2_table[l2_index]); 571b6d36defSMax Reitz 572b2f65d6bSKevin Wolf nb_clusters = size_to_clusters(s, bytes_needed); 573c834cba9SMax Reitz /* bytes_needed <= *bytes + offset_in_cluster, both of which are unsigned 574c834cba9SMax Reitz * integers; the minimum cluster size is 512, so this assertion is always 575c834cba9SMax Reitz * true */ 576c834cba9SMax Reitz assert(nb_clusters <= INT_MAX); 57745aba42fSKevin Wolf 5783ef95218SEric Blake type = 
qcow2_get_cluster_type(*cluster_offset); 579fdfab37dSEric Blake if (s->qcow_version < 3 && (type == QCOW2_CLUSTER_ZERO_PLAIN || 580fdfab37dSEric Blake type == QCOW2_CLUSTER_ZERO_ALLOC)) { 581a97c67eeSMax Reitz qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found" 582a97c67eeSMax Reitz " in pre-v3 image (L2 offset: %#" PRIx64 583a97c67eeSMax Reitz ", L2 index: %#x)", l2_offset, l2_index); 584a97c67eeSMax Reitz ret = -EIO; 585a97c67eeSMax Reitz goto fail; 586381b487dSPaolo Bonzini } 587fdfab37dSEric Blake switch (type) { 588fdfab37dSEric Blake case QCOW2_CLUSTER_COMPRESSED: 589fdfab37dSEric Blake /* Compressed clusters can only be processed one by one */ 590fdfab37dSEric Blake c = 1; 591fdfab37dSEric Blake *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK; 5926377af48SKevin Wolf break; 593fdfab37dSEric Blake case QCOW2_CLUSTER_ZERO_PLAIN: 59468d000a3SKevin Wolf case QCOW2_CLUSTER_UNALLOCATED: 59545aba42fSKevin Wolf /* how many empty clusters ? */ 5964341df8aSEric Blake c = count_contiguous_clusters_unallocated(nb_clusters, 597fdfab37dSEric Blake &l2_table[l2_index], type); 59868d000a3SKevin Wolf *cluster_offset = 0; 59968d000a3SKevin Wolf break; 600fdfab37dSEric Blake case QCOW2_CLUSTER_ZERO_ALLOC: 60168d000a3SKevin Wolf case QCOW2_CLUSTER_NORMAL: 60245aba42fSKevin Wolf /* how many allocated clusters ? 
*/ 60345aba42fSKevin Wolf c = count_contiguous_clusters(nb_clusters, s->cluster_size, 60461653008SKevin Wolf &l2_table[l2_index], QCOW_OFLAG_ZERO); 60568d000a3SKevin Wolf *cluster_offset &= L2E_OFFSET_MASK; 606a97c67eeSMax Reitz if (offset_into_cluster(s, *cluster_offset)) { 607fdfab37dSEric Blake qcow2_signal_corruption(bs, true, -1, -1, 608fdfab37dSEric Blake "Cluster allocation offset %#" 609a97c67eeSMax Reitz PRIx64 " unaligned (L2 offset: %#" PRIx64 610a97c67eeSMax Reitz ", L2 index: %#x)", *cluster_offset, 611a97c67eeSMax Reitz l2_offset, l2_index); 612a97c67eeSMax Reitz ret = -EIO; 613a97c67eeSMax Reitz goto fail; 614a97c67eeSMax Reitz } 61568d000a3SKevin Wolf break; 6161417d7e4SKevin Wolf default: 6171417d7e4SKevin Wolf abort(); 61845aba42fSKevin Wolf } 61945aba42fSKevin Wolf 62029c1a730SKevin Wolf qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); 62129c1a730SKevin Wolf 622c834cba9SMax Reitz bytes_available = (int64_t)c * s->cluster_size; 62368d000a3SKevin Wolf 62445aba42fSKevin Wolf out: 625b2f65d6bSKevin Wolf if (bytes_available > bytes_needed) { 626b2f65d6bSKevin Wolf bytes_available = bytes_needed; 627b2f65d6bSKevin Wolf } 62845aba42fSKevin Wolf 629c834cba9SMax Reitz /* bytes_available <= bytes_needed <= *bytes + offset_in_cluster; 630c834cba9SMax Reitz * subtracting offset_in_cluster will therefore definitely yield something 631c834cba9SMax Reitz * not exceeding UINT_MAX */ 632c834cba9SMax Reitz assert(bytes_available - offset_in_cluster <= UINT_MAX); 633ecfe1863SKevin Wolf *bytes = bytes_available - offset_in_cluster; 63445aba42fSKevin Wolf 6353ef95218SEric Blake return type; 636a97c67eeSMax Reitz 637a97c67eeSMax Reitz fail: 638a97c67eeSMax Reitz qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table); 639a97c67eeSMax Reitz return ret; 64045aba42fSKevin Wolf } 64145aba42fSKevin Wolf 64245aba42fSKevin Wolf /* 64345aba42fSKevin Wolf * get_cluster_table 64445aba42fSKevin Wolf * 64545aba42fSKevin Wolf * for a given disk offset, load (and 
allocate if needed) 64645aba42fSKevin Wolf * the l2 table. 64745aba42fSKevin Wolf * 64845aba42fSKevin Wolf * the l2 table offset in the qcow2 file and the cluster index 64945aba42fSKevin Wolf * in the l2 table are given to the caller. 65045aba42fSKevin Wolf * 6511e3e8f1aSKevin Wolf * Returns 0 on success, -errno in failure case 65245aba42fSKevin Wolf */ 65345aba42fSKevin Wolf static int get_cluster_table(BlockDriverState *bs, uint64_t offset, 65445aba42fSKevin Wolf uint64_t **new_l2_table, 65545aba42fSKevin Wolf int *new_l2_index) 65645aba42fSKevin Wolf { 657ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 6582cf7cfa1SKevin Wolf unsigned int l2_index; 6592cf7cfa1SKevin Wolf uint64_t l1_index, l2_offset; 660c46e1167SKevin Wolf uint64_t *l2_table = NULL; 66180ee15a6SKevin Wolf int ret; 66245aba42fSKevin Wolf 663b6af0975SDaniel P. Berrange /* seek to the l2 offset in the l1 table */ 66445aba42fSKevin Wolf 66545aba42fSKevin Wolf l1_index = offset >> (s->l2_bits + s->cluster_bits); 66645aba42fSKevin Wolf if (l1_index >= s->l1_size) { 66772893756SStefan Hajnoczi ret = qcow2_grow_l1_table(bs, l1_index + 1, false); 6681e3e8f1aSKevin Wolf if (ret < 0) { 6691e3e8f1aSKevin Wolf return ret; 6701e3e8f1aSKevin Wolf } 67145aba42fSKevin Wolf } 6728e37f681SKevin Wolf 6732cf7cfa1SKevin Wolf assert(l1_index < s->l1_size); 6748e37f681SKevin Wolf l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; 675a97c67eeSMax Reitz if (offset_into_cluster(s, l2_offset)) { 676a97c67eeSMax Reitz qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64 677a97c67eeSMax Reitz " unaligned (L1 index: %#" PRIx64 ")", 678a97c67eeSMax Reitz l2_offset, l1_index); 679a97c67eeSMax Reitz return -EIO; 680a97c67eeSMax Reitz } 68145aba42fSKevin Wolf 68245aba42fSKevin Wolf /* seek the l2 table of the given l2 offset */ 68345aba42fSKevin Wolf 6848e37f681SKevin Wolf if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) { 68545aba42fSKevin Wolf /* load the l2 table in memory */ 68655c17e98SKevin Wolf ret = 
l2_load(bs, l2_offset, &l2_table); 68755c17e98SKevin Wolf if (ret < 0) { 68855c17e98SKevin Wolf return ret; 6891e3e8f1aSKevin Wolf } 69045aba42fSKevin Wolf } else { 69116fde5f2SKevin Wolf /* First allocate a new L2 table (and do COW if needed) */ 692c46e1167SKevin Wolf ret = l2_allocate(bs, l1_index, &l2_table); 693c46e1167SKevin Wolf if (ret < 0) { 694c46e1167SKevin Wolf return ret; 6951e3e8f1aSKevin Wolf } 69616fde5f2SKevin Wolf 69716fde5f2SKevin Wolf /* Then decrease the refcount of the old table */ 69816fde5f2SKevin Wolf if (l2_offset) { 6996cfcb9b8SKevin Wolf qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t), 7006cfcb9b8SKevin Wolf QCOW2_DISCARD_OTHER); 70116fde5f2SKevin Wolf } 70245aba42fSKevin Wolf } 70345aba42fSKevin Wolf 70445aba42fSKevin Wolf /* find the cluster offset for the given disk offset */ 70545aba42fSKevin Wolf 70645aba42fSKevin Wolf l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); 70745aba42fSKevin Wolf 70845aba42fSKevin Wolf *new_l2_table = l2_table; 70945aba42fSKevin Wolf *new_l2_index = l2_index; 71045aba42fSKevin Wolf 7111e3e8f1aSKevin Wolf return 0; 71245aba42fSKevin Wolf } 71345aba42fSKevin Wolf 71445aba42fSKevin Wolf /* 71545aba42fSKevin Wolf * alloc_compressed_cluster_offset 71645aba42fSKevin Wolf * 71745aba42fSKevin Wolf * For a given offset of the disk image, return cluster offset in 71845aba42fSKevin Wolf * qcow2 file. 71945aba42fSKevin Wolf * 72045aba42fSKevin Wolf * If the offset is not found, allocate a new compressed cluster. 72145aba42fSKevin Wolf * 72245aba42fSKevin Wolf * Return the cluster offset if successful, 72345aba42fSKevin Wolf * Return 0, otherwise. 
72445aba42fSKevin Wolf * 72545aba42fSKevin Wolf */ 72645aba42fSKevin Wolf 727ed6ccf0fSKevin Wolf uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, 72845aba42fSKevin Wolf uint64_t offset, 72945aba42fSKevin Wolf int compressed_size) 73045aba42fSKevin Wolf { 731ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 73245aba42fSKevin Wolf int l2_index, ret; 7333948d1d4SKevin Wolf uint64_t *l2_table; 734f4f0d391SKevin Wolf int64_t cluster_offset; 73545aba42fSKevin Wolf int nb_csectors; 73645aba42fSKevin Wolf 7373948d1d4SKevin Wolf ret = get_cluster_table(bs, offset, &l2_table, &l2_index); 7381e3e8f1aSKevin Wolf if (ret < 0) { 73945aba42fSKevin Wolf return 0; 7401e3e8f1aSKevin Wolf } 74145aba42fSKevin Wolf 742b0b6862eSKevin Wolf /* Compression can't overwrite anything. Fail if the cluster was already 743b0b6862eSKevin Wolf * allocated. */ 74445aba42fSKevin Wolf cluster_offset = be64_to_cpu(l2_table[l2_index]); 745b0b6862eSKevin Wolf if (cluster_offset & L2E_OFFSET_MASK) { 7468f1efd00SKevin Wolf qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); 7478f1efd00SKevin Wolf return 0; 7488f1efd00SKevin Wolf } 74945aba42fSKevin Wolf 750ed6ccf0fSKevin Wolf cluster_offset = qcow2_alloc_bytes(bs, compressed_size); 7515d757b56SKevin Wolf if (cluster_offset < 0) { 75229c1a730SKevin Wolf qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); 7535d757b56SKevin Wolf return 0; 7545d757b56SKevin Wolf } 7555d757b56SKevin Wolf 75645aba42fSKevin Wolf nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) - 75745aba42fSKevin Wolf (cluster_offset >> 9); 75845aba42fSKevin Wolf 75945aba42fSKevin Wolf cluster_offset |= QCOW_OFLAG_COMPRESSED | 76045aba42fSKevin Wolf ((uint64_t)nb_csectors << s->csize_shift); 76145aba42fSKevin Wolf 76245aba42fSKevin Wolf /* update L2 table */ 76345aba42fSKevin Wolf 76445aba42fSKevin Wolf /* compressed clusters never have the copied flag */ 76545aba42fSKevin Wolf 76666f82ceeSKevin Wolf BLKDBG_EVENT(bs->file, 
BLKDBG_L2_UPDATE_COMPRESSED); 76772e80b89SAlberto Garcia qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); 76845aba42fSKevin Wolf l2_table[l2_index] = cpu_to_be64(cluster_offset); 769a3f1afb4SAlberto Garcia qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); 77045aba42fSKevin Wolf 77145aba42fSKevin Wolf return cluster_offset; 77245aba42fSKevin Wolf } 77345aba42fSKevin Wolf 77499450c6fSAlberto Garcia static int perform_cow(BlockDriverState *bs, QCowL2Meta *m) 775593fb83cSKevin Wolf { 776ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 77799450c6fSAlberto Garcia Qcow2COWRegion *start = &m->cow_start; 77899450c6fSAlberto Garcia Qcow2COWRegion *end = &m->cow_end; 779*672f0f2cSAlberto Garcia unsigned buffer_size; 780*672f0f2cSAlberto Garcia uint8_t *start_buffer, *end_buffer; 781593fb83cSKevin Wolf int ret; 782593fb83cSKevin Wolf 783*672f0f2cSAlberto Garcia assert(start->nb_bytes <= UINT_MAX - end->nb_bytes); 784*672f0f2cSAlberto Garcia 78599450c6fSAlberto Garcia if (start->nb_bytes == 0 && end->nb_bytes == 0) { 786593fb83cSKevin Wolf return 0; 787593fb83cSKevin Wolf } 788593fb83cSKevin Wolf 789*672f0f2cSAlberto Garcia /* Reserve a buffer large enough to store the data from both the 790*672f0f2cSAlberto Garcia * start and end COW regions. 
Add some padding in the middle if 791*672f0f2cSAlberto Garcia * necessary to make sure that the end region is optimally aligned */ 792*672f0f2cSAlberto Garcia buffer_size = QEMU_ALIGN_UP(start->nb_bytes, bdrv_opt_mem_align(bs)) + 793*672f0f2cSAlberto Garcia end->nb_bytes; 794*672f0f2cSAlberto Garcia start_buffer = qemu_try_blockalign(bs, buffer_size); 795*672f0f2cSAlberto Garcia if (start_buffer == NULL) { 796*672f0f2cSAlberto Garcia return -ENOMEM; 797*672f0f2cSAlberto Garcia } 798*672f0f2cSAlberto Garcia /* The part of the buffer where the end region is located */ 799*672f0f2cSAlberto Garcia end_buffer = start_buffer + buffer_size - end->nb_bytes; 800*672f0f2cSAlberto Garcia 801593fb83cSKevin Wolf qemu_co_mutex_unlock(&s->lock); 802*672f0f2cSAlberto Garcia /* First we read the existing data from both COW regions */ 803*672f0f2cSAlberto Garcia ret = do_perform_cow_read(bs, m->offset, start->offset, 804*672f0f2cSAlberto Garcia start_buffer, start->nb_bytes); 805593fb83cSKevin Wolf if (ret < 0) { 80699450c6fSAlberto Garcia goto fail; 807593fb83cSKevin Wolf } 808593fb83cSKevin Wolf 809*672f0f2cSAlberto Garcia ret = do_perform_cow_read(bs, m->offset, end->offset, 810*672f0f2cSAlberto Garcia end_buffer, end->nb_bytes); 811*672f0f2cSAlberto Garcia if (ret < 0) { 812*672f0f2cSAlberto Garcia goto fail; 813*672f0f2cSAlberto Garcia } 81499450c6fSAlberto Garcia 815*672f0f2cSAlberto Garcia /* Encrypt the data if necessary before writing it */ 816*672f0f2cSAlberto Garcia if (bs->encrypted) { 817*672f0f2cSAlberto Garcia if (!do_perform_cow_encrypt(bs, m->offset, start->offset, 818*672f0f2cSAlberto Garcia start_buffer, start->nb_bytes) || 819*672f0f2cSAlberto Garcia !do_perform_cow_encrypt(bs, m->offset, end->offset, 820*672f0f2cSAlberto Garcia end_buffer, end->nb_bytes)) { 821*672f0f2cSAlberto Garcia ret = -EIO; 822*672f0f2cSAlberto Garcia goto fail; 823*672f0f2cSAlberto Garcia } 824*672f0f2cSAlberto Garcia } 825*672f0f2cSAlberto Garcia 826*672f0f2cSAlberto Garcia /* And now we 
can write everything */ 827*672f0f2cSAlberto Garcia ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, 828*672f0f2cSAlberto Garcia start_buffer, start->nb_bytes); 829*672f0f2cSAlberto Garcia if (ret < 0) { 830*672f0f2cSAlberto Garcia goto fail; 831*672f0f2cSAlberto Garcia } 832*672f0f2cSAlberto Garcia 833*672f0f2cSAlberto Garcia ret = do_perform_cow_write(bs, m->alloc_offset, end->offset, 834*672f0f2cSAlberto Garcia end_buffer, end->nb_bytes); 83599450c6fSAlberto Garcia fail: 83699450c6fSAlberto Garcia qemu_co_mutex_lock(&s->lock); 83799450c6fSAlberto Garcia 838593fb83cSKevin Wolf /* 839593fb83cSKevin Wolf * Before we update the L2 table to actually point to the new cluster, we 840593fb83cSKevin Wolf * need to be sure that the refcounts have been increased and COW was 841593fb83cSKevin Wolf * handled. 842593fb83cSKevin Wolf */ 84399450c6fSAlberto Garcia if (ret == 0) { 844593fb83cSKevin Wolf qcow2_cache_depends_on_flush(s->l2_table_cache); 84599450c6fSAlberto Garcia } 846593fb83cSKevin Wolf 847*672f0f2cSAlberto Garcia qemu_vfree(start_buffer); 84899450c6fSAlberto Garcia return ret; 849593fb83cSKevin Wolf } 850593fb83cSKevin Wolf 851148da7eaSKevin Wolf int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) 85245aba42fSKevin Wolf { 853ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 85445aba42fSKevin Wolf int i, j = 0, l2_index, ret; 855593fb83cSKevin Wolf uint64_t *old_cluster, *l2_table; 856250196f1SKevin Wolf uint64_t cluster_offset = m->alloc_offset; 85745aba42fSKevin Wolf 8583cce16f4SKevin Wolf trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters); 859f50f88b9SKevin Wolf assert(m->nb_clusters > 0); 86045aba42fSKevin Wolf 8615839e53bSMarkus Armbruster old_cluster = g_try_new(uint64_t, m->nb_clusters); 862de82815dSKevin Wolf if (old_cluster == NULL) { 863de82815dSKevin Wolf ret = -ENOMEM; 864de82815dSKevin Wolf goto err; 865de82815dSKevin Wolf } 86645aba42fSKevin Wolf 86745aba42fSKevin Wolf /* copy content of unmodified 
sectors */ 86899450c6fSAlberto Garcia ret = perform_cow(bs, m); 869593fb83cSKevin Wolf if (ret < 0) { 87045aba42fSKevin Wolf goto err; 87145aba42fSKevin Wolf } 87245aba42fSKevin Wolf 873593fb83cSKevin Wolf /* Update L2 table. */ 87474c4510aSKevin Wolf if (s->use_lazy_refcounts) { 875280d3735SKevin Wolf qcow2_mark_dirty(bs); 876280d3735SKevin Wolf } 877bfe8043eSStefan Hajnoczi if (qcow2_need_accurate_refcounts(s)) { 878bfe8043eSStefan Hajnoczi qcow2_cache_set_dependency(bs, s->l2_table_cache, 879bfe8043eSStefan Hajnoczi s->refcount_block_cache); 880bfe8043eSStefan Hajnoczi } 881280d3735SKevin Wolf 8823948d1d4SKevin Wolf ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index); 8831e3e8f1aSKevin Wolf if (ret < 0) { 88445aba42fSKevin Wolf goto err; 8851e3e8f1aSKevin Wolf } 88672e80b89SAlberto Garcia qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); 88745aba42fSKevin Wolf 888c01dbccbSMax Reitz assert(l2_index + m->nb_clusters <= s->l2_size); 88945aba42fSKevin Wolf for (i = 0; i < m->nb_clusters; i++) { 89045aba42fSKevin Wolf /* if two concurrent writes happen to the same unallocated cluster 89145aba42fSKevin Wolf * each write allocates separate cluster and writes data concurrently. 89245aba42fSKevin Wolf * The first one to complete updates l2 table with pointer to its 89345aba42fSKevin Wolf * cluster the second one has to do RMW (which is done above by 894aaa4d20bSKevin Wolf * perform_cow()), update l2 table with its cluster pointer and free 89545aba42fSKevin Wolf * old cluster. 
This is what this loop does */ 896aaa4d20bSKevin Wolf if (l2_table[l2_index + i] != 0) { 89745aba42fSKevin Wolf old_cluster[j++] = l2_table[l2_index + i]; 898aaa4d20bSKevin Wolf } 89945aba42fSKevin Wolf 90045aba42fSKevin Wolf l2_table[l2_index + i] = cpu_to_be64((cluster_offset + 90145aba42fSKevin Wolf (i << s->cluster_bits)) | QCOW_OFLAG_COPIED); 90245aba42fSKevin Wolf } 90345aba42fSKevin Wolf 9049f8e668eSKevin Wolf 905a3f1afb4SAlberto Garcia qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); 90645aba42fSKevin Wolf 9077ec5e6a4SKevin Wolf /* 9087ec5e6a4SKevin Wolf * If this was a COW, we need to decrease the refcount of the old cluster. 9096cfcb9b8SKevin Wolf * 9106cfcb9b8SKevin Wolf * Don't discard clusters that reach a refcount of 0 (e.g. compressed 9116cfcb9b8SKevin Wolf * clusters), the next write will reuse them anyway. 9127ec5e6a4SKevin Wolf */ 913564a6b69SMax Reitz if (!m->keep_old_clusters && j != 0) { 9147ec5e6a4SKevin Wolf for (i = 0; i < j; i++) { 9156cfcb9b8SKevin Wolf qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1, 9166cfcb9b8SKevin Wolf QCOW2_DISCARD_NEVER); 9177ec5e6a4SKevin Wolf } 9187ec5e6a4SKevin Wolf } 91945aba42fSKevin Wolf 92045aba42fSKevin Wolf ret = 0; 92145aba42fSKevin Wolf err: 9227267c094SAnthony Liguori g_free(old_cluster); 92345aba42fSKevin Wolf return ret; 92445aba42fSKevin Wolf } 92545aba42fSKevin Wolf 92645aba42fSKevin Wolf /* 927bf319eceSKevin Wolf * Returns the number of contiguous clusters that can be used for an allocating 928bf319eceSKevin Wolf * write, but require COW to be performed (this includes yet unallocated space, 929bf319eceSKevin Wolf * which must copy from the backing file) 930bf319eceSKevin Wolf */ 931ff99129aSKevin Wolf static int count_cow_clusters(BDRVQcow2State *s, int nb_clusters, 932bf319eceSKevin Wolf uint64_t *l2_table, int l2_index) 933bf319eceSKevin Wolf { 934143550a8SKevin Wolf int i; 935bf319eceSKevin Wolf 936143550a8SKevin Wolf for (i = 0; i < nb_clusters; i++) { 937143550a8SKevin 
Wolf uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]); 9383ef95218SEric Blake QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry); 939143550a8SKevin Wolf 940143550a8SKevin Wolf switch(cluster_type) { 941143550a8SKevin Wolf case QCOW2_CLUSTER_NORMAL: 942143550a8SKevin Wolf if (l2_entry & QCOW_OFLAG_COPIED) { 943143550a8SKevin Wolf goto out; 944143550a8SKevin Wolf } 945bf319eceSKevin Wolf break; 946143550a8SKevin Wolf case QCOW2_CLUSTER_UNALLOCATED: 947143550a8SKevin Wolf case QCOW2_CLUSTER_COMPRESSED: 948fdfab37dSEric Blake case QCOW2_CLUSTER_ZERO_PLAIN: 949fdfab37dSEric Blake case QCOW2_CLUSTER_ZERO_ALLOC: 950143550a8SKevin Wolf break; 951143550a8SKevin Wolf default: 952143550a8SKevin Wolf abort(); 953143550a8SKevin Wolf } 954bf319eceSKevin Wolf } 955bf319eceSKevin Wolf 956143550a8SKevin Wolf out: 957bf319eceSKevin Wolf assert(i <= nb_clusters); 958bf319eceSKevin Wolf return i; 959bf319eceSKevin Wolf } 960bf319eceSKevin Wolf 961bf319eceSKevin Wolf /* 962250196f1SKevin Wolf * Check if there already is an AIO write request in flight which allocates 963250196f1SKevin Wolf * the same cluster. In this case we need to wait until the previous 964250196f1SKevin Wolf * request has completed and updated the L2 table accordingly. 96565eb2e35SKevin Wolf * 96665eb2e35SKevin Wolf * Returns: 96765eb2e35SKevin Wolf * 0 if there was no dependency. *cur_bytes indicates the number of 96865eb2e35SKevin Wolf * bytes from guest_offset that can be read before the next 96965eb2e35SKevin Wolf * dependency must be processed (or the request is complete) 97065eb2e35SKevin Wolf * 97165eb2e35SKevin Wolf * -EAGAIN if we had to wait for another request, previously gathered 97265eb2e35SKevin Wolf * information on cluster allocation may be invalid now. The caller 97365eb2e35SKevin Wolf * must start over anyway, so consider *cur_bytes undefined. 
974250196f1SKevin Wolf */ 975226c3c26SKevin Wolf static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, 976ecdd5333SKevin Wolf uint64_t *cur_bytes, QCowL2Meta **m) 977226c3c26SKevin Wolf { 978ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 979226c3c26SKevin Wolf QCowL2Meta *old_alloc; 98065eb2e35SKevin Wolf uint64_t bytes = *cur_bytes; 981226c3c26SKevin Wolf 982250196f1SKevin Wolf QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) { 983250196f1SKevin Wolf 98465eb2e35SKevin Wolf uint64_t start = guest_offset; 98565eb2e35SKevin Wolf uint64_t end = start + bytes; 98665eb2e35SKevin Wolf uint64_t old_start = l2meta_cow_start(old_alloc); 98765eb2e35SKevin Wolf uint64_t old_end = l2meta_cow_end(old_alloc); 988250196f1SKevin Wolf 989d9d74f41SKevin Wolf if (end <= old_start || start >= old_end) { 990250196f1SKevin Wolf /* No intersection */ 991250196f1SKevin Wolf } else { 992250196f1SKevin Wolf if (start < old_start) { 993250196f1SKevin Wolf /* Stop at the start of a running allocation */ 99465eb2e35SKevin Wolf bytes = old_start - start; 995250196f1SKevin Wolf } else { 99665eb2e35SKevin Wolf bytes = 0; 997250196f1SKevin Wolf } 998250196f1SKevin Wolf 999ecdd5333SKevin Wolf /* Stop if already an l2meta exists. After yielding, it wouldn't 1000ecdd5333SKevin Wolf * be valid any more, so we'd have to clean up the old L2Metas 1001ecdd5333SKevin Wolf * and deal with requests depending on them before starting to 1002ecdd5333SKevin Wolf * gather new ones. Not worth the trouble. */ 1003ecdd5333SKevin Wolf if (bytes == 0 && *m) { 1004ecdd5333SKevin Wolf *cur_bytes = 0; 1005ecdd5333SKevin Wolf return 0; 1006ecdd5333SKevin Wolf } 1007ecdd5333SKevin Wolf 100865eb2e35SKevin Wolf if (bytes == 0) { 1009250196f1SKevin Wolf /* Wait for the dependency to complete. We need to recheck 1010250196f1SKevin Wolf * the free/allocated clusters when we continue. 
*/ 10111ace7ceaSPaolo Bonzini qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock); 1012250196f1SKevin Wolf return -EAGAIN; 1013250196f1SKevin Wolf } 1014250196f1SKevin Wolf } 1015250196f1SKevin Wolf } 1016250196f1SKevin Wolf 101765eb2e35SKevin Wolf /* Make sure that existing clusters and new allocations are only used up to 101865eb2e35SKevin Wolf * the next dependency if we shortened the request above */ 101965eb2e35SKevin Wolf *cur_bytes = bytes; 1020250196f1SKevin Wolf 1021226c3c26SKevin Wolf return 0; 1022226c3c26SKevin Wolf } 1023226c3c26SKevin Wolf 1024226c3c26SKevin Wolf /* 10250af729ecSKevin Wolf * Checks how many already allocated clusters that don't require a copy on 10260af729ecSKevin Wolf * write there are at the given guest_offset (up to *bytes). If 10270af729ecSKevin Wolf * *host_offset is not zero, only physically contiguous clusters beginning at 10280af729ecSKevin Wolf * this host offset are counted. 10290af729ecSKevin Wolf * 1030411d62b0SKevin Wolf * Note that guest_offset may not be cluster aligned. In this case, the 1031411d62b0SKevin Wolf * returned *host_offset points to exact byte referenced by guest_offset and 1032411d62b0SKevin Wolf * therefore isn't cluster aligned as well. 10330af729ecSKevin Wolf * 10340af729ecSKevin Wolf * Returns: 10350af729ecSKevin Wolf * 0: if no allocated clusters are available at the given offset. 10360af729ecSKevin Wolf * *bytes is normally unchanged. It is set to 0 if the cluster 10370af729ecSKevin Wolf * is allocated and doesn't need COW, but doesn't have the right 10380af729ecSKevin Wolf * physical offset. 10390af729ecSKevin Wolf * 10400af729ecSKevin Wolf * 1: if allocated clusters that don't require a COW are available at 10410af729ecSKevin Wolf * the requested offset. *bytes may have decreased and describes 10420af729ecSKevin Wolf * the length of the area that can be written to. 
10430af729ecSKevin Wolf * 10440af729ecSKevin Wolf * -errno: in error cases 10450af729ecSKevin Wolf */ 10460af729ecSKevin Wolf static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, 1047c53ede9fSKevin Wolf uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) 10480af729ecSKevin Wolf { 1049ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 10500af729ecSKevin Wolf int l2_index; 10510af729ecSKevin Wolf uint64_t cluster_offset; 10520af729ecSKevin Wolf uint64_t *l2_table; 1053b6d36defSMax Reitz uint64_t nb_clusters; 1054c53ede9fSKevin Wolf unsigned int keep_clusters; 1055a3f1afb4SAlberto Garcia int ret; 10560af729ecSKevin Wolf 10570af729ecSKevin Wolf trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset, 10580af729ecSKevin Wolf *bytes); 10590af729ecSKevin Wolf 1060411d62b0SKevin Wolf assert(*host_offset == 0 || offset_into_cluster(s, guest_offset) 1061411d62b0SKevin Wolf == offset_into_cluster(s, *host_offset)); 1062411d62b0SKevin Wolf 1063acb0467fSKevin Wolf /* 1064acb0467fSKevin Wolf * Calculate the number of clusters to look for. We stop at L2 table 1065acb0467fSKevin Wolf * boundaries to keep things simple. 
1066acb0467fSKevin Wolf */ 1067acb0467fSKevin Wolf nb_clusters = 1068acb0467fSKevin Wolf size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes); 1069acb0467fSKevin Wolf 1070acb0467fSKevin Wolf l2_index = offset_to_l2_index(s, guest_offset); 1071acb0467fSKevin Wolf nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); 1072b6d36defSMax Reitz assert(nb_clusters <= INT_MAX); 1073acb0467fSKevin Wolf 10740af729ecSKevin Wolf /* Find L2 entry for the first involved cluster */ 10750af729ecSKevin Wolf ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); 10760af729ecSKevin Wolf if (ret < 0) { 10770af729ecSKevin Wolf return ret; 10780af729ecSKevin Wolf } 10790af729ecSKevin Wolf 10800af729ecSKevin Wolf cluster_offset = be64_to_cpu(l2_table[l2_index]); 10810af729ecSKevin Wolf 10820af729ecSKevin Wolf /* Check how many clusters are already allocated and don't need COW */ 10830af729ecSKevin Wolf if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL 10840af729ecSKevin Wolf && (cluster_offset & QCOW_OFLAG_COPIED)) 10850af729ecSKevin Wolf { 1086e62daaf6SKevin Wolf /* If a specific host_offset is required, check it */ 1087e62daaf6SKevin Wolf bool offset_matches = 1088e62daaf6SKevin Wolf (cluster_offset & L2E_OFFSET_MASK) == *host_offset; 1089e62daaf6SKevin Wolf 1090a97c67eeSMax Reitz if (offset_into_cluster(s, cluster_offset & L2E_OFFSET_MASK)) { 1091a97c67eeSMax Reitz qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset " 1092a97c67eeSMax Reitz "%#llx unaligned (guest offset: %#" PRIx64 1093a97c67eeSMax Reitz ")", cluster_offset & L2E_OFFSET_MASK, 1094a97c67eeSMax Reitz guest_offset); 1095a97c67eeSMax Reitz ret = -EIO; 1096a97c67eeSMax Reitz goto out; 1097a97c67eeSMax Reitz } 1098a97c67eeSMax Reitz 1099e62daaf6SKevin Wolf if (*host_offset != 0 && !offset_matches) { 1100e62daaf6SKevin Wolf *bytes = 0; 1101e62daaf6SKevin Wolf ret = 0; 1102e62daaf6SKevin Wolf goto out; 1103e62daaf6SKevin Wolf } 1104e62daaf6SKevin Wolf 11050af729ecSKevin 
Wolf /* We keep all QCOW_OFLAG_COPIED clusters */ 1106c53ede9fSKevin Wolf keep_clusters = 1107acb0467fSKevin Wolf count_contiguous_clusters(nb_clusters, s->cluster_size, 110861653008SKevin Wolf &l2_table[l2_index], 11090af729ecSKevin Wolf QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO); 1110c53ede9fSKevin Wolf assert(keep_clusters <= nb_clusters); 1111c53ede9fSKevin Wolf 1112c53ede9fSKevin Wolf *bytes = MIN(*bytes, 1113c53ede9fSKevin Wolf keep_clusters * s->cluster_size 1114c53ede9fSKevin Wolf - offset_into_cluster(s, guest_offset)); 11150af729ecSKevin Wolf 11160af729ecSKevin Wolf ret = 1; 11170af729ecSKevin Wolf } else { 11180af729ecSKevin Wolf ret = 0; 11190af729ecSKevin Wolf } 11200af729ecSKevin Wolf 11210af729ecSKevin Wolf /* Cleanup */ 1122e62daaf6SKevin Wolf out: 1123a3f1afb4SAlberto Garcia qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); 11240af729ecSKevin Wolf 1125e62daaf6SKevin Wolf /* Only return a host offset if we actually made progress. Otherwise we 1126e62daaf6SKevin Wolf * would make requirements for handle_alloc() that it can't fulfill */ 1127a97c67eeSMax Reitz if (ret > 0) { 1128411d62b0SKevin Wolf *host_offset = (cluster_offset & L2E_OFFSET_MASK) 1129411d62b0SKevin Wolf + offset_into_cluster(s, guest_offset); 1130e62daaf6SKevin Wolf } 1131e62daaf6SKevin Wolf 11320af729ecSKevin Wolf return ret; 11330af729ecSKevin Wolf } 11340af729ecSKevin Wolf 11350af729ecSKevin Wolf /* 1136226c3c26SKevin Wolf * Allocates new clusters for the given guest_offset. 1137226c3c26SKevin Wolf * 1138226c3c26SKevin Wolf * At most *nb_clusters are allocated, and on return *nb_clusters is updated to 1139226c3c26SKevin Wolf * contain the number of clusters that have been allocated and are contiguous 1140226c3c26SKevin Wolf * in the image file. 1141226c3c26SKevin Wolf * 1142226c3c26SKevin Wolf * If *host_offset is non-zero, it specifies the offset in the image file at 1143226c3c26SKevin Wolf * which the new clusters must start. 
*nb_clusters can be 0 on return in this 1144226c3c26SKevin Wolf * case if the cluster at host_offset is already in use. If *host_offset is 1145226c3c26SKevin Wolf * zero, the clusters can be allocated anywhere in the image file. 1146226c3c26SKevin Wolf * 1147226c3c26SKevin Wolf * *host_offset is updated to contain the offset into the image file at which 1148226c3c26SKevin Wolf * the first allocated cluster starts. 1149226c3c26SKevin Wolf * 1150226c3c26SKevin Wolf * Return 0 on success and -errno in error cases. -EAGAIN means that the 1151226c3c26SKevin Wolf * function has been waiting for another request and the allocation must be 1152226c3c26SKevin Wolf * restarted, but the whole request should not be failed. 1153226c3c26SKevin Wolf */ 1154226c3c26SKevin Wolf static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, 1155b6d36defSMax Reitz uint64_t *host_offset, uint64_t *nb_clusters) 1156226c3c26SKevin Wolf { 1157ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 1158226c3c26SKevin Wolf 1159226c3c26SKevin Wolf trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset, 1160226c3c26SKevin Wolf *host_offset, *nb_clusters); 1161226c3c26SKevin Wolf 1162250196f1SKevin Wolf /* Allocate new clusters */ 1163250196f1SKevin Wolf trace_qcow2_cluster_alloc_phys(qemu_coroutine_self()); 1164250196f1SKevin Wolf if (*host_offset == 0) { 1165df021791SKevin Wolf int64_t cluster_offset = 1166df021791SKevin Wolf qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size); 1167250196f1SKevin Wolf if (cluster_offset < 0) { 1168250196f1SKevin Wolf return cluster_offset; 1169250196f1SKevin Wolf } 1170250196f1SKevin Wolf *host_offset = cluster_offset; 1171250196f1SKevin Wolf return 0; 1172df021791SKevin Wolf } else { 1173b6d36defSMax Reitz int64_t ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters); 1174df021791SKevin Wolf if (ret < 0) { 1175df021791SKevin Wolf return ret; 1176df021791SKevin Wolf } 1177df021791SKevin Wolf *nb_clusters = ret; 
1178df021791SKevin Wolf return 0; 1179df021791SKevin Wolf } 1180250196f1SKevin Wolf } 1181250196f1SKevin Wolf 1182250196f1SKevin Wolf /* 118310f0ed8bSKevin Wolf * Allocates new clusters for an area that either is yet unallocated or needs a 118410f0ed8bSKevin Wolf * copy on write. If *host_offset is non-zero, clusters are only allocated if 118510f0ed8bSKevin Wolf * the new allocation can match the specified host offset. 118610f0ed8bSKevin Wolf * 1187411d62b0SKevin Wolf * Note that guest_offset may not be cluster aligned. In this case, the 1188411d62b0SKevin Wolf * returned *host_offset points to exact byte referenced by guest_offset and 1189411d62b0SKevin Wolf * therefore isn't cluster aligned as well. 119010f0ed8bSKevin Wolf * 119110f0ed8bSKevin Wolf * Returns: 119210f0ed8bSKevin Wolf * 0: if no clusters could be allocated. *bytes is set to 0, 119310f0ed8bSKevin Wolf * *host_offset is left unchanged. 119410f0ed8bSKevin Wolf * 119510f0ed8bSKevin Wolf * 1: if new clusters were allocated. *bytes may be decreased if the 119610f0ed8bSKevin Wolf * new allocation doesn't cover all of the requested area. 119710f0ed8bSKevin Wolf * *host_offset is updated to contain the host offset of the first 119810f0ed8bSKevin Wolf * newly allocated cluster. 
119910f0ed8bSKevin Wolf * 120010f0ed8bSKevin Wolf * -errno: in error cases 120110f0ed8bSKevin Wolf */ 120210f0ed8bSKevin Wolf static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, 1203c37f4cd7SKevin Wolf uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) 120410f0ed8bSKevin Wolf { 1205ff99129aSKevin Wolf BDRVQcow2State *s = bs->opaque; 120610f0ed8bSKevin Wolf int l2_index; 120710f0ed8bSKevin Wolf uint64_t *l2_table; 120810f0ed8bSKevin Wolf uint64_t entry; 1209b6d36defSMax Reitz uint64_t nb_clusters; 121010f0ed8bSKevin Wolf int ret; 1211564a6b69SMax Reitz bool keep_old_clusters = false; 121210f0ed8bSKevin Wolf 1213564a6b69SMax Reitz uint64_t alloc_cluster_offset = 0; 121410f0ed8bSKevin Wolf 121510f0ed8bSKevin Wolf trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset, 121610f0ed8bSKevin Wolf *bytes); 121710f0ed8bSKevin Wolf assert(*bytes > 0); 121810f0ed8bSKevin Wolf 1219f5bc6350SKevin Wolf /* 1220f5bc6350SKevin Wolf * Calculate the number of clusters to look for. We stop at L2 table 1221f5bc6350SKevin Wolf * boundaries to keep things simple. 
1222f5bc6350SKevin Wolf */ 1223c37f4cd7SKevin Wolf nb_clusters = 1224c37f4cd7SKevin Wolf size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes); 1225c37f4cd7SKevin Wolf 1226f5bc6350SKevin Wolf l2_index = offset_to_l2_index(s, guest_offset); 1227c37f4cd7SKevin Wolf nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); 1228b6d36defSMax Reitz assert(nb_clusters <= INT_MAX); 1229f5bc6350SKevin Wolf 123010f0ed8bSKevin Wolf /* Find L2 entry for the first involved cluster */ 123110f0ed8bSKevin Wolf ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); 123210f0ed8bSKevin Wolf if (ret < 0) { 123310f0ed8bSKevin Wolf return ret; 123410f0ed8bSKevin Wolf } 123510f0ed8bSKevin Wolf 12363b8e2e26SKevin Wolf entry = be64_to_cpu(l2_table[l2_index]); 123710f0ed8bSKevin Wolf 123810f0ed8bSKevin Wolf /* For the moment, overwrite compressed clusters one by one */ 123910f0ed8bSKevin Wolf if (entry & QCOW_OFLAG_COMPRESSED) { 124010f0ed8bSKevin Wolf nb_clusters = 1; 124110f0ed8bSKevin Wolf } else { 12423b8e2e26SKevin Wolf nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index); 124310f0ed8bSKevin Wolf } 124410f0ed8bSKevin Wolf 1245ecdd5333SKevin Wolf /* This function is only called when there were no non-COW clusters, so if 1246ecdd5333SKevin Wolf * we can't find any unallocated or COW clusters either, something is 1247ecdd5333SKevin Wolf * wrong with our code. 
*/ 1248ecdd5333SKevin Wolf assert(nb_clusters > 0); 1249ecdd5333SKevin Wolf 1250fdfab37dSEric Blake if (qcow2_get_cluster_type(entry) == QCOW2_CLUSTER_ZERO_ALLOC && 1251fdfab37dSEric Blake (entry & QCOW_OFLAG_COPIED) && 1252564a6b69SMax Reitz (!*host_offset || 1253564a6b69SMax Reitz start_of_cluster(s, *host_offset) == (entry & L2E_OFFSET_MASK))) 1254564a6b69SMax Reitz { 1255564a6b69SMax Reitz /* Try to reuse preallocated zero clusters; contiguous normal clusters 1256564a6b69SMax Reitz * would be fine, too, but count_cow_clusters() above has limited 1257564a6b69SMax Reitz * nb_clusters already to a range of COW clusters */ 1258564a6b69SMax Reitz int preallocated_nb_clusters = 1259564a6b69SMax Reitz count_contiguous_clusters(nb_clusters, s->cluster_size, 1260564a6b69SMax Reitz &l2_table[l2_index], QCOW_OFLAG_COPIED); 1261564a6b69SMax Reitz assert(preallocated_nb_clusters > 0); 1262564a6b69SMax Reitz 1263564a6b69SMax Reitz nb_clusters = preallocated_nb_clusters; 1264564a6b69SMax Reitz alloc_cluster_offset = entry & L2E_OFFSET_MASK; 1265564a6b69SMax Reitz 1266564a6b69SMax Reitz /* We want to reuse these clusters, so qcow2_alloc_cluster_link_l2() 1267564a6b69SMax Reitz * should not free them. 
*/ 1268564a6b69SMax Reitz keep_old_clusters = true; 1269564a6b69SMax Reitz } 1270564a6b69SMax Reitz 1271a3f1afb4SAlberto Garcia qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); 127210f0ed8bSKevin Wolf 1273564a6b69SMax Reitz if (!alloc_cluster_offset) { 127410f0ed8bSKevin Wolf /* Allocate, if necessary at a given offset in the image file */ 1275411d62b0SKevin Wolf alloc_cluster_offset = start_of_cluster(s, *host_offset); 127683baa9a4SKevin Wolf ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset, 127710f0ed8bSKevin Wolf &nb_clusters); 127810f0ed8bSKevin Wolf if (ret < 0) { 127910f0ed8bSKevin Wolf goto fail; 128010f0ed8bSKevin Wolf } 128110f0ed8bSKevin Wolf 128283baa9a4SKevin Wolf /* Can't extend contiguous allocation */ 128383baa9a4SKevin Wolf if (nb_clusters == 0) { 128483baa9a4SKevin Wolf *bytes = 0; 128583baa9a4SKevin Wolf return 0; 128683baa9a4SKevin Wolf } 128783baa9a4SKevin Wolf 1288564a6b69SMax Reitz /* !*host_offset would overwrite the image header and is reserved for 1289564a6b69SMax Reitz * "no host offset preferred". If 0 was a valid host offset, it'd 1290564a6b69SMax Reitz * trigger the following overlap check; do that now to avoid having an 1291564a6b69SMax Reitz * invalid value in *host_offset. */ 1292ff52aab2SMax Reitz if (!alloc_cluster_offset) { 1293ff52aab2SMax Reitz ret = qcow2_pre_write_overlap_check(bs, 0, alloc_cluster_offset, 1294ff52aab2SMax Reitz nb_clusters * s->cluster_size); 1295ff52aab2SMax Reitz assert(ret < 0); 1296ff52aab2SMax Reitz goto fail; 1297ff52aab2SMax Reitz } 1298564a6b69SMax Reitz } 1299ff52aab2SMax Reitz 130010f0ed8bSKevin Wolf /* 130183baa9a4SKevin Wolf * Save info needed for meta data update. 130283baa9a4SKevin Wolf * 130385567393SKevin Wolf * requested_bytes: Number of bytes from the start of the first 130410f0ed8bSKevin Wolf * newly allocated cluster to the end of the (possibly shortened 130510f0ed8bSKevin Wolf * before) write request. 
130610f0ed8bSKevin Wolf * 130785567393SKevin Wolf * avail_bytes: Number of bytes from the start of the first 130810f0ed8bSKevin Wolf * newly allocated to the end of the last newly allocated cluster. 130910f0ed8bSKevin Wolf * 131085567393SKevin Wolf * nb_bytes: The number of bytes from the start of the first 131183baa9a4SKevin Wolf * newly allocated cluster to the end of the area that the write 131210f0ed8bSKevin Wolf * request actually writes to (excluding COW at the end) 131310f0ed8bSKevin Wolf */ 131485567393SKevin Wolf uint64_t requested_bytes = *bytes + offset_into_cluster(s, guest_offset); 131585567393SKevin Wolf int avail_bytes = MIN(INT_MAX, nb_clusters << s->cluster_bits); 131685567393SKevin Wolf int nb_bytes = MIN(requested_bytes, avail_bytes); 131788c6588cSKevin Wolf QCowL2Meta *old_m = *m; 131810f0ed8bSKevin Wolf 131910f0ed8bSKevin Wolf *m = g_malloc0(sizeof(**m)); 132010f0ed8bSKevin Wolf 132110f0ed8bSKevin Wolf **m = (QCowL2Meta) { 132288c6588cSKevin Wolf .next = old_m, 132388c6588cSKevin Wolf 1324411d62b0SKevin Wolf .alloc_offset = alloc_cluster_offset, 132583baa9a4SKevin Wolf .offset = start_of_cluster(s, guest_offset), 132610f0ed8bSKevin Wolf .nb_clusters = nb_clusters, 132710f0ed8bSKevin Wolf 1328564a6b69SMax Reitz .keep_old_clusters = keep_old_clusters, 1329564a6b69SMax Reitz 133010f0ed8bSKevin Wolf .cow_start = { 133110f0ed8bSKevin Wolf .offset = 0, 133285567393SKevin Wolf .nb_bytes = offset_into_cluster(s, guest_offset), 133310f0ed8bSKevin Wolf }, 133410f0ed8bSKevin Wolf .cow_end = { 133585567393SKevin Wolf .offset = nb_bytes, 133685567393SKevin Wolf .nb_bytes = avail_bytes - nb_bytes, 133710f0ed8bSKevin Wolf }, 133810f0ed8bSKevin Wolf }; 133910f0ed8bSKevin Wolf qemu_co_queue_init(&(*m)->dependent_requests); 134010f0ed8bSKevin Wolf QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight); 134110f0ed8bSKevin Wolf 1342411d62b0SKevin Wolf *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset); 134385567393SKevin Wolf *bytes = 
MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset)); 1344c37f4cd7SKevin Wolf assert(*bytes != 0); 134510f0ed8bSKevin Wolf 134610f0ed8bSKevin Wolf return 1; 134710f0ed8bSKevin Wolf 134810f0ed8bSKevin Wolf fail: 134910f0ed8bSKevin Wolf if (*m && (*m)->nb_clusters > 0) { 135010f0ed8bSKevin Wolf QLIST_REMOVE(*m, next_in_flight); 135110f0ed8bSKevin Wolf } 135210f0ed8bSKevin Wolf return ret; 135310f0ed8bSKevin Wolf } 135410f0ed8bSKevin Wolf 135510f0ed8bSKevin Wolf /* 135645aba42fSKevin Wolf * alloc_cluster_offset 135745aba42fSKevin Wolf * 1358250196f1SKevin Wolf * For a given offset on the virtual disk, find the cluster offset in qcow2 1359250196f1SKevin Wolf * file. If the offset is not found, allocate a new cluster. 136045aba42fSKevin Wolf * 1361250196f1SKevin Wolf * If the cluster was already allocated, m->nb_clusters is set to 0 and 1362a7912369SFrediano Ziglio * other fields in m are meaningless. 136345aba42fSKevin Wolf * 1364148da7eaSKevin Wolf * If the cluster is newly allocated, m->nb_clusters is set to the number of 136568d100e9SKevin Wolf * contiguous clusters that have been allocated. In this case, the other 136668d100e9SKevin Wolf * fields of m are valid and contain information about the first allocated 136768d100e9SKevin Wolf * cluster. 1368148da7eaSKevin Wolf * 136968d100e9SKevin Wolf * If the request conflicts with another write request in flight, the coroutine 137068d100e9SKevin Wolf * is queued and will be reentered when the dependency has completed. 
 *
 * Return 0 on success and -errno in error cases
 *
 * On entry *bytes is the length of the request starting at guest offset
 * @offset. On success it has been reduced to the number of bytes (always
 * non-zero) that this call actually covered, and *host_offset holds the
 * non-zero result of start_of_cluster() applied to the first host offset that
 * was found. *m is reset to NULL before it is handed to the helpers below
 * (handle_alloc() prepends a new QCowL2Meta for every allocation it makes).
 */
int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
                               unsigned int *bytes, uint64_t *host_offset,
                               QCowL2Meta **m)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t start, remaining;
    uint64_t cluster_offset;
    uint64_t cur_bytes;
    int ret;

    trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *bytes);

    /* (Re)start from scratch; this is also where we come back to when
     * handle_dependencies() returned -EAGAIN. */
again:
    start = offset;
    remaining = *bytes;
    cluster_offset = 0;
    *host_offset = 0;
    cur_bytes = 0;
    *m = NULL;

    while (true) {

        /* Latch the first non-zero cluster_offset. It starts out as 0 and is
         * only changed through the pointer passed to handle_copied() and
         * handle_alloc(), and by the advance below. */
        if (!*host_offset) {
            *host_offset = start_of_cluster(s, cluster_offset);
        }

        assert(remaining >= cur_bytes);

        /* Skip what the previous iteration handled (cur_bytes is 0 on the
         * first pass, so this is a no-op then). */
        start           += cur_bytes;
        remaining       -= cur_bytes;
        cluster_offset  += cur_bytes;

        if (remaining == 0) {
            break;
        }

        /* Try to cover everything that is left; each step below may shrink
         * cur_bytes through the pointer it is given. */
        cur_bytes = remaining;

        /*
         * Now start gathering as many contiguous clusters as possible:
         *
         * 1. Check for overlaps with in-flight allocations
         *
         *      a) Overlap not in the first cluster -> shorten this request and
         *         let the caller handle the rest in its next loop iteration.
         *
         *      b) Real overlaps of two requests. Yield and restart the search
         *         for contiguous clusters (the situation could have changed
         *         while we were sleeping)
         *
         *      c) TODO: Request starts in the same cluster as the in-flight
         *         allocation ends. Shorten the COW of the in-flight allocation,
         *         set cluster_offset to write to the same cluster and set up
         *         the right synchronisation between the in-flight request and
         *         the new one.
         */
        ret = handle_dependencies(bs, start, &cur_bytes, m);
        if (ret == -EAGAIN) {
            /* Currently handle_dependencies() doesn't yield if we already had
             * an allocation. If it did, we would have to clean up the L2Meta
             * structs before starting over. */
            assert(*m == NULL);
            goto again;
        } else if (ret < 0) {
            return ret;
        } else if (cur_bytes == 0) {
            break;
        } else {
            /* handle_dependencies() may have decreased cur_bytes (shortened
             * the allocations below) so that the next dependency is processed
             * correctly during the next loop iteration. */
        }

        /*
         * 2. Count contiguous COPIED clusters.
         *
         * A positive return value means this step handled cur_bytes bytes, so
         * go round again for the rest; 0 with cur_bytes == 0 means nothing
         * more can be done in this call.
         */
        ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            continue;
        } else if (cur_bytes == 0) {
            break;
        }

        /*
         * 3. If the request still hasn't completed, allocate new clusters,
         *    considering any cluster_offset of steps 1c or 2.
         *
         * handle_alloc() returning 0 is only expected together with
         * cur_bytes == 0 (asserted below), which ends the loop.
         */
        ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            continue;
        } else {
            assert(cur_bytes == 0);
            break;
        }
    }

    /* remaining is the part of the request that was not handled; report only
     * the part that was. */
    *bytes -= remaining;
    assert(*bytes > 0);
    assert(*host_offset != 0);

    return 0;
}

/*
 * Inflate the buf_size bytes at buf into out_buf, which has to end up holding
 * exactly out_buf_size bytes of output.
 *
 * inflateInit2() is called with a negative windowBits value (-12); per the
 * zlib manual that selects a raw deflate stream without zlib/gzip header.
 *
 * Returns 0 on success and -1 if zlib could not be initialised, if inflate()
 * returned anything other than Z_STREAM_END or Z_BUF_ERROR, or if the number
 * of bytes produced differs from out_buf_size.
 */
static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
                             const uint8_t *buf, int buf_size)
{
    z_stream strm1, *strm = &strm1;
    int ret, out_len;

    memset(strm, 0, sizeof(*strm));

    /* The cast only drops the const qualifier for zlib's next_in field */
    strm->next_in = (uint8_t *)buf;
    strm->avail_in = buf_size;
    strm->next_out = out_buf;
    strm->avail_out = out_buf_size;

    ret = inflateInit2(strm, -12);
    if (ret != Z_OK)
        return -1;
    ret = inflate(strm, Z_FINISH);
    out_len = strm->next_out - out_buf;
    /* Z_BUF_ERROR is tolerated, but only if the output buffer was filled
     * completely anyway. */
    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
        out_len != out_buf_size) {
        inflateEnd(strm);
        return -1;
    }
    inflateEnd(strm);
    return 0;
}

/*
 * Make sure s->cluster_cache holds the decompressed contents of the
 * compressed cluster described by cluster_offset.
 *
 * cluster_offset is decoded with fields of the driver state: masking with
 * s->cluster_offset_mask yields the byte offset of the compressed data in
 * bs->file, and the bits selected by s->csize_shift / s->csize_mask hold the
 * number of 512-byte sectors to read, minus one.
 *
 * Nothing is read if s->cluster_cache_offset already equals that byte offset;
 * it is only updated after a successful decompression.
 *
 * Returns 0 on success, the negative bdrv_read() result if reading failed, or
 * -EIO if the data could not be decompressed into exactly s->cluster_size
 * bytes.
 */
int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
{
    BDRVQcow2State *s = bs->opaque;
    int ret, csize, nb_csectors, sector_offset;
    uint64_t coffset;

    coffset = cluster_offset & s->cluster_offset_mask;
    if (s->cluster_cache_offset != coffset) {
        nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
        /* The read below starts at a 512-byte sector boundary, so the
         * compressed data begins sector_offset bytes into s->cluster_data
         * and at most csize bytes of what was read belong to it. */
        sector_offset = coffset & 511;
        csize = nb_csectors * 512 - sector_offset;
        BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
        ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data,
                        nb_csectors);
        if (ret < 0) {
            return ret;
        }
        if (decompress_buffer(s->cluster_cache, s->cluster_size,
                              s->cluster_data + sector_offset, csize) < 0) {
            return -EIO;
        }
        s->cluster_cache_offset = coffset;
    }
    return 0;
}

/*
 * This discards as many clusters of nb_clusters as possible at once (i.e.
 * all clusters in the same L2 table) and returns the number of discarded
 * clusters.
 *
 * offset selects the first cluster; get_cluster_table() turns it into an L2
 * table and a start index within it. type is passed through unchanged to
 * qcow2_free_any_clusters(); see the comment inside the loop for the meaning
 * of full_discard.
 *
 * The value returned on success is the number of L2 entries walked after
 * clamping to the end of the L2 table, which includes entries that were
 * skipped because they needed no change. A negative value returned by
 * get_cluster_table() is passed on as is.
 */
static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
                             uint64_t nb_clusters,
                             enum qcow2_discard_type type, bool full_discard)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table;
    int l2_index;
    int ret;
    int i;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    /* Limit nb_clusters to one L2 table */
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
    /* The count is returned as an int and compared against the int loop
     * counter, so it has to fit. */
    assert(nb_clusters <= INT_MAX);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t old_l2_entry;

        /* L2 entries are kept big-endian in the table */
        old_l2_entry = be64_to_cpu(l2_table[l2_index + i]);

        /*
         * If full_discard is false, make sure that a discarded area reads back
         * as zeroes for v3 images (we cannot do it for v2 without actually
         * writing a zero-filled buffer). We can skip the operation if the
         * cluster is already marked as zero, or if it's unallocated and we
         * don't have a backing file.
         *
         * TODO We might want to use bdrv_get_block_status(bs) here, but we're
         * holding s->lock, so that doesn't work today.
         *
         * If full_discard is true, the sector should not read back as zeroes,
         * but rather fall through to the backing file.
         */
        switch (qcow2_get_cluster_type(old_l2_entry)) {
        case QCOW2_CLUSTER_UNALLOCATED:
            if (full_discard || !bs->backing) {
                /* 'continue' applies to the enclosing for loop: leave this
                 * entry alone */
                continue;
            }
            break;

        case QCOW2_CLUSTER_ZERO_PLAIN:
            if (!full_discard) {
                continue;
            }
            break;

        case QCOW2_CLUSTER_ZERO_ALLOC:
        case QCOW2_CLUSTER_NORMAL:
        case QCOW2_CLUSTER_COMPRESSED:
            break;

        default:
            abort();
        }

        /* First remove L2 entries */
        qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
        if (!full_discard && s->qcow_version >= 3) {
            /* Bare zero flag, no host offset */
            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
        } else {
            l2_table[l2_index + i] = cpu_to_be64(0);
        }

        /* Then decrease the refcount */
        qcow2_free_any_clusters(bs, old_l2_entry, 1, type);
    }

    /* Done with the table obtained from get_cluster_table() */
    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    return nb_clusters;
}

/*
 * Discard the clusters covering the range of bytes bytes starting at offset,
 * one L2 table at a time.
 *
 * offset must be cluster aligned, and so must offset + bytes unless it equals
 * the image size derived from bs->total_sectors (both are asserted). type and
 * full_discard are forwarded to discard_single_l2().
 *
 * s->cache_discards is set while the L2 tables are processed and cleared
 * again before qcow2_process_discards() is called with the final result; this
 * happens on the error path as well.
 *
 * Returns 0 on success or the negative value returned by discard_single_l2().
 */
int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
                          uint64_t bytes, enum qcow2_discard_type type,
                          bool full_discard)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t end_offset = offset + bytes;
    uint64_t nb_clusters;
    int64_t cleared;
    int ret;

    /* Caller must pass aligned values, except at image end */
    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
           end_offset == bs->total_sectors << BDRV_SECTOR_BITS);

    nb_clusters = size_to_clusters(s, bytes);

    /* NOTE(review): presumably makes the refcount code queue discards until
     * qcow2_process_discards() runs below - confirm against the refcount
     * code. */
    s->cache_discards = true;

    /* Each L2 table is handled by its own loop iteration */
    while (nb_clusters > 0) {
        cleared = discard_single_l2(bs, offset, nb_clusters, type,
                                    full_discard);
        if (cleared < 0) {
            ret = cleared;
            goto fail;
        }

        /* Advance by the number of entries the helper walked */
        nb_clusters -= cleared;
        offset += (cleared * s->cluster_size);
    }

    ret = 0;
    /* Reached on success as well as on error */
fail:
    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    return ret;
}

/*
 * This zeroes as many clusters of nb_clusters as possible at once (i.e.
 * all clusters in the same L2 table) and returns the number of zeroed
 * clusters.
 *
 * offset selects the first cluster via get_cluster_table(). Of flags only
 * BDRV_REQ_MAY_UNMAP is looked at:
 *
 * - Entries that are already plain zero clusters, or allocated zero clusters
 *   when unmapping is not allowed, are left untouched.
 * - Compressed clusters, and any other cluster when unmapping is allowed, are
 *   replaced by a bare QCOW_OFLAG_ZERO entry and the old cluster is handed to
 *   qcow2_free_any_clusters().
 * - Everything else keeps its entry and only gets QCOW_OFLAG_ZERO set.
 *
 * The value returned on success is the number of L2 entries walked after
 * clamping to the end of the L2 table, including the ones left untouched. A
 * negative value returned by get_cluster_table() is passed on as is.
 */
static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
                          uint64_t nb_clusters, int flags)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table;
    int l2_index;
    int ret;
    int i;
    bool unmap = !!(flags & BDRV_REQ_MAY_UNMAP);

    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    /* Limit nb_clusters to one L2 table */
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
    /* The count is returned as an int and compared against the int loop
     * counter, so it has to fit. */
    assert(nb_clusters <= INT_MAX);

    for (i = 0; i < nb_clusters; i++) {
        /* Despite the name this is the complete L2 entry, flags included */
        uint64_t old_offset;
        QCow2ClusterType cluster_type;

        old_offset = be64_to_cpu(l2_table[l2_index + i]);

        /*
         * Minimize L2 changes if the cluster already reads back as
         * zeroes with correct allocation.
         */
        cluster_type = qcow2_get_cluster_type(old_offset);
        if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN ||
            (cluster_type == QCOW2_CLUSTER_ZERO_ALLOC && !unmap)) {
            continue;
        }

        qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
        if (cluster_type == QCOW2_CLUSTER_COMPRESSED || unmap) {
            /* Replace the entry by a bare zero flag and release the cluster
             * it used to refer to */
            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
            qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
        } else {
            /* Keep the existing entry and just add the zero flag */
            l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO);
        }
    }

    /* Done with the table obtained from get_cluster_table() */
    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    return nb_clusters;
}

/*
 * Mark the clusters covering the range of bytes bytes starting at offset as
 * reading back zeroes, one L2 table at a time.
 *
 * offset must be cluster aligned, and so must offset + bytes unless it equals
 * the image size derived from bs->total_sectors. Both conditions are asserted
 * before the image version is looked at. flags is forwarded to
 * zero_single_l2().
 *
 * Returns 0 on success, -ENOTSUP if s->qcow_version is below 3, or the
 * negative value returned by zero_single_l2(). Except in the -ENOTSUP case,
 * s->cache_discards is set while the L2 tables are processed and
 * qcow2_process_discards() is called with the final result.
 */
int qcow2_cluster_zeroize(BlockDriverState *bs, uint64_t offset,
                          uint64_t bytes, int flags)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t end_offset = offset + bytes;
    uint64_t nb_clusters;
    int64_t cleared;
    int ret;

    /* Caller must pass aligned values, except at image end */
    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
           end_offset == bs->total_sectors << BDRV_SECTOR_BITS);

    /* The zero flag is only supported by version 3 and newer */
    if (s->qcow_version < 3) {
        return -ENOTSUP;
    }

    /* Each L2 table is handled by its own loop iteration */
    nb_clusters = size_to_clusters(s, bytes);

    /* NOTE(review): presumably makes the refcount code queue discards until
     * qcow2_process_discards() runs below - confirm against the refcount
     * code. */
    s->cache_discards = true;

    while (nb_clusters > 0) {
        cleared = zero_single_l2(bs, offset, nb_clusters, flags);
        if (cleared < 0) {
            ret = cleared;
            goto fail;
        }

        /* Advance by the number of entries the helper walked */
        nb_clusters -= cleared;
        offset += (cleared * s->cluster_size);
    }

    ret = 0;
    /* Reached on success as well as on error */
fail:
    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    return ret;
}

/*
 * Expands all zero clusters in a specific L1 table (or deallocates them, for
 * non-backed non-pre-allocated zero clusters).
 *
 * l1_entries and *visited_l1_entries are used to keep track of progress for
 * status_cb(). l1_entries contains the total number of L1 entries and
 * *visited_l1_entries counts all visited L1 entries.
 *
 * l1_table holds l1_size entries which are used without byte swapping, i.e.
 * they have to be in host byte order already. The table counts as the active
 * one if the pointer equals s->l1_table; in that case L2 tables are accessed
 * through s->l2_table_cache, otherwise they are read from and written back
 * to bs->file directly using a temporary buffer.
 *
 * status_cb may be NULL, in which case no progress is reported.
 *
 * Returns 0 on success and -errno on failure (-ENOMEM if the temporary buffer
 * cannot be allocated, -EIO for unaligned offsets, otherwise whatever the
 * failing helper returned).
 */
static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                                      int l1_size, int64_t *visited_l1_entries,
                                      int64_t l1_entries,
                                      BlockDriverAmendStatusCB *status_cb,
                                      void *cb_opaque)
{
    BDRVQcow2State *s = bs->opaque;
    bool is_active_l1 = (l1_table == s->l1_table);
    uint64_t *l2_table = NULL;
    int ret;
    int i, j;

    if (!is_active_l1) {
        /* inactive L2 tables require a buffer to be stored in when loading
         * them from disk */
        l2_table = qemu_try_blockalign(bs->file->bs, s->cluster_size);
        if (l2_table == NULL) {
            return -ENOMEM;
        }
    }

    for (i = 0; i < l1_size; i++) {
        uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
        bool l2_dirty = false;
        uint64_t l2_refcount;

        if (!l2_offset) {
            /* unallocated */
            (*visited_l1_entries)++;
            if (status_cb) {
                status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
            }
            continue;
        }

        if (offset_into_cluster(s, l2_offset)) {
            qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                    PRIx64 " unaligned (L1 index: %#x)",
                                    l2_offset, i);
            ret = -EIO;
            goto fail;
        }

        if (is_active_l1) {
            /* get active L2 tables from cache */
            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
                    (void **)&l2_table);
        } else {
            /* load inactive L2 tables from disk */
            ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE,
                            (void *)l2_table, s->cluster_sectors);
        }
        if (ret < 0) {
            goto fail;
        }

        /* The refcount of the L2 table itself decides below which refcount
         * newly allocated data clusters get and whether QCOW_OFLAG_COPIED is
         * set on the rewritten entries. */
        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                 &l2_refcount);
        if (ret < 0) {
            goto fail;
        }

        for (j = 0; j < s->l2_size; j++) {
            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
            int64_t offset = l2_entry & L2E_OFFSET_MASK;
            QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);

            /* Only zero clusters are of interest here */
            if (cluster_type != QCOW2_CLUSTER_ZERO_PLAIN &&
                cluster_type != QCOW2_CLUSTER_ZERO_ALLOC) {
                continue;
            }

            if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                if (!bs->backing) {
                    /* not backed; therefore we can simply deallocate the
                     * cluster */
                    l2_table[j] = 0;
                    l2_dirty = true;
                    continue;
                }

                /* A plain zero cluster has no data cluster yet, so allocate
                 * one. Every error path below frees it again, but only for
                 * this cluster type. */
                offset = qcow2_alloc_clusters(bs, s->cluster_size);
                if (offset < 0) {
                    ret = offset;
                    goto fail;
                }

                if (l2_refcount > 1) {
                    /* For shared L2 tables, set the refcount accordingly (it is
                     * already 1 and needs to be l2_refcount) */
                    ret = qcow2_update_cluster_refcount(bs,
                            offset >> s->cluster_bits,
                            refcount_diff(1, l2_refcount), false,
                            QCOW2_DISCARD_OTHER);
                    if (ret < 0) {
                        qcow2_free_clusters(bs, offset, s->cluster_size,
                                            QCOW2_DISCARD_OTHER);
                        goto fail;
                    }
                }
            }

            /* From here on, offset is either the freshly allocated cluster or
             * the host offset taken from an allocated zero cluster's entry. */
            if (offset_into_cluster(s, offset)) {
                qcow2_signal_corruption(bs, true, -1, -1,
                                        "Cluster allocation offset "
                                        "%#" PRIx64 " unaligned (L2 offset: %#"
                                        PRIx64 ", L2 index: %#x)", offset,
                                        l2_offset, j);
                if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                    qcow2_free_clusters(bs, offset, s->cluster_size,
                                        QCOW2_DISCARD_ALWAYS);
                }
                ret = -EIO;
                goto fail;
            }

            ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
            if (ret < 0) {
                if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                    qcow2_free_clusters(bs, offset, s->cluster_size,
                                        QCOW2_DISCARD_ALWAYS);
                }
                goto fail;
            }

            /* Fill the data cluster with explicit zeroes before the L2 entry
             * is turned into a normal one */
            ret = bdrv_pwrite_zeroes(bs->file, offset, s->cluster_size, 0);
            if (ret < 0) {
                if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                    qcow2_free_clusters(bs, offset, s->cluster_size,
                                        QCOW2_DISCARD_ALWAYS);
                }
                goto fail;
            }

            /* QCOW_OFLAG_COPIED is only set if the L2 table is not shared */
            if (l2_refcount == 1) {
                l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
            } else {
                l2_table[j] = cpu_to_be64(offset);
            }
            l2_dirty = true;
        }

        if (is_active_l1) {
            if (l2_dirty) {
                qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
                qcow2_cache_depends_on_flush(s->l2_table_cache);
            }
            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
        } else {
            if (l2_dirty) {
                /* Write the modified table back in place, bypassing the
                 * cache */
                ret = qcow2_pre_write_overlap_check(bs,
                        QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2, l2_offset,
                        s->cluster_size);
                if (ret < 0) {
                    goto fail;
                }

                ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE,
                                 (void *)l2_table, s->cluster_sectors);
                if (ret < 0) {
                    goto fail;
                }
            }
        }

        (*visited_l1_entries)++;
        if (status_cb) {
            status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
        }
    }

    ret = 0;

    /* Reached on success as well as on error */
fail:
    if (l2_table) {
        if (!is_active_l1) {
            qemu_vfree(l2_table);
        } else {
            /* NOTE(review): when an error occurs after some entries of an
             * active L2 table have already been changed, the table is put
             * back here without having been marked dirty - confirm that this
             * is intended. */
            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
        }
    }
    return ret;
}

/*
 * For backed images, expands all zero clusters on the image. For non-backed
 * images, deallocates all non-pre-allocated zero clusters (and claims the
 * allocation for pre-allocated ones). This is important for downgrading to a
 * qcow2 version which doesn't yet support metadata zero clusters.
 *
 * The active L1 table is processed first, followed by the L1 table of every
 * snapshot, which is read from bs->file and converted to host byte order.
 *
 * status_cb may be NULL. If it is set, it is called with cb_opaque after each
 * visited L1 entry, together with the running count and the total number of
 * L1 entries of the active table and all snapshots.
 *
 * Returns 0 on success and -errno on failure.
 */
int qcow2_expand_zero_clusters(BlockDriverState *bs,
                               BlockDriverAmendStatusCB *status_cb,
                               void *cb_opaque)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table = NULL;
    int64_t l1_entries = 0, visited_l1_entries = 0;
    int ret;
    int i, j;

    /* The total is only needed for progress reporting */
    if (status_cb) {
        l1_entries = s->l1_size;
        for (i = 0; i < s->nb_snapshots; i++) {
            l1_entries += s->snapshots[i].l1_size;
        }
    }

    ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
                                     &visited_l1_entries, l1_entries,
                                     status_cb, cb_opaque);
    if (ret < 0) {
        goto fail;
    }

    /* Inactive L1 tables may point to active L2 tables - therefore it is
     * necessary to flush the L2 table cache before trying to access the L2
     * tables pointed to by inactive L1 entries (else we might try to expand
     * zero clusters that have already been expanded); furthermore, it is also
     * necessary to empty the L2 table cache, since it may contain tables which
     * are now going to be modified directly on disk, bypassing the cache.
     * qcow2_cache_empty() does both for us. */
    ret = qcow2_cache_empty(bs, s->l2_table_cache);
    if (ret < 0) {
        goto fail;
    }

    for (i = 0; i < s->nb_snapshots; i++) {
        /* The table is read in whole sectors, so round its size up */
        int l1_sectors = DIV_ROUND_UP(s->snapshots[i].l1_size *
                                      sizeof(uint64_t), BDRV_SECTOR_SIZE);

        /* One buffer is reused (and resized) for all snapshots; it is freed
         * at the fail label. */
        l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE);

        ret = bdrv_read(bs->file,
                        s->snapshots[i].l1_table_offset / BDRV_SECTOR_SIZE,
                        (void *)l1_table, l1_sectors);
        if (ret < 0) {
            goto fail;
        }

        /* Convert the entries from big-endian to host byte order in place */
        for (j = 0; j < s->snapshots[i].l1_size; j++) {
            be64_to_cpus(&l1_table[j]);
        }

        ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
                                         &visited_l1_entries, l1_entries,
                                         status_cb, cb_opaque);
        if (ret < 0) {
            goto fail;
        }
    }

    ret = 0;

    /* Reached on success as well as on error */
fail:
    g_free(l1_table);
    return ret;
}