14f1043b4SStefan Hajnoczi /* 24f1043b4SStefan Hajnoczi * Image streaming 34f1043b4SStefan Hajnoczi * 44f1043b4SStefan Hajnoczi * Copyright IBM, Corp. 2011 54f1043b4SStefan Hajnoczi * 64f1043b4SStefan Hajnoczi * Authors: 74f1043b4SStefan Hajnoczi * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> 84f1043b4SStefan Hajnoczi * 94f1043b4SStefan Hajnoczi * This work is licensed under the terms of the GNU LGPL, version 2 or later. 104f1043b4SStefan Hajnoczi * See the COPYING.LIB file in the top-level directory. 114f1043b4SStefan Hajnoczi * 124f1043b4SStefan Hajnoczi */ 134f1043b4SStefan Hajnoczi 1480c71a24SPeter Maydell #include "qemu/osdep.h" 154f1043b4SStefan Hajnoczi #include "trace.h" 16737e150eSPaolo Bonzini #include "block/block_int.h" 17c87621eaSJohn Snow #include "block/blockjob_int.h" 18da34e65cSMarkus Armbruster #include "qapi/error.h" 19cc7a8ea7SMarkus Armbruster #include "qapi/qmp/qerror.h" 206ef228fcSPaolo Bonzini #include "qemu/ratelimit.h" 21373340b2SMax Reitz #include "sysemu/block-backend.h" 224f1043b4SStefan Hajnoczi 234f1043b4SStefan Hajnoczi enum { 244f1043b4SStefan Hajnoczi /* 254f1043b4SStefan Hajnoczi * Size of data buffer for populating the image file. This should be large 264f1043b4SStefan Hajnoczi * enough to process multiple clusters in a single call, so that populating 274f1043b4SStefan Hajnoczi * contiguous regions of the image is efficient. 284f1043b4SStefan Hajnoczi */ 294f1043b4SStefan Hajnoczi STREAM_BUFFER_SIZE = 512 * 1024, /* in bytes */ 304f1043b4SStefan Hajnoczi }; 314f1043b4SStefan Hajnoczi 325094a6c0SStefan Hajnoczi #define SLICE_TIME 100000000ULL /* ns */ 335094a6c0SStefan Hajnoczi 344f1043b4SStefan Hajnoczi typedef struct StreamBlockJob { 354f1043b4SStefan Hajnoczi BlockJob common; 365094a6c0SStefan Hajnoczi RateLimit limit; 374f1043b4SStefan Hajnoczi BlockDriverState *base; 381d809098SPaolo Bonzini BlockdevOnError on_error; 3913d8cc51SJeff Cody char *backing_file_str; 4061b49e48SAlberto Garcia int bs_flags; 414f1043b4SStefan Hajnoczi } StreamBlockJob; 424f1043b4SStefan Hajnoczi 4303e35d82SKevin Wolf static int coroutine_fn stream_populate(BlockBackend *blk, 444f1043b4SStefan Hajnoczi int64_t sector_num, int nb_sectors, 454f1043b4SStefan Hajnoczi void *buf) 464f1043b4SStefan Hajnoczi { 474f1043b4SStefan Hajnoczi struct iovec iov = { 484f1043b4SStefan Hajnoczi .iov_base = buf, 494f1043b4SStefan Hajnoczi .iov_len = nb_sectors * BDRV_SECTOR_SIZE, 504f1043b4SStefan Hajnoczi }; 514f1043b4SStefan Hajnoczi QEMUIOVector qiov; 524f1043b4SStefan Hajnoczi 534f1043b4SStefan Hajnoczi qemu_iovec_init_external(&qiov, &iov, 1); 544f1043b4SStefan Hajnoczi 554f1043b4SStefan Hajnoczi /* Copy-on-read the unallocated clusters */ 5603e35d82SKevin Wolf return blk_co_preadv(blk, sector_num * BDRV_SECTOR_SIZE, qiov.size, &qiov, 5703e35d82SKevin Wolf BDRV_REQ_COPY_ON_READ); 584f1043b4SStefan Hajnoczi } 594f1043b4SStefan Hajnoczi 60f3e69bebSStefan Hajnoczi typedef struct { 61f3e69bebSStefan Hajnoczi int ret; 62f3e69bebSStefan Hajnoczi bool reached_end; 63f3e69bebSStefan Hajnoczi } StreamCompleteData; 64f3e69bebSStefan Hajnoczi 65f3e69bebSStefan Hajnoczi static void stream_complete(BlockJob *job, void *opaque) 66f3e69bebSStefan Hajnoczi { 67f3e69bebSStefan Hajnoczi StreamBlockJob *s = container_of(job, StreamBlockJob, common); 68f3e69bebSStefan Hajnoczi StreamCompleteData *data = opaque; 6903e35d82SKevin Wolf BlockDriverState *bs = blk_bs(job->blk); 70f3e69bebSStefan Hajnoczi BlockDriverState *base = s->base; 71f3e69bebSStefan Hajnoczi 72f3e69bebSStefan Hajnoczi if (!block_job_is_cancelled(&s->common) && data->reached_end && 73f3e69bebSStefan Hajnoczi data->ret == 0) { 74f3e69bebSStefan Hajnoczi const char *base_id = NULL, *base_fmt = NULL; 75f3e69bebSStefan Hajnoczi if (base) { 76f3e69bebSStefan Hajnoczi base_id = s->backing_file_str; 77f3e69bebSStefan Hajnoczi if (base->drv) { 78f3e69bebSStefan Hajnoczi base_fmt = base->drv->format_name; 79f3e69bebSStefan Hajnoczi } 80f3e69bebSStefan Hajnoczi } 8103e35d82SKevin Wolf data->ret = bdrv_change_backing_file(bs, base_id, base_fmt); 8203e35d82SKevin Wolf bdrv_set_backing_hd(bs, base); 83f3e69bebSStefan Hajnoczi } 84f3e69bebSStefan Hajnoczi 8561b49e48SAlberto Garcia /* Reopen the image back in read-only mode if necessary */ 8661b49e48SAlberto Garcia if (s->bs_flags != bdrv_get_flags(bs)) { 87*a170a91fSKevin Wolf /* Give up write permissions before making it read-only */ 88*a170a91fSKevin Wolf blk_set_perm(job->blk, 0, BLK_PERM_ALL, &error_abort); 8961b49e48SAlberto Garcia bdrv_reopen(bs, s->bs_flags, NULL); 9061b49e48SAlberto Garcia } 9161b49e48SAlberto Garcia 92f3e69bebSStefan Hajnoczi g_free(s->backing_file_str); 93f3e69bebSStefan Hajnoczi block_job_completed(&s->common, data->ret); 94f3e69bebSStefan Hajnoczi g_free(data); 95f3e69bebSStefan Hajnoczi } 96f3e69bebSStefan Hajnoczi 974f1043b4SStefan Hajnoczi static void coroutine_fn stream_run(void *opaque) 984f1043b4SStefan Hajnoczi { 994f1043b4SStefan Hajnoczi StreamBlockJob *s = opaque; 100f3e69bebSStefan Hajnoczi StreamCompleteData *data; 10103e35d82SKevin Wolf BlockBackend *blk = s->common.blk; 10203e35d82SKevin Wolf BlockDriverState *bs = blk_bs(blk); 103c8c3080fSMarcelo Tosatti BlockDriverState *base = s->base; 1046578629eSAlberto Garcia int64_t sector_num = 0; 1056578629eSAlberto Garcia int64_t end = -1; 106f14a39ccSSascha Silbe uint64_t delay_ns = 0; 1071d809098SPaolo Bonzini int error = 0; 1084f1043b4SStefan Hajnoczi int ret = 0; 10904120e3bSAnthony Liguori int n = 0; 1104f1043b4SStefan Hajnoczi void *buf; 1114f1043b4SStefan Hajnoczi 112760e0063SKevin Wolf if (!bs->backing) { 1136578629eSAlberto Garcia goto out; 114f4a193e7SMax Reitz } 115f4a193e7SMax Reitz 1164f1043b4SStefan Hajnoczi s->common.len = bdrv_getlength(bs); 1174f1043b4SStefan Hajnoczi if (s->common.len < 0) { 1186578629eSAlberto Garcia ret = s->common.len; 1196578629eSAlberto Garcia goto out; 1204f1043b4SStefan Hajnoczi } 1214f1043b4SStefan Hajnoczi 1224f1043b4SStefan Hajnoczi end = s->common.len >> BDRV_SECTOR_BITS; 1234f1043b4SStefan Hajnoczi buf = qemu_blockalign(bs, STREAM_BUFFER_SIZE); 1244f1043b4SStefan Hajnoczi 1254f1043b4SStefan Hajnoczi /* Turn on copy-on-read for the whole block device so that guest read 1264f1043b4SStefan Hajnoczi * requests help us make progress. Only do this when copying the entire 1274f1043b4SStefan Hajnoczi * backing chain since the copy-on-read operation does not take base into 1284f1043b4SStefan Hajnoczi * account. 1294f1043b4SStefan Hajnoczi */ 1304f1043b4SStefan Hajnoczi if (!base) { 1314f1043b4SStefan Hajnoczi bdrv_enable_copy_on_read(bs); 1324f1043b4SStefan Hajnoczi } 1334f1043b4SStefan Hajnoczi 1344f1043b4SStefan Hajnoczi for (sector_num = 0; sector_num < end; sector_num += n) { 135f9749f28SPaolo Bonzini bool copy; 1364513eafeSPaolo Bonzini 1374513eafeSPaolo Bonzini /* Note that even when no rate limit is applied we need to yield 138c57b6656SKevin Wolf * with no pending I/O here so that bdrv_drain_all() returns. 1394513eafeSPaolo Bonzini */ 1407483d1e5SAlex Bligh block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); 1414f1043b4SStefan Hajnoczi if (block_job_is_cancelled(&s->common)) { 1424f1043b4SStefan Hajnoczi break; 1434f1043b4SStefan Hajnoczi } 1444f1043b4SStefan Hajnoczi 145c3e4f43aSStefan Weil copy = false; 146c3e4f43aSStefan Weil 147bdad13b9SPaolo Bonzini ret = bdrv_is_allocated(bs, sector_num, 1484f1043b4SStefan Hajnoczi STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n); 149f9749f28SPaolo Bonzini if (ret == 1) { 150f9749f28SPaolo Bonzini /* Allocated in the top, no need to copy. */ 151d663640cSPaolo Bonzini } else if (ret >= 0) { 152f9749f28SPaolo Bonzini /* Copy if allocated in the intermediate images. Limit to the 153f9749f28SPaolo Bonzini * known-unallocated area [sector_num, sector_num+n). */ 154760e0063SKevin Wolf ret = bdrv_is_allocated_above(backing_bs(bs), base, 155188a7bbfSPaolo Bonzini sector_num, n, &n); 156571cd9dcSStefan Hajnoczi 157571cd9dcSStefan Hajnoczi /* Finish early if end of backing file has been reached */ 158571cd9dcSStefan Hajnoczi if (ret == 0 && n == 0) { 159571cd9dcSStefan Hajnoczi n = end - sector_num; 160571cd9dcSStefan Hajnoczi } 161571cd9dcSStefan Hajnoczi 162f9749f28SPaolo Bonzini copy = (ret == 1); 163f9749f28SPaolo Bonzini } 1644f1043b4SStefan Hajnoczi trace_stream_one_iteration(s, sector_num, n, ret); 165c3e4f43aSStefan Weil if (copy) { 16603e35d82SKevin Wolf ret = stream_populate(blk, sector_num, n, buf); 1674f1043b4SStefan Hajnoczi } 1684f1043b4SStefan Hajnoczi if (ret < 0) { 1691d809098SPaolo Bonzini BlockErrorAction action = 17081e254dcSKevin Wolf block_job_error_action(&s->common, s->on_error, true, -ret); 171a589569fSWenchao Xia if (action == BLOCK_ERROR_ACTION_STOP) { 1721d809098SPaolo Bonzini n = 0; 1731d809098SPaolo Bonzini continue; 1741d809098SPaolo Bonzini } 1751d809098SPaolo Bonzini if (error == 0) { 1761d809098SPaolo Bonzini error = ret; 1771d809098SPaolo Bonzini } 178a589569fSWenchao Xia if (action == BLOCK_ERROR_ACTION_REPORT) { 1794f1043b4SStefan Hajnoczi break; 1804f1043b4SStefan Hajnoczi } 1811d809098SPaolo Bonzini } 182c8c3080fSMarcelo Tosatti ret = 0; 1834f1043b4SStefan Hajnoczi 1844f1043b4SStefan Hajnoczi /* Publish progress */ 1854f1043b4SStefan Hajnoczi s->common.offset += n * BDRV_SECTOR_SIZE; 186f14a39ccSSascha Silbe if (copy && s->common.speed) { 187f14a39ccSSascha Silbe delay_ns = ratelimit_calculate_delay(&s->limit, n); 188f14a39ccSSascha Silbe } 1894f1043b4SStefan Hajnoczi } 1904f1043b4SStefan Hajnoczi 1914f1043b4SStefan Hajnoczi if (!base) { 1924f1043b4SStefan Hajnoczi bdrv_disable_copy_on_read(bs); 1934f1043b4SStefan Hajnoczi } 1944f1043b4SStefan Hajnoczi 1951d809098SPaolo Bonzini /* Do not remove the backing file if an error was there but ignored. */ 1961d809098SPaolo Bonzini ret = error; 1971d809098SPaolo Bonzini 1984f1043b4SStefan Hajnoczi qemu_vfree(buf); 199f3e69bebSStefan Hajnoczi 2006578629eSAlberto Garcia out: 201f3e69bebSStefan Hajnoczi /* Modify backing chain and close BDSes in main loop */ 202f3e69bebSStefan Hajnoczi data = g_malloc(sizeof(*data)); 203f3e69bebSStefan Hajnoczi data->ret = ret; 204f3e69bebSStefan Hajnoczi data->reached_end = sector_num == end; 205f3e69bebSStefan Hajnoczi block_job_defer_to_main_loop(&s->common, stream_complete, data); 2064f1043b4SStefan Hajnoczi } 2074f1043b4SStefan Hajnoczi 208882ec7ceSStefan Hajnoczi static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp) 2095094a6c0SStefan Hajnoczi { 2105094a6c0SStefan Hajnoczi StreamBlockJob *s = container_of(job, StreamBlockJob, common); 2115094a6c0SStefan Hajnoczi 212882ec7ceSStefan Hajnoczi if (speed < 0) { 213c6bd8c70SMarkus Armbruster error_setg(errp, QERR_INVALID_PARAMETER, "speed"); 2149e6636c7SStefan Hajnoczi return; 2155094a6c0SStefan Hajnoczi } 2166ef228fcSPaolo Bonzini ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); 2175094a6c0SStefan Hajnoczi } 2185094a6c0SStefan Hajnoczi 2193fc4b10aSFam Zheng static const BlockJobDriver stream_job_driver = { 2204f1043b4SStefan Hajnoczi .instance_size = sizeof(StreamBlockJob), 22179e14bf7SFam Zheng .job_type = BLOCK_JOB_TYPE_STREAM, 2225094a6c0SStefan Hajnoczi .set_speed = stream_set_speed, 223a7815a76SJohn Snow .start = stream_run, 2244f1043b4SStefan Hajnoczi }; 2254f1043b4SStefan Hajnoczi 2262323322eSAlberto Garcia void stream_start(const char *job_id, BlockDriverState *bs, 2272323322eSAlberto Garcia BlockDriverState *base, const char *backing_file_str, 2288254b6d9SJohn Snow int64_t speed, BlockdevOnError on_error, Error **errp) 2294f1043b4SStefan Hajnoczi { 2304f1043b4SStefan Hajnoczi StreamBlockJob *s; 23161b49e48SAlberto Garcia BlockDriverState *iter; 23261b49e48SAlberto Garcia int orig_bs_flags; 2334f1043b4SStefan Hajnoczi 23461b49e48SAlberto Garcia /* Make sure that the image is opened in read-write mode */ 23561b49e48SAlberto Garcia orig_bs_flags = bdrv_get_flags(bs); 23661b49e48SAlberto Garcia if (!(orig_bs_flags & BDRV_O_RDWR)) { 23761b49e48SAlberto Garcia if (bdrv_reopen(bs, orig_bs_flags | BDRV_O_RDWR, errp) != 0) { 23861b49e48SAlberto Garcia return; 23961b49e48SAlberto Garcia } 24061b49e48SAlberto Garcia } 24161b49e48SAlberto Garcia 242*a170a91fSKevin Wolf /* Prevent concurrent jobs trying to modify the graph structure here, we 243*a170a91fSKevin Wolf * already have our own plans. Also don't allow resize as the image size is 244*a170a91fSKevin Wolf * queried only at the job start and then cached. */ 245*a170a91fSKevin Wolf s = block_job_create(job_id, &stream_job_driver, bs, 246*a170a91fSKevin Wolf BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED | 247*a170a91fSKevin Wolf BLK_PERM_GRAPH_MOD, 248*a170a91fSKevin Wolf BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED | 249*a170a91fSKevin Wolf BLK_PERM_WRITE, 250*a170a91fSKevin Wolf speed, BLOCK_JOB_DEFAULT, NULL, NULL, errp); 251*a170a91fSKevin Wolf if (!s) { 252*a170a91fSKevin Wolf goto fail; 253*a170a91fSKevin Wolf } 254*a170a91fSKevin Wolf 255*a170a91fSKevin Wolf /* Block all intermediate nodes between bs and base, because they will 256*a170a91fSKevin Wolf * disappear from the chain after this operation. The streaming job reads 257*a170a91fSKevin Wolf * every block only once, assuming that it doesn't change, so block writes 258*a170a91fSKevin Wolf * and resizes. */ 25961b49e48SAlberto Garcia for (iter = backing_bs(bs); iter && iter != base; iter = backing_bs(iter)) { 26076d554e2SKevin Wolf block_job_add_bdrv(&s->common, "intermediate node", iter, 0, 261*a170a91fSKevin Wolf BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED, 262*a170a91fSKevin Wolf &error_abort); 26361b49e48SAlberto Garcia } 26461b49e48SAlberto Garcia 2654f1043b4SStefan Hajnoczi s->base = base; 26613d8cc51SJeff Cody s->backing_file_str = g_strdup(backing_file_str); 26761b49e48SAlberto Garcia s->bs_flags = orig_bs_flags; 2684f1043b4SStefan Hajnoczi 2691d809098SPaolo Bonzini s->on_error = on_error; 2705ccac6f1SJohn Snow trace_stream_start(bs, base, s); 2715ccac6f1SJohn Snow block_job_start(&s->common); 272*a170a91fSKevin Wolf return; 273*a170a91fSKevin Wolf 274*a170a91fSKevin Wolf fail: 275*a170a91fSKevin Wolf if (orig_bs_flags != bdrv_get_flags(bs)) { 276*a170a91fSKevin Wolf bdrv_reopen(bs, s->bs_flags, NULL); 277*a170a91fSKevin Wolf } 2784f1043b4SStefan Hajnoczi } 279