xref: /qemu/block/stream.c (revision c8c3080f)
14f1043b4SStefan Hajnoczi /*
24f1043b4SStefan Hajnoczi  * Image streaming
34f1043b4SStefan Hajnoczi  *
44f1043b4SStefan Hajnoczi  * Copyright IBM, Corp. 2011
54f1043b4SStefan Hajnoczi  *
64f1043b4SStefan Hajnoczi  * Authors:
74f1043b4SStefan Hajnoczi  *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
84f1043b4SStefan Hajnoczi  *
94f1043b4SStefan Hajnoczi  * This work is licensed under the terms of the GNU LGPL, version 2 or later.
104f1043b4SStefan Hajnoczi  * See the COPYING.LIB file in the top-level directory.
114f1043b4SStefan Hajnoczi  *
124f1043b4SStefan Hajnoczi  */
134f1043b4SStefan Hajnoczi 
144f1043b4SStefan Hajnoczi #include "trace.h"
154f1043b4SStefan Hajnoczi #include "block_int.h"
164f1043b4SStefan Hajnoczi 
enum {
    /*
     * Number of bytes in the bounce buffer used while populating the image.
     * It is sized to span several clusters so that a single request can
     * cover a contiguous region efficiently.
     */
    STREAM_BUFFER_SIZE = 512 * 1024, /* in bytes */
};
254f1043b4SStefan Hajnoczi 
/* Length of one rate-limiting slice, in nanoseconds (100 ms) */
#define SLICE_TIME 100000000ULL
275094a6c0SStefan Hajnoczi 
/* Per-slice accounting state of the rate limiter */
typedef struct RateLimit {
    /* rt_clock timestamp (ns) at which the current slice ends */
    int64_t next_slice_time;
    /* Amount of work permitted within one slice */
    uint64_t slice_quota;
    /* Amount of work already accounted to the current slice */
    uint64_t dispatched;
} RateLimit;
335094a6c0SStefan Hajnoczi 
345094a6c0SStefan Hajnoczi static int64_t ratelimit_calculate_delay(RateLimit *limit, uint64_t n)
355094a6c0SStefan Hajnoczi {
365094a6c0SStefan Hajnoczi     int64_t delay_ns = 0;
375094a6c0SStefan Hajnoczi     int64_t now = qemu_get_clock_ns(rt_clock);
385094a6c0SStefan Hajnoczi 
395094a6c0SStefan Hajnoczi     if (limit->next_slice_time < now) {
405094a6c0SStefan Hajnoczi         limit->next_slice_time = now + SLICE_TIME;
415094a6c0SStefan Hajnoczi         limit->dispatched = 0;
425094a6c0SStefan Hajnoczi     }
435094a6c0SStefan Hajnoczi     if (limit->dispatched + n > limit->slice_quota) {
445094a6c0SStefan Hajnoczi         delay_ns = limit->next_slice_time - now;
455094a6c0SStefan Hajnoczi     } else {
465094a6c0SStefan Hajnoczi         limit->dispatched += n;
475094a6c0SStefan Hajnoczi     }
485094a6c0SStefan Hajnoczi     return delay_ns;
495094a6c0SStefan Hajnoczi }
505094a6c0SStefan Hajnoczi 
515094a6c0SStefan Hajnoczi static void ratelimit_set_speed(RateLimit *limit, uint64_t speed)
525094a6c0SStefan Hajnoczi {
535094a6c0SStefan Hajnoczi     limit->slice_quota = speed / (1000000000ULL / SLICE_TIME);
545094a6c0SStefan Hajnoczi }
555094a6c0SStefan Hajnoczi 
564f1043b4SStefan Hajnoczi typedef struct StreamBlockJob {
574f1043b4SStefan Hajnoczi     BlockJob common;
585094a6c0SStefan Hajnoczi     RateLimit limit;
594f1043b4SStefan Hajnoczi     BlockDriverState *base;
60*c8c3080fSMarcelo Tosatti     char backing_file_id[1024];
614f1043b4SStefan Hajnoczi } StreamBlockJob;
624f1043b4SStefan Hajnoczi 
634f1043b4SStefan Hajnoczi static int coroutine_fn stream_populate(BlockDriverState *bs,
644f1043b4SStefan Hajnoczi                                         int64_t sector_num, int nb_sectors,
654f1043b4SStefan Hajnoczi                                         void *buf)
664f1043b4SStefan Hajnoczi {
674f1043b4SStefan Hajnoczi     struct iovec iov = {
684f1043b4SStefan Hajnoczi         .iov_base = buf,
694f1043b4SStefan Hajnoczi         .iov_len  = nb_sectors * BDRV_SECTOR_SIZE,
704f1043b4SStefan Hajnoczi     };
714f1043b4SStefan Hajnoczi     QEMUIOVector qiov;
724f1043b4SStefan Hajnoczi 
734f1043b4SStefan Hajnoczi     qemu_iovec_init_external(&qiov, &iov, 1);
744f1043b4SStefan Hajnoczi 
754f1043b4SStefan Hajnoczi     /* Copy-on-read the unallocated clusters */
764f1043b4SStefan Hajnoczi     return bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, &qiov);
774f1043b4SStefan Hajnoczi }
784f1043b4SStefan Hajnoczi 
79*c8c3080fSMarcelo Tosatti /*
80*c8c3080fSMarcelo Tosatti  * Given an image chain: [BASE] -> [INTER1] -> [INTER2] -> [TOP]
81*c8c3080fSMarcelo Tosatti  *
82*c8c3080fSMarcelo Tosatti  * Return true if the given sector is allocated in top.
83*c8c3080fSMarcelo Tosatti  * Return false if the given sector is allocated in intermediate images.
84*c8c3080fSMarcelo Tosatti  * Return true otherwise.
85*c8c3080fSMarcelo Tosatti  *
86*c8c3080fSMarcelo Tosatti  * 'pnum' is set to the number of sectors (including and immediately following
87*c8c3080fSMarcelo Tosatti  *  the specified sector) that are known to be in the same
88*c8c3080fSMarcelo Tosatti  *  allocated/unallocated state.
89*c8c3080fSMarcelo Tosatti  *
90*c8c3080fSMarcelo Tosatti  */
91*c8c3080fSMarcelo Tosatti static int coroutine_fn is_allocated_base(BlockDriverState *top,
92*c8c3080fSMarcelo Tosatti                                           BlockDriverState *base,
93*c8c3080fSMarcelo Tosatti                                           int64_t sector_num,
94*c8c3080fSMarcelo Tosatti                                           int nb_sectors, int *pnum)
95*c8c3080fSMarcelo Tosatti {
96*c8c3080fSMarcelo Tosatti     BlockDriverState *intermediate;
97*c8c3080fSMarcelo Tosatti     int ret, n;
98*c8c3080fSMarcelo Tosatti 
99*c8c3080fSMarcelo Tosatti     ret = bdrv_co_is_allocated(top, sector_num, nb_sectors, &n);
100*c8c3080fSMarcelo Tosatti     if (ret) {
101*c8c3080fSMarcelo Tosatti         *pnum = n;
102*c8c3080fSMarcelo Tosatti         return ret;
103*c8c3080fSMarcelo Tosatti     }
104*c8c3080fSMarcelo Tosatti 
105*c8c3080fSMarcelo Tosatti     /*
106*c8c3080fSMarcelo Tosatti      * Is the unallocated chunk [sector_num, n] also
107*c8c3080fSMarcelo Tosatti      * unallocated between base and top?
108*c8c3080fSMarcelo Tosatti      */
109*c8c3080fSMarcelo Tosatti     intermediate = top->backing_hd;
110*c8c3080fSMarcelo Tosatti 
111*c8c3080fSMarcelo Tosatti     while (intermediate) {
112*c8c3080fSMarcelo Tosatti         int pnum_inter;
113*c8c3080fSMarcelo Tosatti 
114*c8c3080fSMarcelo Tosatti         /* reached base */
115*c8c3080fSMarcelo Tosatti         if (intermediate == base) {
116*c8c3080fSMarcelo Tosatti             *pnum = n;
117*c8c3080fSMarcelo Tosatti             return 1;
118*c8c3080fSMarcelo Tosatti         }
119*c8c3080fSMarcelo Tosatti         ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
120*c8c3080fSMarcelo Tosatti                                    &pnum_inter);
121*c8c3080fSMarcelo Tosatti         if (ret < 0) {
122*c8c3080fSMarcelo Tosatti             return ret;
123*c8c3080fSMarcelo Tosatti         } else if (ret) {
124*c8c3080fSMarcelo Tosatti             *pnum = pnum_inter;
125*c8c3080fSMarcelo Tosatti             return 0;
126*c8c3080fSMarcelo Tosatti         }
127*c8c3080fSMarcelo Tosatti 
128*c8c3080fSMarcelo Tosatti         /*
129*c8c3080fSMarcelo Tosatti          * [sector_num, nb_sectors] is unallocated on top but intermediate
130*c8c3080fSMarcelo Tosatti          * might have
131*c8c3080fSMarcelo Tosatti          *
132*c8c3080fSMarcelo Tosatti          * [sector_num+x, nr_sectors] allocated.
133*c8c3080fSMarcelo Tosatti          */
134*c8c3080fSMarcelo Tosatti         if (n > pnum_inter) {
135*c8c3080fSMarcelo Tosatti             n = pnum_inter;
136*c8c3080fSMarcelo Tosatti         }
137*c8c3080fSMarcelo Tosatti 
138*c8c3080fSMarcelo Tosatti         intermediate = intermediate->backing_hd;
139*c8c3080fSMarcelo Tosatti     }
140*c8c3080fSMarcelo Tosatti 
141*c8c3080fSMarcelo Tosatti     return 1;
142*c8c3080fSMarcelo Tosatti }
143*c8c3080fSMarcelo Tosatti 
1444f1043b4SStefan Hajnoczi static void coroutine_fn stream_run(void *opaque)
1454f1043b4SStefan Hajnoczi {
1464f1043b4SStefan Hajnoczi     StreamBlockJob *s = opaque;
1474f1043b4SStefan Hajnoczi     BlockDriverState *bs = s->common.bs;
148*c8c3080fSMarcelo Tosatti     BlockDriverState *base = s->base;
1494f1043b4SStefan Hajnoczi     int64_t sector_num, end;
1504f1043b4SStefan Hajnoczi     int ret = 0;
1514f1043b4SStefan Hajnoczi     int n;
1524f1043b4SStefan Hajnoczi     void *buf;
1534f1043b4SStefan Hajnoczi 
1544f1043b4SStefan Hajnoczi     s->common.len = bdrv_getlength(bs);
1554f1043b4SStefan Hajnoczi     if (s->common.len < 0) {
1564f1043b4SStefan Hajnoczi         block_job_complete(&s->common, s->common.len);
1574f1043b4SStefan Hajnoczi         return;
1584f1043b4SStefan Hajnoczi     }
1594f1043b4SStefan Hajnoczi 
1604f1043b4SStefan Hajnoczi     end = s->common.len >> BDRV_SECTOR_BITS;
1614f1043b4SStefan Hajnoczi     buf = qemu_blockalign(bs, STREAM_BUFFER_SIZE);
1624f1043b4SStefan Hajnoczi 
1634f1043b4SStefan Hajnoczi     /* Turn on copy-on-read for the whole block device so that guest read
1644f1043b4SStefan Hajnoczi      * requests help us make progress.  Only do this when copying the entire
1654f1043b4SStefan Hajnoczi      * backing chain since the copy-on-read operation does not take base into
1664f1043b4SStefan Hajnoczi      * account.
1674f1043b4SStefan Hajnoczi      */
1684f1043b4SStefan Hajnoczi     if (!base) {
1694f1043b4SStefan Hajnoczi         bdrv_enable_copy_on_read(bs);
1704f1043b4SStefan Hajnoczi     }
1714f1043b4SStefan Hajnoczi 
1724f1043b4SStefan Hajnoczi     for (sector_num = 0; sector_num < end; sector_num += n) {
1735094a6c0SStefan Hajnoczi retry:
1744f1043b4SStefan Hajnoczi         if (block_job_is_cancelled(&s->common)) {
1754f1043b4SStefan Hajnoczi             break;
1764f1043b4SStefan Hajnoczi         }
1774f1043b4SStefan Hajnoczi 
178*c8c3080fSMarcelo Tosatti 
179*c8c3080fSMarcelo Tosatti         if (base) {
180*c8c3080fSMarcelo Tosatti             ret = is_allocated_base(bs, base, sector_num,
1814f1043b4SStefan Hajnoczi                                     STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n);
182*c8c3080fSMarcelo Tosatti         } else {
183*c8c3080fSMarcelo Tosatti             ret = bdrv_co_is_allocated(bs, sector_num,
184*c8c3080fSMarcelo Tosatti                                        STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE,
185*c8c3080fSMarcelo Tosatti                                        &n);
186*c8c3080fSMarcelo Tosatti         }
1874f1043b4SStefan Hajnoczi         trace_stream_one_iteration(s, sector_num, n, ret);
1884f1043b4SStefan Hajnoczi         if (ret == 0) {
1895094a6c0SStefan Hajnoczi             if (s->common.speed) {
1905094a6c0SStefan Hajnoczi                 uint64_t delay_ns = ratelimit_calculate_delay(&s->limit, n);
1915094a6c0SStefan Hajnoczi                 if (delay_ns > 0) {
1925094a6c0SStefan Hajnoczi                     co_sleep_ns(rt_clock, delay_ns);
1935094a6c0SStefan Hajnoczi 
1945094a6c0SStefan Hajnoczi                     /* Recheck cancellation and that sectors are unallocated */
1955094a6c0SStefan Hajnoczi                     goto retry;
1965094a6c0SStefan Hajnoczi                 }
1975094a6c0SStefan Hajnoczi             }
1984f1043b4SStefan Hajnoczi             ret = stream_populate(bs, sector_num, n, buf);
1994f1043b4SStefan Hajnoczi         }
2004f1043b4SStefan Hajnoczi         if (ret < 0) {
2014f1043b4SStefan Hajnoczi             break;
2024f1043b4SStefan Hajnoczi         }
203*c8c3080fSMarcelo Tosatti         ret = 0;
2044f1043b4SStefan Hajnoczi 
2054f1043b4SStefan Hajnoczi         /* Publish progress */
2064f1043b4SStefan Hajnoczi         s->common.offset += n * BDRV_SECTOR_SIZE;
2075094a6c0SStefan Hajnoczi 
2085094a6c0SStefan Hajnoczi         /* Note that even when no rate limit is applied we need to yield
2095094a6c0SStefan Hajnoczi          * with no pending I/O here so that qemu_aio_flush() returns.
2105094a6c0SStefan Hajnoczi          */
2115094a6c0SStefan Hajnoczi         co_sleep_ns(rt_clock, 0);
2124f1043b4SStefan Hajnoczi     }
2134f1043b4SStefan Hajnoczi 
2144f1043b4SStefan Hajnoczi     if (!base) {
2154f1043b4SStefan Hajnoczi         bdrv_disable_copy_on_read(bs);
2164f1043b4SStefan Hajnoczi     }
2174f1043b4SStefan Hajnoczi 
2184f1043b4SStefan Hajnoczi     if (sector_num == end && ret == 0) {
219*c8c3080fSMarcelo Tosatti         const char *base_id = NULL;
220*c8c3080fSMarcelo Tosatti         if (base) {
221*c8c3080fSMarcelo Tosatti             base_id = s->backing_file_id;
222*c8c3080fSMarcelo Tosatti         }
223*c8c3080fSMarcelo Tosatti         ret = bdrv_change_backing_file(bs, base_id, NULL);
2244f1043b4SStefan Hajnoczi     }
2254f1043b4SStefan Hajnoczi 
2264f1043b4SStefan Hajnoczi     qemu_vfree(buf);
2274f1043b4SStefan Hajnoczi     block_job_complete(&s->common, ret);
2284f1043b4SStefan Hajnoczi }
2294f1043b4SStefan Hajnoczi 
2305094a6c0SStefan Hajnoczi static int stream_set_speed(BlockJob *job, int64_t value)
2315094a6c0SStefan Hajnoczi {
2325094a6c0SStefan Hajnoczi     StreamBlockJob *s = container_of(job, StreamBlockJob, common);
2335094a6c0SStefan Hajnoczi 
2345094a6c0SStefan Hajnoczi     if (value < 0) {
2355094a6c0SStefan Hajnoczi         return -EINVAL;
2365094a6c0SStefan Hajnoczi     }
2375094a6c0SStefan Hajnoczi     job->speed = value;
2385094a6c0SStefan Hajnoczi     ratelimit_set_speed(&s->limit, value / BDRV_SECTOR_SIZE);
2395094a6c0SStefan Hajnoczi     return 0;
2405094a6c0SStefan Hajnoczi }
2415094a6c0SStefan Hajnoczi 
2424f1043b4SStefan Hajnoczi static BlockJobType stream_job_type = {
2434f1043b4SStefan Hajnoczi     .instance_size = sizeof(StreamBlockJob),
2444f1043b4SStefan Hajnoczi     .job_type      = "stream",
2455094a6c0SStefan Hajnoczi     .set_speed     = stream_set_speed,
2464f1043b4SStefan Hajnoczi };
2474f1043b4SStefan Hajnoczi 
2484f1043b4SStefan Hajnoczi int stream_start(BlockDriverState *bs, BlockDriverState *base,
249*c8c3080fSMarcelo Tosatti                  const char *base_id, BlockDriverCompletionFunc *cb,
250*c8c3080fSMarcelo Tosatti                  void *opaque)
2514f1043b4SStefan Hajnoczi {
2524f1043b4SStefan Hajnoczi     StreamBlockJob *s;
2534f1043b4SStefan Hajnoczi     Coroutine *co;
2544f1043b4SStefan Hajnoczi 
2554f1043b4SStefan Hajnoczi     s = block_job_create(&stream_job_type, bs, cb, opaque);
2564f1043b4SStefan Hajnoczi     if (!s) {
2574f1043b4SStefan Hajnoczi         return -EBUSY; /* bs must already be in use */
2584f1043b4SStefan Hajnoczi     }
2594f1043b4SStefan Hajnoczi 
2604f1043b4SStefan Hajnoczi     s->base = base;
261*c8c3080fSMarcelo Tosatti     if (base_id) {
262*c8c3080fSMarcelo Tosatti         pstrcpy(s->backing_file_id, sizeof(s->backing_file_id), base_id);
263*c8c3080fSMarcelo Tosatti     }
2644f1043b4SStefan Hajnoczi 
2654f1043b4SStefan Hajnoczi     co = qemu_coroutine_create(stream_run);
2664f1043b4SStefan Hajnoczi     trace_stream_start(bs, base, s, co, opaque);
2674f1043b4SStefan Hajnoczi     qemu_coroutine_enter(co, s);
2684f1043b4SStefan Hajnoczi     return 0;
2694f1043b4SStefan Hajnoczi }
270