/*
 * Image mirroring
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Paolo Bonzini  <pbonzini@redhat.com>
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "block/blockjob.h"
#include "block/block_int.h"
#include "sysemu/block-backend.h"
#include "qapi/error.h"
#include "qapi/qmp/qerror.h"
#include "qemu/ratelimit.h"
#include "qemu/bitmap.h"

#define SLICE_TIME    100000000ULL /* ns */
#define MAX_IN_FLIGHT 16
#define DEFAULT_MIRROR_BUF_SIZE   (10 << 20)

/* The mirroring buffer is a list of granularity-sized chunks.
 * Free chunks are organized in a list.
 */
typedef struct MirrorBuffer {
    QSIMPLEQ_ENTRY(MirrorBuffer) next;
} MirrorBuffer;

typedef struct MirrorBlockJob {
    BlockJob common;
    RateLimit limit;
    BlockBackend *target;
    BlockDriverState *base;
    /* The name of the graph node to replace */
    char *replaces;
    /* The BDS to replace */
    BlockDriverState *to_replace;
    /* Used to block operations on the drive-mirror-replace target */
    Error *replace_blocker;
    bool is_none_mode;
    BlockMirrorBackingMode backing_mode;
    BlockdevOnError on_source_error, on_target_error;
    bool synced;
    bool should_complete;
    int64_t granularity;
    size_t buf_size;
    int64_t bdev_length;
    unsigned long *cow_bitmap;
    BdrvDirtyBitmap *dirty_bitmap;
    HBitmapIter hbi;
    uint8_t *buf;
    QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
    int buf_free_count;

    uint64_t last_pause_ns;
    unsigned long *in_flight_bitmap;
    int in_flight;
    int64_t sectors_in_flight;
    int ret;
    bool unmap;
    bool waiting_for_io;
    int target_cluster_sectors;
    int max_iov;
} MirrorBlockJob;

typedef struct MirrorOp {
    MirrorBlockJob *s;
    QEMUIOVector qiov;
    int64_t sector_num;
    int nb_sectors;
} MirrorOp;

static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
                                            int error)
{
    s->synced = false;
    if (read) {
        return block_job_error_action(&s->common, s->on_source_error,
                                      true, error);
    } else {
        return block_job_error_action(&s->common, s->on_target_error,
                                      false, error);
    }
}

static void mirror_iteration_done(MirrorOp *op, int ret)
{
    MirrorBlockJob *s = op->s;
    struct iovec *iov;
    int64_t chunk_num;
    int i, nb_chunks, sectors_per_chunk;

    trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret);

    s->in_flight--;
    s->sectors_in_flight -= op->nb_sectors;
    iov = op->qiov.iov;
    for (i = 0; i < op->qiov.niov; i++) {
        MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next);
        s->buf_free_count++;
    }

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    chunk_num = op->sector_num / sectors_per_chunk;
    nb_chunks = DIV_ROUND_UP(op->nb_sectors, sectors_per_chunk);
    bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
    if (ret >= 0) {
        if (s->cow_bitmap) {
            bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
        }
        s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE;
    }

    qemu_iovec_destroy(&op->qiov);
    g_free(op);

    if (s->waiting_for_io) {
        qemu_coroutine_enter(s->common.co);
    }
}
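
/* AIO completion callback for writes to the target.  On failure, the
 * affected range is marked dirty again so that a later iteration retries
 * it, and the on-target-error policy decides whether the job fails. */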
static void mirror_write_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
    if (ret < 0) {
        BlockErrorAction action;

        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
        action = mirror_error_action(s, false, -ret);
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
        }
    }
    mirror_iteration_done(op, ret);
}

static void mirror_read_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;
    if (ret < 0) {
        BlockErrorAction action;

        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
        action = mirror_error_action(s, true, -ret);
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
        }

        mirror_iteration_done(op, ret);
        return;
    }
    blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
                    0, mirror_write_complete, op);
}

static inline void mirror_clip_sectors(MirrorBlockJob *s,
                                       int64_t sector_num,
                                       int *nb_sectors)
{
    *nb_sectors = MIN(*nb_sectors,
                      s->bdev_length / BDRV_SECTOR_SIZE - sector_num);
}

/* Round sector_num and/or nb_sectors to target cluster if COW is needed, and
 * return the offset of the adjusted tail sector against the original. */
static int mirror_cow_align(MirrorBlockJob *s,
                            int64_t *sector_num,
                            int *nb_sectors)
{
    bool need_cow;
    int ret = 0;
    int chunk_sectors = s->granularity >> BDRV_SECTOR_BITS;
    int64_t align_sector_num = *sector_num;
    int align_nb_sectors = *nb_sectors;
    int max_sectors = chunk_sectors * s->max_iov;

    need_cow = !test_bit(*sector_num / chunk_sectors, s->cow_bitmap);
    need_cow |= !test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors,
                          s->cow_bitmap);
    if (need_cow) {
        bdrv_round_sectors_to_clusters(blk_bs(s->target), *sector_num,
                                       *nb_sectors, &align_sector_num,
                                       &align_nb_sectors);
    }

    if (align_nb_sectors > max_sectors) {
        align_nb_sectors = max_sectors;
        if (need_cow) {
            align_nb_sectors = QEMU_ALIGN_DOWN(align_nb_sectors,
                                               s->target_cluster_sectors);
        }
    }
    /* Clipping may result in align_nb_sectors unaligned to chunk boundary, but
     * that doesn't matter because it's already the end of the source image. */
    mirror_clip_sectors(s, align_sector_num, &align_nb_sectors);

    ret = align_sector_num + align_nb_sectors - (*sector_num + *nb_sectors);
    *sector_num = align_sector_num;
    *nb_sectors = align_nb_sectors;
    assert(ret >= 0);
    return ret;
}
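
/* Yield until at least one in-flight operation completes;
 * mirror_iteration_done() re-enters the coroutine while
 * s->waiting_for_io is set. */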
static inline void mirror_wait_for_io(MirrorBlockJob *s)
{
    assert(!s->waiting_for_io);
    s->waiting_for_io = true;
    qemu_coroutine_yield();
    s->waiting_for_io = false;
}

/* Submit async read while handling COW.
 * Returns: The number of sectors copied after and including sector_num,
 *          excluding any sectors copied prior to sector_num due to alignment.
 *          This will be nb_sectors if no alignment is necessary, or
 *          (new_end - sector_num) if tail is rounded up or down due to
 *          alignment or buffer limit.
 */
static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num,
                          int nb_sectors)
{
    BlockBackend *source = s->common.blk;
    int sectors_per_chunk, nb_chunks;
    int ret;
    MirrorOp *op;
    int max_sectors;

    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    max_sectors = sectors_per_chunk * s->max_iov;

    /* We can only handle as much as buf_size at a time. */
    nb_sectors = MIN(s->buf_size >> BDRV_SECTOR_BITS, nb_sectors);
    nb_sectors = MIN(max_sectors, nb_sectors);
    assert(nb_sectors);
    ret = nb_sectors;

    if (s->cow_bitmap) {
        ret += mirror_cow_align(s, &sector_num, &nb_sectors);
    }
    assert(nb_sectors << BDRV_SECTOR_BITS <= s->buf_size);
    /* The sector range must meet granularity because:
     * 1) Caller passes in aligned values;
     * 2) mirror_cow_align is used only when target cluster is larger. */
    assert(!(sector_num % sectors_per_chunk));
    nb_chunks = DIV_ROUND_UP(nb_sectors, sectors_per_chunk);

    while (s->buf_free_count < nb_chunks) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        mirror_wait_for_io(s);
    }

    /* Allocate a MirrorOp that is used as an AIO callback. */
    op = g_new(MirrorOp, 1);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    /* Now make a QEMUIOVector taking enough granularity-sized chunks
     * from s->buf_free.
     */
    qemu_iovec_init(&op->qiov, nb_chunks);
    while (nb_chunks-- > 0) {
        MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
        size_t remaining = nb_sectors * BDRV_SECTOR_SIZE - op->qiov.size;

        QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
        s->buf_free_count--;
        qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining));
    }

    /* Copy the dirty cluster. */
    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);

    blk_aio_preadv(source, sector_num * BDRV_SECTOR_SIZE, &op->qiov, 0,
                   mirror_read_complete, op);
    return ret;
}

static void mirror_do_zero_or_discard(MirrorBlockJob *s,
                                      int64_t sector_num,
                                      int nb_sectors,
                                      bool is_discard)
{
    MirrorOp *op;

    /* Allocate a MirrorOp that is used as an AIO callback. The qiov is zeroed
     * so the freeing in mirror_iteration_done is a nop. */
    op = g_new0(MirrorOp, 1);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    if (is_discard) {
        blk_aio_pdiscard(s->target, sector_num << BDRV_SECTOR_BITS,
                         op->nb_sectors << BDRV_SECTOR_BITS,
                         mirror_write_complete, op);
    } else {
        blk_aio_pwrite_zeroes(s->target, sector_num * BDRV_SECTOR_SIZE,
                              op->nb_sectors * BDRV_SECTOR_SIZE,
                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
                              mirror_write_complete, op);
    }
}
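
/* Perform one iteration of the copy loop: pick the next dirty chunk,
 * extend it with consecutive dirty chunks up to buf_size, and submit a
 * copy, write-zeroes or discard operation for each extent.  Returns the
 * delay, in nanoseconds, requested by rate limiting. */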
static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
{
    BlockDriverState *source = blk_bs(s->common.blk);
    int64_t sector_num, first_chunk;
    uint64_t delay_ns = 0;
    /* At least the first dirty chunk is mirrored in one iteration. */
    int nb_chunks = 1;
    int64_t end = s->bdev_length / BDRV_SECTOR_SIZE;
    int sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    bool write_zeroes_ok = bdrv_can_write_zeroes_with_unmap(blk_bs(s->target));

    sector_num = hbitmap_iter_next(&s->hbi);
    if (sector_num < 0) {
        bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
        sector_num = hbitmap_iter_next(&s->hbi);
        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
        assert(sector_num >= 0);
    }

    first_chunk = sector_num / sectors_per_chunk;
    while (test_bit(first_chunk, s->in_flight_bitmap)) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        mirror_wait_for_io(s);
    }

    block_job_pause_point(&s->common);

    /* Find the number of consecutive dirty chunks following the first dirty
     * one, and wait for in flight requests in them. */
    while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) {
        int64_t hbitmap_next;
        int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk;
        int64_t next_chunk = next_sector / sectors_per_chunk;
        if (next_sector >= end ||
            !bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) {
            break;
        }
        if (test_bit(next_chunk, s->in_flight_bitmap)) {
            break;
        }

        hbitmap_next = hbitmap_iter_next(&s->hbi);
        if (hbitmap_next > next_sector || hbitmap_next < 0) {
            /* The bitmap iterator's cache is stale, refresh it */
            bdrv_set_dirty_iter(&s->hbi, next_sector);
            hbitmap_next = hbitmap_iter_next(&s->hbi);
        }
        assert(hbitmap_next == next_sector);
        nb_chunks++;
    }

    /* Clear dirty bits before querying the block status, because
     * calling bdrv_get_block_status_above could yield - if some blocks are
     * marked dirty in this window, we need to know.
     */
    bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num,
                            nb_chunks * sectors_per_chunk);
    bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks);
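
    /* Dispatch one operation per extent reported by
     * bdrv_get_block_status_above(): actual data is copied, while ranges
     * that read as zeroes or are unallocated across the backing chain are
     * written as zeroes or discarded on the target. */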
    while (nb_chunks > 0 && sector_num < end) {
        int ret;
        int io_sectors, io_sectors_acct;
        BlockDriverState *file;
        enum MirrorMethod {
            MIRROR_METHOD_COPY,
            MIRROR_METHOD_ZERO,
            MIRROR_METHOD_DISCARD
        } mirror_method = MIRROR_METHOD_COPY;

        assert(!(sector_num % sectors_per_chunk));
        ret = bdrv_get_block_status_above(source, NULL, sector_num,
                                          nb_chunks * sectors_per_chunk,
                                          &io_sectors, &file);
        if (ret < 0) {
            io_sectors = nb_chunks * sectors_per_chunk;
        }

        io_sectors -= io_sectors % sectors_per_chunk;
        if (io_sectors < sectors_per_chunk) {
            io_sectors = sectors_per_chunk;
        } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {
            int64_t target_sector_num;
            int target_nb_sectors;
            bdrv_round_sectors_to_clusters(blk_bs(s->target), sector_num,
                                           io_sectors, &target_sector_num,
                                           &target_nb_sectors);
            if (target_sector_num == sector_num &&
                target_nb_sectors == io_sectors) {
                mirror_method = ret & BDRV_BLOCK_ZERO ?
                                    MIRROR_METHOD_ZERO :
                                    MIRROR_METHOD_DISCARD;
            }
        }

        while (s->in_flight >= MAX_IN_FLIGHT) {
            trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
            mirror_wait_for_io(s);
        }

        mirror_clip_sectors(s, sector_num, &io_sectors);
        switch (mirror_method) {
        case MIRROR_METHOD_COPY:
            io_sectors = mirror_do_read(s, sector_num, io_sectors);
            io_sectors_acct = io_sectors;
            break;
        case MIRROR_METHOD_ZERO:
        case MIRROR_METHOD_DISCARD:
            mirror_do_zero_or_discard(s, sector_num, io_sectors,
                                      mirror_method == MIRROR_METHOD_DISCARD);
            if (write_zeroes_ok) {
                io_sectors_acct = 0;
            } else {
                io_sectors_acct = io_sectors;
            }
            break;
        default:
            abort();
        }
        assert(io_sectors);
        sector_num += io_sectors;
        nb_chunks -= DIV_ROUND_UP(io_sectors, sectors_per_chunk);
        if (s->common.speed) {
            delay_ns = ratelimit_calculate_delay(&s->limit, io_sectors_acct);
        }
    }
    return delay_ns;
}

static void mirror_free_init(MirrorBlockJob *s)
{
    int granularity = s->granularity;
    size_t buf_size = s->buf_size;
    uint8_t *buf = s->buf;

    assert(s->buf_free_count == 0);
    QSIMPLEQ_INIT(&s->buf_free);
    while (buf_size != 0) {
        MirrorBuffer *cur = (MirrorBuffer *)buf;
        QSIMPLEQ_INSERT_TAIL(&s->buf_free, cur, next);
        s->buf_free_count++;
        buf_size -= granularity;
        buf += granularity;
    }
}

static void mirror_drain(MirrorBlockJob *s)
{
    while (s->in_flight > 0) {
        mirror_wait_for_io(s);
    }
}
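
/* Completion work deferred to the main loop via
 * block_job_defer_to_main_loop(): swap the target into the graph in place
 * of the source (or of the node named by @replaces) if requested, then
 * drop all references taken by the job. */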
typedef struct {
    int ret;
} MirrorExitData;

static void mirror_exit(BlockJob *job, void *opaque)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    MirrorExitData *data = opaque;
    AioContext *replace_aio_context = NULL;
    BlockDriverState *src = blk_bs(s->common.blk);
    BlockDriverState *target_bs = blk_bs(s->target);

    /* Make sure that the source BDS doesn't go away before we have called
     * block_job_completed(). */
    bdrv_ref(src);

    if (s->to_replace) {
        replace_aio_context = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(replace_aio_context);
    }

    if (s->should_complete && data->ret == 0) {
        BlockDriverState *to_replace = src;
        if (s->to_replace) {
            to_replace = s->to_replace;
        }

        if (bdrv_get_flags(target_bs) != bdrv_get_flags(to_replace)) {
            bdrv_reopen(target_bs, bdrv_get_flags(to_replace), NULL);
        }

        /* The mirror job has no requests in flight any more, but we need to
         * drain other potential users of the BDS before changing the graph. */
        bdrv_drained_begin(target_bs);
        bdrv_replace_in_backing_chain(to_replace, target_bs);
        bdrv_drained_end(target_bs);

        /* We just changed the BDS the job BB refers to */
        blk_remove_bs(job->blk);
        blk_insert_bs(job->blk, src);
    }
    if (s->to_replace) {
        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
        error_free(s->replace_blocker);
        bdrv_unref(s->to_replace);
    }
    if (replace_aio_context) {
        aio_context_release(replace_aio_context);
    }
    g_free(s->replaces);
    bdrv_op_unblock_all(target_bs, s->common.blocker);
    blk_unref(s->target);
    block_job_completed(&s->common, data->ret);
    g_free(data);
    bdrv_drained_end(src);
    bdrv_unref(src);
}

static void mirror_throttle(MirrorBlockJob *s)
{
    int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    if (now - s->last_pause_ns > SLICE_TIME) {
        s->last_pause_ns = now;
        block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0);
    } else {
        block_job_pause_point(&s->common);
    }
}

static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
{
    int64_t sector_num, end;
    BlockDriverState *base = s->base;
    BlockDriverState *bs = blk_bs(s->common.blk);
    BlockDriverState *target_bs = blk_bs(s->target);
    int ret, n;

    end = s->bdev_length / BDRV_SECTOR_SIZE;

    if (base == NULL && !bdrv_has_zero_init(target_bs)) {
        if (!bdrv_can_write_zeroes_with_unmap(target_bs)) {
            bdrv_set_dirty_bitmap(s->dirty_bitmap, 0, end);
            return 0;
        }

        for (sector_num = 0; sector_num < end; ) {
            int nb_sectors = MIN(end - sector_num,
                QEMU_ALIGN_DOWN(INT_MAX, s->granularity) >> BDRV_SECTOR_BITS);

            mirror_throttle(s);

            if (block_job_is_cancelled(&s->common)) {
                return 0;
            }

            if (s->in_flight >= MAX_IN_FLIGHT) {
                trace_mirror_yield(s, s->in_flight, s->buf_free_count, -1);
                mirror_wait_for_io(s);
                continue;
            }

            mirror_do_zero_or_discard(s, sector_num, nb_sectors, false);
            sector_num += nb_sectors;
        }

        mirror_drain(s);
    }

    /* First part, loop on the sectors and initialize the dirty bitmap. */
    for (sector_num = 0; sector_num < end; ) {
        /* Just to make sure we are not exceeding int limit. */
        int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS,
                             end - sector_num);

        mirror_throttle(s);

        if (block_job_is_cancelled(&s->common)) {
            return 0;
        }

        ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n);
        if (ret < 0) {
            return ret;
        }

        assert(n > 0);
        if (ret == 1) {
            bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
        }
        sector_num += n;
    }
    return 0;
}
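
/* The body of the mirror job coroutine.  It populates the dirty bitmap
 * according to the sync mode, then keeps copying dirty data to the target
 * until the job is completed or cancelled. */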
static void coroutine_fn mirror_run(void *opaque)
{
    MirrorBlockJob *s = opaque;
    MirrorExitData *data;
    BlockDriverState *bs = blk_bs(s->common.blk);
    BlockDriverState *target_bs = blk_bs(s->target);
    int64_t length;
    BlockDriverInfo bdi;
    char backing_filename[2]; /* we only need 2 characters because we are only
                                 checking for an empty string */
    int ret = 0;
    int target_cluster_size = BDRV_SECTOR_SIZE;

    if (block_job_is_cancelled(&s->common)) {
        goto immediate_exit;
    }

    s->bdev_length = bdrv_getlength(bs);
    if (s->bdev_length < 0) {
        ret = s->bdev_length;
        goto immediate_exit;
    } else if (s->bdev_length == 0) {
        /* Report BLOCK_JOB_READY and wait for completion. */
        block_job_event_ready(&s->common);
        s->synced = true;
        while (!block_job_is_cancelled(&s->common) && !s->should_complete) {
            block_job_yield(&s->common);
        }
        s->common.cancelled = false;
        goto immediate_exit;
    }

    length = DIV_ROUND_UP(s->bdev_length, s->granularity);
    s->in_flight_bitmap = bitmap_new(length);

    /* If we have no backing file yet in the destination, we cannot let
     * the destination do COW.  Instead, we copy sectors around the
     * dirty data if needed.  We need a bitmap to do that.
     */
    bdrv_get_backing_filename(target_bs, backing_filename,
                              sizeof(backing_filename));
    if (!bdrv_get_info(target_bs, &bdi) && bdi.cluster_size) {
        target_cluster_size = bdi.cluster_size;
    }
    if (backing_filename[0] && !target_bs->backing
        && s->granularity < target_cluster_size) {
        s->buf_size = MAX(s->buf_size, target_cluster_size);
        s->cow_bitmap = bitmap_new(length);
    }
    s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS;
    s->max_iov = MIN(bs->bl.max_iov, target_bs->bl.max_iov);

    s->buf = qemu_try_blockalign(bs, s->buf_size);
    if (s->buf == NULL) {
        ret = -ENOMEM;
        goto immediate_exit;
    }

    mirror_free_init(s);

    s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    if (!s->is_none_mode) {
        ret = mirror_dirty_init(s);
        if (ret < 0 || block_job_is_cancelled(&s->common)) {
            goto immediate_exit;
        }
    }

    bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
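
    /* The main copy loop.  Every pass either starts more I/O, yields
     * waiting for in-flight I/O, or sleeps; once the source is clean and
     * all requests have drained, completion is reported. */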
    for (;;) {
        uint64_t delay_ns = 0;
        int64_t cnt, delta;
        bool should_complete;

        if (s->ret < 0) {
            ret = s->ret;
            goto immediate_exit;
        }

        block_job_pause_point(&s->common);

        cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        /* s->common.offset contains the number of bytes already processed so
         * far, cnt is the number of dirty sectors remaining and
         * s->sectors_in_flight is the number of sectors currently being
         * processed; together those are the current total operation length */
        s->common.len = s->common.offset +
                        (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE;

        /* Note that even when no rate limit is applied we need to yield
         * periodically with no pending I/O so that bdrv_drain_all() returns.
         * We do so every SLICE_TIME nanoseconds, or when there is an error,
         * or when the source is clean, whichever comes first.
         */
        delta = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->last_pause_ns;
        if (delta < SLICE_TIME &&
            s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
            if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 ||
                (cnt == 0 && s->in_flight > 0)) {
                trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
                mirror_wait_for_io(s);
                continue;
            } else if (cnt != 0) {
                delay_ns = mirror_iteration(s);
            }
        }

        should_complete = false;
        if (s->in_flight == 0 && cnt == 0) {
            trace_mirror_before_flush(s);
            ret = blk_flush(s->target);
            if (ret < 0) {
                if (mirror_error_action(s, false, -ret) ==
                    BLOCK_ERROR_ACTION_REPORT) {
                    goto immediate_exit;
                }
            } else {
                /* We're out of the streaming phase.  From now on, if the job
                 * is cancelled we will actually complete all pending I/O and
                 * report completion.  This way, block-job-cancel will leave
                 * the target in a consistent state.
                 */
                if (!s->synced) {
                    block_job_event_ready(&s->common);
                    s->synced = true;
                }

                should_complete = s->should_complete ||
                    block_job_is_cancelled(&s->common);
                cnt = bdrv_get_dirty_count(s->dirty_bitmap);
            }
        }

        if (cnt == 0 && should_complete) {
            /* The dirty bitmap is not updated while operations are pending.
             * If we're about to exit, wait for pending operations before
             * calling bdrv_get_dirty_count(), or we may exit while the
             * source has dirty data to copy!
             *
             * Note that I/O can be submitted by the guest while
             * mirror_populate runs.
             */
            trace_mirror_before_drain(s, cnt);
            bdrv_co_drain(bs);
            cnt = bdrv_get_dirty_count(s->dirty_bitmap);
        }

        ret = 0;
        trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
        if (!s->synced) {
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
            if (block_job_is_cancelled(&s->common)) {
                break;
            }
        } else if (!should_complete) {
            delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
            block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
        } else if (cnt == 0) {
            /* The two disks are in sync.  Exit and report successful
             * completion.
             */
            assert(QLIST_EMPTY(&bs->tracked_requests));
            s->common.cancelled = false;
            break;
        }
        s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    }
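
    /* Error paths jump here directly; cancellation and successful
     * completion break out of the loop above and fall through. */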
immediate_exit:
    if (s->in_flight > 0) {
        /* We get here only if something went wrong.  Either the job failed,
         * or it was cancelled prematurely so that we do not guarantee that
         * the target is a copy of the source.
         */
        assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common)));
        mirror_drain(s);
    }

    assert(s->in_flight == 0);
    qemu_vfree(s->buf);
    g_free(s->cow_bitmap);
    g_free(s->in_flight_bitmap);
    bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);

    data = g_malloc(sizeof(*data));
    data->ret = ret;
    /* Before we switch to target in mirror_exit, make sure data doesn't
     * change. */
    bdrv_drained_begin(bs);
    block_job_defer_to_main_loop(&s->common, mirror_exit, data);
}

static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    if (speed < 0) {
        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}
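
/* Implements block-job-complete: make sure the target is ready to replace
 * the source (open or adjust its backing chain, block the @replaces node),
 * then ask the job coroutine to finish. */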
static void mirror_complete(BlockJob *job, Error **errp)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
    BlockDriverState *src, *target;

    src = blk_bs(job->blk);
    target = blk_bs(s->target);

    if (!s->synced) {
        error_setg(errp, "The active block job '%s' cannot be completed",
                   job->id);
        return;
    }

    if (s->backing_mode == MIRROR_OPEN_BACKING_CHAIN) {
        int ret;

        assert(!target->backing);
        ret = bdrv_open_backing_file(target, NULL, "backing", errp);
        if (ret < 0) {
            return;
        }
    }

    /* block all operations on to_replace bs */
    if (s->replaces) {
        AioContext *replace_aio_context;

        s->to_replace = bdrv_find_node(s->replaces);
        if (!s->to_replace) {
            error_setg(errp, "Node name '%s' not found", s->replaces);
            return;
        }

        replace_aio_context = bdrv_get_aio_context(s->to_replace);
        aio_context_acquire(replace_aio_context);

        error_setg(&s->replace_blocker,
                   "block device is in use by block-job-complete");
        bdrv_op_block_all(s->to_replace, s->replace_blocker);
        bdrv_ref(s->to_replace);

        aio_context_release(replace_aio_context);
    }

    if (s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
        BlockDriverState *backing = s->is_none_mode ? src : s->base;
        if (backing_bs(target) != backing) {
            bdrv_set_backing_hd(target, backing);
        }
    }

    s->should_complete = true;
    block_job_enter(&s->common);
}

/* There is no matching mirror_resume() because mirror_run() will begin
 * iterating again when the job is resumed.
 */
static void coroutine_fn mirror_pause(BlockJob *job)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    mirror_drain(s);
}

static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context)
{
    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);

    blk_set_aio_context(s->target, new_context);
}

static const BlockJobDriver mirror_job_driver = {
    .instance_size          = sizeof(MirrorBlockJob),
    .job_type               = BLOCK_JOB_TYPE_MIRROR,
    .set_speed              = mirror_set_speed,
    .complete               = mirror_complete,
    .pause                  = mirror_pause,
    .attached_aio_context   = mirror_attached_aio_context,
};

static const BlockJobDriver commit_active_job_driver = {
    .instance_size          = sizeof(MirrorBlockJob),
    .job_type               = BLOCK_JOB_TYPE_COMMIT,
    .set_speed              = mirror_set_speed,
    .complete               = mirror_complete,
    .pause                  = mirror_pause,
    .attached_aio_context   = mirror_attached_aio_context,
};

static void mirror_start_job(const char *job_id, BlockDriverState *bs,
                             BlockDriverState *target, const char *replaces,
                             int64_t speed, uint32_t granularity,
                             int64_t buf_size,
                             BlockMirrorBackingMode backing_mode,
                             BlockdevOnError on_source_error,
                             BlockdevOnError on_target_error,
                             bool unmap,
                             BlockCompletionFunc *cb,
                             void *opaque, Error **errp,
                             const BlockJobDriver *driver,
                             bool is_none_mode, BlockDriverState *base)
{
    MirrorBlockJob *s;

    if (granularity == 0) {
        granularity = bdrv_get_default_bitmap_granularity(target);
    }

    assert((granularity & (granularity - 1)) == 0);

    if (buf_size < 0) {
        error_setg(errp, "Invalid parameter 'buf-size'");
        return;
    }

    if (buf_size == 0) {
        buf_size = DEFAULT_MIRROR_BUF_SIZE;
    }

    s = block_job_create(job_id, driver, bs, speed, cb, opaque, errp);
    if (!s) {
        return;
    }

    s->target = blk_new();
    blk_insert_bs(s->target, target);

    s->replaces = g_strdup(replaces);
    s->on_source_error = on_source_error;
    s->on_target_error = on_target_error;
    s->is_none_mode = is_none_mode;
    s->backing_mode = backing_mode;
    s->base = base;
    s->granularity = granularity;
    s->buf_size = ROUND_UP(buf_size, granularity);
    s->unmap = unmap;

    s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
    if (!s->dirty_bitmap) {
        g_free(s->replaces);
        blk_unref(s->target);
        block_job_unref(&s->common);
        return;
    }

    bdrv_op_block_all(target, s->common.blocker);

    s->common.co = qemu_coroutine_create(mirror_run, s);
    trace_mirror_start(bs, s, s->common.co, opaque);
    qemu_coroutine_enter(s->common.co);
}
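
/* Public entry point used by the drive-mirror and blockdev-mirror commands.
 * As a rough sketch (hypothetical caller state; bs and target_bs stand in
 * for nodes obtained elsewhere), a full sync with default granularity and
 * buffer size might be started as:
 *
 *     mirror_start("job0", bs, target_bs, NULL, 0, 0, 0,
 *                  MIRROR_SYNC_MODE_FULL, MIRROR_OPEN_BACKING_CHAIN,
 *                  BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
 *                  true, NULL, NULL, errp);
 */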
void mirror_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *target, const char *replaces,
                  int64_t speed, uint32_t granularity, int64_t buf_size,
                  MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  bool unmap,
                  BlockCompletionFunc *cb,
                  void *opaque, Error **errp)
{
    bool is_none_mode;
    BlockDriverState *base;

    if (mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        error_setg(errp, "Sync mode 'incremental' not supported");
        return;
    }
    is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
    base = mode == MIRROR_SYNC_MODE_TOP ? backing_bs(bs) : NULL;
    mirror_start_job(job_id, bs, target, replaces,
                     speed, granularity, buf_size, backing_mode,
                     on_source_error, on_target_error, unmap, cb, opaque, errp,
                     &mirror_job_driver, is_none_mode, base);
}

void commit_active_start(const char *job_id, BlockDriverState *bs,
                         BlockDriverState *base, int64_t speed,
                         BlockdevOnError on_error,
                         BlockCompletionFunc *cb,
                         void *opaque, Error **errp)
{
    int64_t length, base_length;
    int orig_base_flags;
    int ret;
    Error *local_err = NULL;

    orig_base_flags = bdrv_get_flags(base);

    if (bdrv_reopen(base, bs->open_flags, errp)) {
        return;
    }

    length = bdrv_getlength(bs);
    if (length < 0) {
        error_setg_errno(errp, -length,
                         "Unable to determine length of %s", bs->filename);
        goto error_restore_flags;
    }

    base_length = bdrv_getlength(base);
    if (base_length < 0) {
        error_setg_errno(errp, -base_length,
                         "Unable to determine length of %s", base->filename);
        goto error_restore_flags;
    }

    if (length > base_length) {
        ret = bdrv_truncate(base, length);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Top image %s is larger than base image %s, and "
                             "resize of base image failed",
                             bs->filename, base->filename);
            goto error_restore_flags;
        }
    }

    mirror_start_job(job_id, bs, base, NULL, speed, 0, 0,
                     MIRROR_LEAVE_BACKING_CHAIN,
                     on_error, on_error, false, cb, opaque, &local_err,
                     &commit_active_job_driver, false, base);
    if (local_err) {
        error_propagate(errp, local_err);
        goto error_restore_flags;
    }

    return;

error_restore_flags:
    /* ignore error and errp for bdrv_reopen, because we want to propagate
     * the original error */
    bdrv_reopen(base, orig_base_flags, NULL);
    return;
}