1 /* 2 * QEMU live block migration 3 * 4 * Copyright IBM, Corp. 2009 5 * 6 * Authors: 7 * Liran Schour <lirans@il.ibm.com> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2. See 10 * the COPYING file in the top-level directory. 11 * 12 * Contributions after 2012-01-13 are licensed under the terms of the 13 * GNU GPL, version 2 or (at your option) any later version. 14 */ 15 16 #include "qemu/osdep.h" 17 #include "qapi/error.h" 18 #include "qemu/error-report.h" 19 #include "qemu/main-loop.h" 20 #include "qemu/cutils.h" 21 #include "qemu/queue.h" 22 #include "block.h" 23 #include "block/dirty-bitmap.h" 24 #include "migration/misc.h" 25 #include "migration.h" 26 #include "migration-stats.h" 27 #include "migration/register.h" 28 #include "qemu-file.h" 29 #include "migration/vmstate.h" 30 #include "sysemu/block-backend.h" 31 #include "trace.h" 32 #include "options.h" 33 34 #define BLK_MIG_BLOCK_SIZE (1ULL << 20) 35 #define BDRV_SECTORS_PER_DIRTY_CHUNK (BLK_MIG_BLOCK_SIZE >> BDRV_SECTOR_BITS) 36 37 #define BLK_MIG_FLAG_DEVICE_BLOCK 0x01 38 #define BLK_MIG_FLAG_EOS 0x02 39 #define BLK_MIG_FLAG_PROGRESS 0x04 40 #define BLK_MIG_FLAG_ZERO_BLOCK 0x08 41 42 #define MAX_IS_ALLOCATED_SEARCH (65536 * BDRV_SECTOR_SIZE) 43 44 #define MAX_IO_BUFFERS 512 45 #define MAX_PARALLEL_IO 16 46 47 typedef struct BlkMigDevState { 48 /* Written during setup phase. Can be read without a lock. */ 49 BlockBackend *blk; 50 char *blk_name; 51 int shared_base; 52 int64_t total_sectors; 53 QSIMPLEQ_ENTRY(BlkMigDevState) entry; 54 Error *blocker; 55 56 /* Only used by migration thread. Does not need a lock. */ 57 int bulk_completed; 58 int64_t cur_sector; 59 int64_t cur_dirty; 60 61 /* Data in the aio_bitmap is protected by block migration lock. 62 * Allocation and free happen during setup and cleanup respectively. 63 */ 64 unsigned long *aio_bitmap; 65 66 /* Protected by block migration lock. */ 67 int64_t completed_sectors; 68 69 /* During migration this is protected by iothread lock / AioContext. 70 * Allocation and free happen during setup and cleanup respectively. 71 */ 72 BdrvDirtyBitmap *dirty_bitmap; 73 } BlkMigDevState; 74 75 typedef struct BlkMigBlock { 76 /* Only used by migration thread. */ 77 uint8_t *buf; 78 BlkMigDevState *bmds; 79 int64_t sector; 80 int nr_sectors; 81 QEMUIOVector qiov; 82 BlockAIOCB *aiocb; 83 84 /* Protected by block migration lock. */ 85 int ret; 86 QSIMPLEQ_ENTRY(BlkMigBlock) entry; 87 } BlkMigBlock; 88 89 typedef struct BlkMigState { 90 QSIMPLEQ_HEAD(, BlkMigDevState) bmds_list; 91 int64_t total_sector_sum; 92 bool zero_blocks; 93 94 /* Protected by lock. */ 95 QSIMPLEQ_HEAD(, BlkMigBlock) blk_list; 96 int submitted; 97 int read_done; 98 99 /* Only used by migration thread. Does not need a lock. */ 100 int transferred; 101 int prev_progress; 102 int bulk_completed; 103 104 /* Lock must be taken _inside_ the iothread lock and any AioContexts. */ 105 QemuMutex lock; 106 } BlkMigState; 107 108 static BlkMigState block_mig_state; 109 110 static void blk_mig_lock(void) 111 { 112 qemu_mutex_lock(&block_mig_state.lock); 113 } 114 115 static void blk_mig_unlock(void) 116 { 117 qemu_mutex_unlock(&block_mig_state.lock); 118 } 119 120 /* Must run outside of the iothread lock during the bulk phase, 121 * or the VM will stall. 122 */ 123 124 static void blk_send(QEMUFile *f, BlkMigBlock * blk) 125 { 126 int len; 127 uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK; 128 129 if (block_mig_state.zero_blocks && 130 buffer_is_zero(blk->buf, BLK_MIG_BLOCK_SIZE)) { 131 flags |= BLK_MIG_FLAG_ZERO_BLOCK; 132 } 133 134 /* sector number and flags */ 135 qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS) 136 | flags); 137 138 /* device name */ 139 len = strlen(blk->bmds->blk_name); 140 qemu_put_byte(f, len); 141 qemu_put_buffer(f, (uint8_t *) blk->bmds->blk_name, len); 142 143 /* if a block is zero we need to flush here since the network 144 * bandwidth is now a lot higher than the storage device bandwidth. 145 * thus if we queue zero blocks we slow down the migration */ 146 if (flags & BLK_MIG_FLAG_ZERO_BLOCK) { 147 qemu_fflush(f); 148 return; 149 } 150 151 qemu_put_buffer(f, blk->buf, BLK_MIG_BLOCK_SIZE); 152 } 153 154 int blk_mig_active(void) 155 { 156 return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list); 157 } 158 159 int blk_mig_bulk_active(void) 160 { 161 return blk_mig_active() && !block_mig_state.bulk_completed; 162 } 163 164 uint64_t blk_mig_bytes_transferred(void) 165 { 166 BlkMigDevState *bmds; 167 uint64_t sum = 0; 168 169 blk_mig_lock(); 170 QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { 171 sum += bmds->completed_sectors; 172 } 173 blk_mig_unlock(); 174 return sum << BDRV_SECTOR_BITS; 175 } 176 177 uint64_t blk_mig_bytes_remaining(void) 178 { 179 return blk_mig_bytes_total() - blk_mig_bytes_transferred(); 180 } 181 182 uint64_t blk_mig_bytes_total(void) 183 { 184 BlkMigDevState *bmds; 185 uint64_t sum = 0; 186 187 QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { 188 sum += bmds->total_sectors; 189 } 190 return sum << BDRV_SECTOR_BITS; 191 } 192 193 194 /* Called with migration lock held. */ 195 196 static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector) 197 { 198 int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK; 199 200 if (sector < bmds->total_sectors) { 201 return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] & 202 (1UL << (chunk % (sizeof(unsigned long) * 8)))); 203 } else { 204 return 0; 205 } 206 } 207 208 /* Called with migration lock held. */ 209 210 static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num, 211 int nb_sectors, int set) 212 { 213 int64_t start, end; 214 unsigned long val, idx, bit; 215 216 start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK; 217 end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK; 218 219 for (; start <= end; start++) { 220 idx = start / (sizeof(unsigned long) * 8); 221 bit = start % (sizeof(unsigned long) * 8); 222 val = bmds->aio_bitmap[idx]; 223 if (set) { 224 val |= 1UL << bit; 225 } else { 226 val &= ~(1UL << bit); 227 } 228 bmds->aio_bitmap[idx] = val; 229 } 230 } 231 232 static void alloc_aio_bitmap(BlkMigDevState *bmds) 233 { 234 int64_t bitmap_size; 235 236 bitmap_size = bmds->total_sectors + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1; 237 bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8; 238 239 bmds->aio_bitmap = g_malloc0(bitmap_size); 240 } 241 242 /* Never hold migration lock when yielding to the main loop! */ 243 244 static void blk_mig_read_cb(void *opaque, int ret) 245 { 246 BlkMigBlock *blk = opaque; 247 248 blk_mig_lock(); 249 blk->ret = ret; 250 251 QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry); 252 bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0); 253 254 block_mig_state.submitted--; 255 block_mig_state.read_done++; 256 assert(block_mig_state.submitted >= 0); 257 blk_mig_unlock(); 258 } 259 260 /* Called with no lock taken. */ 261 262 static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds) 263 { 264 int64_t total_sectors = bmds->total_sectors; 265 int64_t cur_sector = bmds->cur_sector; 266 BlockBackend *bb = bmds->blk; 267 BlkMigBlock *blk; 268 int nr_sectors; 269 int64_t count; 270 271 if (bmds->shared_base) { 272 qemu_mutex_lock_iothread(); 273 aio_context_acquire(blk_get_aio_context(bb)); 274 /* Skip unallocated sectors; intentionally treats failure or 275 * partial sector as an allocated sector */ 276 while (cur_sector < total_sectors && 277 !bdrv_is_allocated(blk_bs(bb), cur_sector * BDRV_SECTOR_SIZE, 278 MAX_IS_ALLOCATED_SEARCH, &count)) { 279 if (count < BDRV_SECTOR_SIZE) { 280 break; 281 } 282 cur_sector += count >> BDRV_SECTOR_BITS; 283 } 284 aio_context_release(blk_get_aio_context(bb)); 285 qemu_mutex_unlock_iothread(); 286 } 287 288 if (cur_sector >= total_sectors) { 289 bmds->cur_sector = bmds->completed_sectors = total_sectors; 290 return 1; 291 } 292 293 bmds->completed_sectors = cur_sector; 294 295 cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1); 296 297 /* we are going to transfer a full block even if it is not allocated */ 298 nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK; 299 300 if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) { 301 nr_sectors = total_sectors - cur_sector; 302 } 303 304 blk = g_new(BlkMigBlock, 1); 305 blk->buf = g_malloc(BLK_MIG_BLOCK_SIZE); 306 blk->bmds = bmds; 307 blk->sector = cur_sector; 308 blk->nr_sectors = nr_sectors; 309 310 qemu_iovec_init_buf(&blk->qiov, blk->buf, nr_sectors * BDRV_SECTOR_SIZE); 311 312 blk_mig_lock(); 313 block_mig_state.submitted++; 314 blk_mig_unlock(); 315 316 /* We do not know if bs is under the main thread (and thus does 317 * not acquire the AioContext when doing AIO) or rather under 318 * dataplane. Thus acquire both the iothread mutex and the 319 * AioContext. 320 * 321 * This is ugly and will disappear when we make bdrv_* thread-safe, 322 * without the need to acquire the AioContext. 323 */ 324 qemu_mutex_lock_iothread(); 325 aio_context_acquire(blk_get_aio_context(bmds->blk)); 326 bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, cur_sector * BDRV_SECTOR_SIZE, 327 nr_sectors * BDRV_SECTOR_SIZE); 328 blk->aiocb = blk_aio_preadv(bb, cur_sector * BDRV_SECTOR_SIZE, &blk->qiov, 329 0, blk_mig_read_cb, blk); 330 aio_context_release(blk_get_aio_context(bmds->blk)); 331 qemu_mutex_unlock_iothread(); 332 333 bmds->cur_sector = cur_sector + nr_sectors; 334 return (bmds->cur_sector >= total_sectors); 335 } 336 337 /* Called with iothread lock taken. */ 338 339 static int set_dirty_tracking(void) 340 { 341 BlkMigDevState *bmds; 342 int ret; 343 344 QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { 345 bmds->dirty_bitmap = bdrv_create_dirty_bitmap(blk_bs(bmds->blk), 346 BLK_MIG_BLOCK_SIZE, 347 NULL, NULL); 348 if (!bmds->dirty_bitmap) { 349 ret = -errno; 350 goto fail; 351 } 352 } 353 return 0; 354 355 fail: 356 QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { 357 if (bmds->dirty_bitmap) { 358 bdrv_release_dirty_bitmap(bmds->dirty_bitmap); 359 } 360 } 361 return ret; 362 } 363 364 /* Called with iothread lock taken. */ 365 366 static void unset_dirty_tracking(void) 367 { 368 BlkMigDevState *bmds; 369 370 QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { 371 if (bmds->dirty_bitmap) { 372 bdrv_release_dirty_bitmap(bmds->dirty_bitmap); 373 } 374 } 375 } 376 377 static int init_blk_migration(QEMUFile *f) 378 { 379 BlockDriverState *bs; 380 BlkMigDevState *bmds; 381 int64_t sectors; 382 BdrvNextIterator it; 383 int i, num_bs = 0; 384 struct { 385 BlkMigDevState *bmds; 386 BlockDriverState *bs; 387 } *bmds_bs; 388 Error *local_err = NULL; 389 int ret; 390 391 GRAPH_RDLOCK_GUARD_MAINLOOP(); 392 393 block_mig_state.submitted = 0; 394 block_mig_state.read_done = 0; 395 block_mig_state.transferred = 0; 396 block_mig_state.total_sector_sum = 0; 397 block_mig_state.prev_progress = -1; 398 block_mig_state.bulk_completed = 0; 399 block_mig_state.zero_blocks = migrate_zero_blocks(); 400 401 for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { 402 num_bs++; 403 } 404 bmds_bs = g_malloc0(num_bs * sizeof(*bmds_bs)); 405 406 for (i = 0, bs = bdrv_first(&it); bs; bs = bdrv_next(&it), i++) { 407 if (bdrv_is_read_only(bs)) { 408 continue; 409 } 410 411 sectors = bdrv_nb_sectors(bs); 412 if (sectors <= 0) { 413 ret = sectors; 414 bdrv_next_cleanup(&it); 415 goto out; 416 } 417 418 bmds = g_new0(BlkMigDevState, 1); 419 bmds->blk = blk_new(qemu_get_aio_context(), 420 BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL); 421 bmds->blk_name = g_strdup(bdrv_get_device_name(bs)); 422 bmds->bulk_completed = 0; 423 bmds->total_sectors = sectors; 424 bmds->completed_sectors = 0; 425 bmds->shared_base = migrate_block_incremental(); 426 427 assert(i < num_bs); 428 bmds_bs[i].bmds = bmds; 429 bmds_bs[i].bs = bs; 430 431 block_mig_state.total_sector_sum += sectors; 432 433 if (bmds->shared_base) { 434 trace_migration_block_init_shared(bdrv_get_device_name(bs)); 435 } else { 436 trace_migration_block_init_full(bdrv_get_device_name(bs)); 437 } 438 439 QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry); 440 } 441 442 /* Can only insert new BDSes now because doing so while iterating block 443 * devices may end up in a deadlock (iterating the new BDSes, too). */ 444 for (i = 0; i < num_bs; i++) { 445 bmds = bmds_bs[i].bmds; 446 bs = bmds_bs[i].bs; 447 448 if (bmds) { 449 ret = blk_insert_bs(bmds->blk, bs, &local_err); 450 if (ret < 0) { 451 error_report_err(local_err); 452 goto out; 453 } 454 455 alloc_aio_bitmap(bmds); 456 error_setg(&bmds->blocker, "block device is in use by migration"); 457 bdrv_op_block_all(bs, bmds->blocker); 458 } 459 } 460 461 ret = 0; 462 out: 463 g_free(bmds_bs); 464 return ret; 465 } 466 467 /* Called with no lock taken. */ 468 469 static int blk_mig_save_bulked_block(QEMUFile *f) 470 { 471 int64_t completed_sector_sum = 0; 472 BlkMigDevState *bmds; 473 int progress; 474 int ret = 0; 475 476 QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { 477 if (bmds->bulk_completed == 0) { 478 if (mig_save_device_bulk(f, bmds) == 1) { 479 /* completed bulk section for this device */ 480 bmds->bulk_completed = 1; 481 } 482 completed_sector_sum += bmds->completed_sectors; 483 ret = 1; 484 break; 485 } else { 486 completed_sector_sum += bmds->completed_sectors; 487 } 488 } 489 490 if (block_mig_state.total_sector_sum != 0) { 491 progress = completed_sector_sum * 100 / 492 block_mig_state.total_sector_sum; 493 } else { 494 progress = 100; 495 } 496 if (progress != block_mig_state.prev_progress) { 497 block_mig_state.prev_progress = progress; 498 qemu_put_be64(f, (progress << BDRV_SECTOR_BITS) 499 | BLK_MIG_FLAG_PROGRESS); 500 trace_migration_block_progression(progress); 501 } 502 503 return ret; 504 } 505 506 static void blk_mig_reset_dirty_cursor(void) 507 { 508 BlkMigDevState *bmds; 509 510 QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { 511 bmds->cur_dirty = 0; 512 } 513 } 514 515 /* Called with iothread lock and AioContext taken. */ 516 517 static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds, 518 int is_async) 519 { 520 BlkMigBlock *blk; 521 int64_t total_sectors = bmds->total_sectors; 522 int64_t sector; 523 int nr_sectors; 524 int ret = -EIO; 525 526 for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) { 527 blk_mig_lock(); 528 if (bmds_aio_inflight(bmds, sector)) { 529 blk_mig_unlock(); 530 blk_drain(bmds->blk); 531 } else { 532 blk_mig_unlock(); 533 } 534 bdrv_dirty_bitmap_lock(bmds->dirty_bitmap); 535 if (bdrv_dirty_bitmap_get_locked(bmds->dirty_bitmap, 536 sector * BDRV_SECTOR_SIZE)) { 537 if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) { 538 nr_sectors = total_sectors - sector; 539 } else { 540 nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK; 541 } 542 bdrv_reset_dirty_bitmap_locked(bmds->dirty_bitmap, 543 sector * BDRV_SECTOR_SIZE, 544 nr_sectors * BDRV_SECTOR_SIZE); 545 bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap); 546 547 blk = g_new(BlkMigBlock, 1); 548 blk->buf = g_malloc(BLK_MIG_BLOCK_SIZE); 549 blk->bmds = bmds; 550 blk->sector = sector; 551 blk->nr_sectors = nr_sectors; 552 553 if (is_async) { 554 qemu_iovec_init_buf(&blk->qiov, blk->buf, 555 nr_sectors * BDRV_SECTOR_SIZE); 556 557 blk->aiocb = blk_aio_preadv(bmds->blk, 558 sector * BDRV_SECTOR_SIZE, 559 &blk->qiov, 0, blk_mig_read_cb, 560 blk); 561 562 blk_mig_lock(); 563 block_mig_state.submitted++; 564 bmds_set_aio_inflight(bmds, sector, nr_sectors, 1); 565 blk_mig_unlock(); 566 } else { 567 ret = blk_pread(bmds->blk, sector * BDRV_SECTOR_SIZE, 568 nr_sectors * BDRV_SECTOR_SIZE, blk->buf, 0); 569 if (ret < 0) { 570 goto error; 571 } 572 blk_send(f, blk); 573 574 g_free(blk->buf); 575 g_free(blk); 576 } 577 578 sector += nr_sectors; 579 bmds->cur_dirty = sector; 580 break; 581 } 582 583 bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap); 584 sector += BDRV_SECTORS_PER_DIRTY_CHUNK; 585 bmds->cur_dirty = sector; 586 } 587 588 return (bmds->cur_dirty >= bmds->total_sectors); 589 590 error: 591 trace_migration_block_save_device_dirty(sector); 592 g_free(blk->buf); 593 g_free(blk); 594 return ret; 595 } 596 597 /* Called with iothread lock taken. 598 * 599 * return value: 600 * 0: too much data for max_downtime 601 * 1: few enough data for max_downtime 602 */ 603 static int blk_mig_save_dirty_block(QEMUFile *f, int is_async) 604 { 605 BlkMigDevState *bmds; 606 int ret = 1; 607 608 QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { 609 aio_context_acquire(blk_get_aio_context(bmds->blk)); 610 ret = mig_save_device_dirty(f, bmds, is_async); 611 aio_context_release(blk_get_aio_context(bmds->blk)); 612 if (ret <= 0) { 613 break; 614 } 615 } 616 617 return ret; 618 } 619 620 /* Called with no locks taken. */ 621 622 static int flush_blks(QEMUFile *f) 623 { 624 BlkMigBlock *blk; 625 int ret = 0; 626 627 trace_migration_block_flush_blks("Enter", block_mig_state.submitted, 628 block_mig_state.read_done, 629 block_mig_state.transferred); 630 631 blk_mig_lock(); 632 while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) { 633 if (migration_rate_exceeded(f)) { 634 break; 635 } 636 if (blk->ret < 0) { 637 ret = blk->ret; 638 break; 639 } 640 641 QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry); 642 blk_mig_unlock(); 643 blk_send(f, blk); 644 blk_mig_lock(); 645 646 g_free(blk->buf); 647 g_free(blk); 648 649 block_mig_state.read_done--; 650 block_mig_state.transferred++; 651 assert(block_mig_state.read_done >= 0); 652 } 653 blk_mig_unlock(); 654 655 trace_migration_block_flush_blks("Exit", block_mig_state.submitted, 656 block_mig_state.read_done, 657 block_mig_state.transferred); 658 return ret; 659 } 660 661 /* Called with iothread lock taken. */ 662 663 static int64_t get_remaining_dirty(void) 664 { 665 BlkMigDevState *bmds; 666 int64_t dirty = 0; 667 668 QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { 669 aio_context_acquire(blk_get_aio_context(bmds->blk)); 670 dirty += bdrv_get_dirty_count(bmds->dirty_bitmap); 671 aio_context_release(blk_get_aio_context(bmds->blk)); 672 } 673 674 return dirty; 675 } 676 677 678 679 /* Called with iothread lock taken. */ 680 static void block_migration_cleanup_bmds(void) 681 { 682 BlkMigDevState *bmds; 683 BlockDriverState *bs; 684 AioContext *ctx; 685 686 unset_dirty_tracking(); 687 688 while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) { 689 QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry); 690 691 bs = blk_bs(bmds->blk); 692 if (bs) { 693 bdrv_op_unblock_all(bs, bmds->blocker); 694 } 695 error_free(bmds->blocker); 696 697 /* Save ctx, because bmds->blk can disappear during blk_unref. */ 698 ctx = blk_get_aio_context(bmds->blk); 699 aio_context_acquire(ctx); 700 blk_unref(bmds->blk); 701 aio_context_release(ctx); 702 703 g_free(bmds->blk_name); 704 g_free(bmds->aio_bitmap); 705 g_free(bmds); 706 } 707 } 708 709 /* Called with iothread lock taken. */ 710 static void block_migration_cleanup(void *opaque) 711 { 712 BlkMigBlock *blk; 713 714 bdrv_drain_all(); 715 716 block_migration_cleanup_bmds(); 717 718 blk_mig_lock(); 719 while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) { 720 QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry); 721 g_free(blk->buf); 722 g_free(blk); 723 } 724 blk_mig_unlock(); 725 } 726 727 static int block_save_setup(QEMUFile *f, void *opaque) 728 { 729 int ret; 730 731 trace_migration_block_save("setup", block_mig_state.submitted, 732 block_mig_state.transferred); 733 734 warn_report("block migration is deprecated;" 735 " use blockdev-mirror with NBD instead"); 736 737 ret = init_blk_migration(f); 738 if (ret < 0) { 739 return ret; 740 } 741 742 /* start track dirty blocks */ 743 ret = set_dirty_tracking(); 744 if (ret) { 745 return ret; 746 } 747 748 ret = flush_blks(f); 749 blk_mig_reset_dirty_cursor(); 750 qemu_put_be64(f, BLK_MIG_FLAG_EOS); 751 752 return ret; 753 } 754 755 static int block_save_iterate(QEMUFile *f, void *opaque) 756 { 757 int ret; 758 uint64_t last_bytes = qemu_file_transferred(f); 759 760 trace_migration_block_save("iterate", block_mig_state.submitted, 761 block_mig_state.transferred); 762 763 ret = flush_blks(f); 764 if (ret) { 765 return ret; 766 } 767 768 blk_mig_reset_dirty_cursor(); 769 770 /* control the rate of transfer */ 771 blk_mig_lock(); 772 while (block_mig_state.read_done * BLK_MIG_BLOCK_SIZE < 773 migration_rate_get() && 774 block_mig_state.submitted < MAX_PARALLEL_IO && 775 (block_mig_state.submitted + block_mig_state.read_done) < 776 MAX_IO_BUFFERS) { 777 blk_mig_unlock(); 778 if (block_mig_state.bulk_completed == 0) { 779 /* first finish the bulk phase */ 780 if (blk_mig_save_bulked_block(f) == 0) { 781 /* finished saving bulk on all devices */ 782 block_mig_state.bulk_completed = 1; 783 } 784 ret = 0; 785 } else { 786 /* Always called with iothread lock taken for 787 * simplicity, block_save_complete also calls it. 788 */ 789 qemu_mutex_lock_iothread(); 790 ret = blk_mig_save_dirty_block(f, 1); 791 qemu_mutex_unlock_iothread(); 792 } 793 if (ret < 0) { 794 return ret; 795 } 796 blk_mig_lock(); 797 if (ret != 0) { 798 /* no more dirty blocks */ 799 break; 800 } 801 } 802 blk_mig_unlock(); 803 804 ret = flush_blks(f); 805 if (ret) { 806 return ret; 807 } 808 809 qemu_put_be64(f, BLK_MIG_FLAG_EOS); 810 uint64_t delta_bytes = qemu_file_transferred(f) - last_bytes; 811 return (delta_bytes > 0); 812 } 813 814 /* Called with iothread lock taken. */ 815 816 static int block_save_complete(QEMUFile *f, void *opaque) 817 { 818 int ret; 819 820 trace_migration_block_save("complete", block_mig_state.submitted, 821 block_mig_state.transferred); 822 823 ret = flush_blks(f); 824 if (ret) { 825 return ret; 826 } 827 828 blk_mig_reset_dirty_cursor(); 829 830 /* we know for sure that save bulk is completed and 831 all async read completed */ 832 blk_mig_lock(); 833 assert(block_mig_state.submitted == 0); 834 blk_mig_unlock(); 835 836 do { 837 ret = blk_mig_save_dirty_block(f, 0); 838 if (ret < 0) { 839 return ret; 840 } 841 } while (ret == 0); 842 843 /* report completion */ 844 qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS); 845 846 trace_migration_block_save_complete(); 847 848 qemu_put_be64(f, BLK_MIG_FLAG_EOS); 849 850 /* Make sure that our BlockBackends are gone, so that the block driver 851 * nodes can be inactivated. */ 852 block_migration_cleanup_bmds(); 853 854 return 0; 855 } 856 857 static void block_state_pending(void *opaque, uint64_t *must_precopy, 858 uint64_t *can_postcopy) 859 { 860 /* Estimate pending number of bytes to send */ 861 uint64_t pending; 862 863 qemu_mutex_lock_iothread(); 864 pending = get_remaining_dirty(); 865 qemu_mutex_unlock_iothread(); 866 867 blk_mig_lock(); 868 pending += block_mig_state.submitted * BLK_MIG_BLOCK_SIZE + 869 block_mig_state.read_done * BLK_MIG_BLOCK_SIZE; 870 blk_mig_unlock(); 871 872 /* Report at least one block pending during bulk phase */ 873 if (!pending && !block_mig_state.bulk_completed) { 874 pending = BLK_MIG_BLOCK_SIZE; 875 } 876 877 trace_migration_block_state_pending(pending); 878 /* We don't do postcopy */ 879 *must_precopy += pending; 880 } 881 882 static int block_load(QEMUFile *f, void *opaque, int version_id) 883 { 884 static int banner_printed; 885 int len, flags; 886 char device_name[256]; 887 int64_t addr; 888 BlockBackend *blk, *blk_prev = NULL; 889 Error *local_err = NULL; 890 uint8_t *buf; 891 int64_t total_sectors = 0; 892 int nr_sectors; 893 int ret; 894 BlockDriverInfo bdi; 895 int cluster_size = BLK_MIG_BLOCK_SIZE; 896 897 do { 898 addr = qemu_get_be64(f); 899 900 flags = addr & (BDRV_SECTOR_SIZE - 1); 901 addr >>= BDRV_SECTOR_BITS; 902 903 if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) { 904 /* get device name */ 905 len = qemu_get_byte(f); 906 qemu_get_buffer(f, (uint8_t *)device_name, len); 907 device_name[len] = '\0'; 908 909 blk = blk_by_name(device_name); 910 if (!blk) { 911 fprintf(stderr, "Error unknown block device %s\n", 912 device_name); 913 return -EINVAL; 914 } 915 916 if (blk != blk_prev) { 917 blk_prev = blk; 918 total_sectors = blk_nb_sectors(blk); 919 if (total_sectors <= 0) { 920 error_report("Error getting length of block device %s", 921 device_name); 922 return -EINVAL; 923 } 924 925 blk_activate(blk, &local_err); 926 if (local_err) { 927 error_report_err(local_err); 928 return -EINVAL; 929 } 930 931 ret = bdrv_get_info(blk_bs(blk), &bdi); 932 if (ret == 0 && bdi.cluster_size > 0 && 933 bdi.cluster_size <= BLK_MIG_BLOCK_SIZE && 934 BLK_MIG_BLOCK_SIZE % bdi.cluster_size == 0) { 935 cluster_size = bdi.cluster_size; 936 } else { 937 cluster_size = BLK_MIG_BLOCK_SIZE; 938 } 939 } 940 941 if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) { 942 nr_sectors = total_sectors - addr; 943 } else { 944 nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK; 945 } 946 947 if (flags & BLK_MIG_FLAG_ZERO_BLOCK) { 948 ret = blk_pwrite_zeroes(blk, addr * BDRV_SECTOR_SIZE, 949 nr_sectors * BDRV_SECTOR_SIZE, 950 BDRV_REQ_MAY_UNMAP); 951 } else { 952 int i; 953 int64_t cur_addr; 954 uint8_t *cur_buf; 955 956 buf = g_malloc(BLK_MIG_BLOCK_SIZE); 957 qemu_get_buffer(f, buf, BLK_MIG_BLOCK_SIZE); 958 for (i = 0; i < BLK_MIG_BLOCK_SIZE / cluster_size; i++) { 959 cur_addr = addr * BDRV_SECTOR_SIZE + i * cluster_size; 960 cur_buf = buf + i * cluster_size; 961 962 if ((!block_mig_state.zero_blocks || 963 cluster_size < BLK_MIG_BLOCK_SIZE) && 964 buffer_is_zero(cur_buf, cluster_size)) { 965 ret = blk_pwrite_zeroes(blk, cur_addr, 966 cluster_size, 967 BDRV_REQ_MAY_UNMAP); 968 } else { 969 ret = blk_pwrite(blk, cur_addr, cluster_size, cur_buf, 970 0); 971 } 972 if (ret < 0) { 973 break; 974 } 975 } 976 g_free(buf); 977 } 978 979 if (ret < 0) { 980 return ret; 981 } 982 } else if (flags & BLK_MIG_FLAG_PROGRESS) { 983 if (!banner_printed) { 984 printf("Receiving block device images\n"); 985 banner_printed = 1; 986 } 987 printf("Completed %d %%%c", (int)addr, 988 (addr == 100) ? '\n' : '\r'); 989 fflush(stdout); 990 } else if (!(flags & BLK_MIG_FLAG_EOS)) { 991 fprintf(stderr, "Unknown block migration flags: 0x%x\n", flags); 992 return -EINVAL; 993 } 994 ret = qemu_file_get_error(f); 995 if (ret != 0) { 996 return ret; 997 } 998 } while (!(flags & BLK_MIG_FLAG_EOS)); 999 1000 return 0; 1001 } 1002 1003 static bool block_is_active(void *opaque) 1004 { 1005 return migrate_block(); 1006 } 1007 1008 static SaveVMHandlers savevm_block_handlers = { 1009 .save_setup = block_save_setup, 1010 .save_live_iterate = block_save_iterate, 1011 .save_live_complete_precopy = block_save_complete, 1012 .state_pending_exact = block_state_pending, 1013 .state_pending_estimate = block_state_pending, 1014 .load_state = block_load, 1015 .save_cleanup = block_migration_cleanup, 1016 .is_active = block_is_active, 1017 }; 1018 1019 void blk_mig_init(void) 1020 { 1021 QSIMPLEQ_INIT(&block_mig_state.bmds_list); 1022 QSIMPLEQ_INIT(&block_mig_state.blk_list); 1023 qemu_mutex_init(&block_mig_state.lock); 1024 1025 register_savevm_live("block", 0, 1, &savevm_block_handlers, 1026 &block_mig_state); 1027 } 1028