/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/cutils.h"
#include "qemu/queue.h"
#include "block.h"
#include "block/dirty-bitmap.h"
#include "migration/misc.h"
#include "migration.h"
#include "migration-stats.h"
#include "migration/register.h"
#include "qemu-file.h"
#include "migration/vmstate.h"
#include "sysemu/block-backend.h"
#include "trace.h"
#include "options.h"

#define BLK_MIG_BLOCK_SIZE           (1ULL << 20)
#define BDRV_SECTORS_PER_DIRTY_CHUNK (BLK_MIG_BLOCK_SIZE >> BDRV_SECTOR_BITS)

#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02
#define BLK_MIG_FLAG_PROGRESS           0x04
#define BLK_MIG_FLAG_ZERO_BLOCK         0x08

#define MAX_IS_ALLOCATED_SEARCH (65536 * BDRV_SECTOR_SIZE)

#define MAX_IO_BUFFERS 512
#define MAX_PARALLEL_IO 16

typedef struct BlkMigDevState {
    /* Written during setup phase.  Can be read without a lock.  */
    BlockBackend *blk;
    char *blk_name;
    int shared_base;
    int64_t total_sectors;
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;
    Error *blocker;

    /* Only used by migration thread.  Does not need a lock.  */
    int bulk_completed;
    int64_t cur_sector;
    int64_t cur_dirty;

    /* Data in the aio_bitmap is protected by block migration lock.
     * Allocation and free happen during setup and cleanup respectively.
     */
    unsigned long *aio_bitmap;

    /* Protected by block migration lock.  */
    int64_t completed_sectors;

    /* During migration this is protected by bdrv_dirty_bitmap_lock().
     * Allocation and free happen during setup and cleanup respectively.
     */
    BdrvDirtyBitmap *dirty_bitmap;
} BlkMigDevState;

typedef struct BlkMigBlock {
    /* Only used by migration thread.  */
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    int nr_sectors;
    QEMUIOVector qiov;
    BlockAIOCB *aiocb;

    /* Protected by block migration lock.  */
    int ret;
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
} BlkMigBlock;

typedef struct BlkMigState {
    QSIMPLEQ_HEAD(, BlkMigDevState) bmds_list;
    int64_t total_sector_sum;
    bool zero_blocks;

    /* Protected by lock.  */
    QSIMPLEQ_HEAD(, BlkMigBlock) blk_list;
    int submitted;
    int read_done;

    /* Only used by migration thread.  Does not need a lock.  */
    int transferred;
    int prev_progress;
    int bulk_completed;

    /* Lock must be taken _inside_ the BQL.  */
    QemuMutex lock;
} BlkMigState;

static BlkMigState block_mig_state;

static void blk_mig_lock(void)
{
    qemu_mutex_lock(&block_mig_state.lock);
}

static void blk_mig_unlock(void)
{
    qemu_mutex_unlock(&block_mig_state.lock);
}
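/*
 * Informal sketch of the stream chunk layout, as implemented by blk_send()
 * below and parsed by block_load(); this is derived from the code in this
 * file rather than a separate specification, so treat it as illustrative:
 *
 *   be64  (sector << BDRV_SECTOR_BITS) | flags
 *   u8    device name length                      } only when
 *   n     device name bytes (not NUL-terminated)  } BLK_MIG_FLAG_DEVICE_BLOCK
 *   data  BLK_MIG_BLOCK_SIZE bytes of payload, omitted when
 *         BLK_MIG_FLAG_ZERO_BLOCK is set
 *
 * BLK_MIG_FLAG_PROGRESS chunks reuse the sector field to carry a percentage,
 * and BLK_MIG_FLAG_EOS terminates each section.
 */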
/* Must run outside of the BQL during the bulk phase,
 * or the VM will stall.
 */

static void blk_send(QEMUFile *f, BlkMigBlock * blk)
{
    int len;
    uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK;

    if (block_mig_state.zero_blocks &&
        buffer_is_zero(blk->buf, BLK_MIG_BLOCK_SIZE)) {
        flags |= BLK_MIG_FLAG_ZERO_BLOCK;
    }

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | flags);

    /* device name */
    len = strlen(blk->bmds->blk_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *) blk->bmds->blk_name, len);

    /* If a block is zero we need to flush here since the network
     * bandwidth is now a lot higher than the storage device bandwidth.
     * Thus if we queue zero blocks we slow down the migration. */
    if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
        qemu_fflush(f);
        return;
    }

    qemu_put_buffer(f, blk->buf, BLK_MIG_BLOCK_SIZE);
}

int blk_mig_active(void)
{
    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
}

int blk_mig_bulk_active(void)
{
    return blk_mig_active() && !block_mig_state.bulk_completed;
}

uint64_t blk_mig_bytes_transferred(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    blk_mig_lock();
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->completed_sectors;
    }
    blk_mig_unlock();
    return sum << BDRV_SECTOR_BITS;
}

uint64_t blk_mig_bytes_remaining(void)
{
    return blk_mig_bytes_total() - blk_mig_bytes_transferred();
}

uint64_t blk_mig_bytes_total(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->total_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}


/* Called with migration lock held.  */

static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (sector < bmds->total_sectors) {
        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}

/* Called with migration lock held.  */

static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
                                  int nb_sectors, int set)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bmds->aio_bitmap[idx];
        if (set) {
            val |= 1UL << bit;
        } else {
            val &= ~(1UL << bit);
        }
        bmds->aio_bitmap[idx] = val;
    }
}

static void alloc_aio_bitmap(BlkMigDevState *bmds)
{
    int64_t bitmap_size;

    bitmap_size = bmds->total_sectors + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

    bmds->aio_bitmap = g_malloc0(bitmap_size);
}
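/*
 * Note on the aio_bitmap helpers above: each bit tracks one dirty chunk of
 * BDRV_SECTORS_PER_DIRTY_CHUNK sectors (1 MiB) with a read in flight, packed
 * into unsigned longs.  A hypothetical worked example (not from the original
 * source), assuming 512-byte sectors and 64-bit longs:
 *
 *   sector 6144 -> chunk = 6144 / 2048 = 3
 *                -> word = 3 / 64 = 0, bit = 3 % 64 = 3
 *
 * so bmds_set_aio_inflight(bmds, 6144, 2048, 1) sets bit 3 of word 0, and
 * bmds_aio_inflight(bmds, 6144) reports it until the read callback clears it.
 */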
/* Never hold migration lock when yielding to the main loop!  */

static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk_mig_lock();
    blk->ret = ret;

    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
    blk_mig_unlock();
}

/* Called with no lock taken.  */

static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockBackend *bb = bmds->blk;
    BlkMigBlock *blk;
    int nr_sectors;
    int64_t count;

    if (bmds->shared_base) {
        bql_lock();
        /* Skip unallocated sectors; intentionally treats failure or
         * partial sector as an allocated sector */
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(blk_bs(bb), cur_sector * BDRV_SECTOR_SIZE,
                                  MAX_IS_ALLOCATED_SEARCH, &count)) {
            if (count < BDRV_SECTOR_SIZE) {
                break;
            }
            cur_sector += count >> BDRV_SECTOR_BITS;
        }
        bql_unlock();
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = g_new(BlkMigBlock, 1);
    blk->buf = g_malloc(BLK_MIG_BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;
    blk->nr_sectors = nr_sectors;

    qemu_iovec_init_buf(&blk->qiov, blk->buf, nr_sectors * BDRV_SECTOR_SIZE);

    blk_mig_lock();
    block_mig_state.submitted++;
    blk_mig_unlock();

    /*
     * The migration thread does not have an AioContext.  Lock the BQL so that
     * I/O runs in the main loop AioContext (see
     * qemu_get_current_aio_context()).
     */
    bql_lock();
    bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, cur_sector * BDRV_SECTOR_SIZE,
                            nr_sectors * BDRV_SECTOR_SIZE);
    blk->aiocb = blk_aio_preadv(bb, cur_sector * BDRV_SECTOR_SIZE, &blk->qiov,
                                0, blk_mig_read_cb, blk);
    bql_unlock();

    bmds->cur_sector = cur_sector + nr_sectors;
    return (bmds->cur_sector >= total_sectors);
}

/* Called with the BQL taken.  */

static int set_dirty_tracking(void)
{
    BlkMigDevState *bmds;
    int ret;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->dirty_bitmap = bdrv_create_dirty_bitmap(blk_bs(bmds->blk),
                                                      BLK_MIG_BLOCK_SIZE,
                                                      NULL, NULL);
        if (!bmds->dirty_bitmap) {
            ret = -errno;
            goto fail;
        }
    }
    return 0;

fail:
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->dirty_bitmap) {
            bdrv_release_dirty_bitmap(bmds->dirty_bitmap);
        }
    }
    return ret;
}
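/*
 * Dirty-tracking note (a summary of the code above, not additional
 * behaviour): the bitmap created by set_dirty_tracking() uses
 * BLK_MIG_BLOCK_SIZE (1 MiB) granularity, so any guest write dirties a whole
 * 1 MiB chunk.  mig_save_device_bulk() clears the bitmap for a chunk right
 * before submitting its read, which means writes that land while the read is
 * in flight re-dirty the chunk and it gets sent again in the dirty phase.
 */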
/* Called with the BQL taken.  */

static void unset_dirty_tracking(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->dirty_bitmap) {
            bdrv_release_dirty_bitmap(bmds->dirty_bitmap);
        }
    }
}

static int init_blk_migration(QEMUFile *f, Error **errp)
{
    BlockDriverState *bs;
    BlkMigDevState *bmds;
    int64_t sectors;
    BdrvNextIterator it;
    int i, num_bs = 0;
    struct {
        BlkMigDevState *bmds;
        BlockDriverState *bs;
    } *bmds_bs;
    int ret;

    GRAPH_RDLOCK_GUARD_MAINLOOP();

    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.prev_progress = -1;
    block_mig_state.bulk_completed = 0;
    block_mig_state.zero_blocks = migrate_zero_blocks();

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        num_bs++;
    }
    bmds_bs = g_malloc0(num_bs * sizeof(*bmds_bs));

    for (i = 0, bs = bdrv_first(&it); bs; bs = bdrv_next(&it), i++) {
        if (bdrv_is_read_only(bs)) {
            continue;
        }

        sectors = bdrv_nb_sectors(bs);
        if (sectors == 0) {
            continue;
        }
        if (sectors < 0) {
            error_setg(errp, "Error getting length of block device %s",
                       bdrv_get_device_name(bs));
            ret = sectors;
            bdrv_next_cleanup(&it);
            goto out;
        }

        bmds = g_new0(BlkMigDevState, 1);
        bmds->blk = blk_new(qemu_get_aio_context(),
                            BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
        bmds->blk_name = g_strdup(bdrv_get_device_name(bs));
        bmds->bulk_completed = 0;
        bmds->total_sectors = sectors;
        bmds->completed_sectors = 0;
        bmds->shared_base = migrate_block_incremental();

        assert(i < num_bs);
        bmds_bs[i].bmds = bmds;
        bmds_bs[i].bs = bs;

        block_mig_state.total_sector_sum += sectors;

        if (bmds->shared_base) {
            trace_migration_block_init_shared(bdrv_get_device_name(bs));
        } else {
            trace_migration_block_init_full(bdrv_get_device_name(bs));
        }

        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
    }

    /* Can only insert new BDSes now because doing so while iterating block
     * devices may end up in a deadlock (iterating the new BDSes, too). */
    for (i = 0; i < num_bs; i++) {
        bmds = bmds_bs[i].bmds;
        bs = bmds_bs[i].bs;

        if (bmds) {
            ret = blk_insert_bs(bmds->blk, bs, errp);
            if (ret < 0) {
                goto out;
            }

            alloc_aio_bitmap(bmds);
            error_setg(&bmds->blocker, "block device is in use by migration");
            bdrv_op_block_all(bs, bmds->blocker);
        }
    }

    ret = 0;
out:
    g_free(bmds_bs);
    return ret;
}
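/*
 * Rough shape of the save path implemented by the handlers that follow,
 * summarised from this file (illustrative only):
 *
 *   block_save_setup()    -> init_blk_migration() + set_dirty_tracking()
 *   block_save_iterate()  -> bulk phase via blk_mig_save_bulked_block(),
 *                            then async dirty chunks, rate limited
 *   block_save_complete() -> drain remaining dirty chunks synchronously,
 *                            then send BLK_MIG_FLAG_EOS
 */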
/* Called with no lock taken.  */

static int blk_mig_save_bulked_block(QEMUFile *f)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int progress;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(f, bmds) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        } else {
            completed_sector_sum += bmds->completed_sectors;
        }
    }

    if (block_mig_state.total_sector_sum != 0) {
        progress = completed_sector_sum * 100 /
                   block_mig_state.total_sector_sum;
    } else {
        progress = 100;
    }
    if (progress != block_mig_state.prev_progress) {
        block_mig_state.prev_progress = progress;
        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
                         | BLK_MIG_FLAG_PROGRESS);
        trace_migration_block_progression(progress);
    }

    return ret;
}

static void blk_mig_reset_dirty_cursor(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->cur_dirty = 0;
    }
}

/* Called with the BQL taken.  */

static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
                                 int is_async)
{
    BlkMigBlock *blk;
    int64_t total_sectors = bmds->total_sectors;
    int64_t sector;
    int nr_sectors;
    int ret = -EIO;

    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
        blk_mig_lock();
        if (bmds_aio_inflight(bmds, sector)) {
            blk_mig_unlock();
            blk_drain(bmds->blk);
        } else {
            blk_mig_unlock();
        }
        bdrv_dirty_bitmap_lock(bmds->dirty_bitmap);
        if (bdrv_dirty_bitmap_get_locked(bmds->dirty_bitmap,
                                         sector * BDRV_SECTOR_SIZE)) {
            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - sector;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }
            bdrv_reset_dirty_bitmap_locked(bmds->dirty_bitmap,
                                           sector * BDRV_SECTOR_SIZE,
                                           nr_sectors * BDRV_SECTOR_SIZE);
            bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap);

            blk = g_new(BlkMigBlock, 1);
            blk->buf = g_malloc(BLK_MIG_BLOCK_SIZE);
            blk->bmds = bmds;
            blk->sector = sector;
            blk->nr_sectors = nr_sectors;

            if (is_async) {
                qemu_iovec_init_buf(&blk->qiov, blk->buf,
                                    nr_sectors * BDRV_SECTOR_SIZE);

                blk->aiocb = blk_aio_preadv(bmds->blk,
                                            sector * BDRV_SECTOR_SIZE,
                                            &blk->qiov, 0, blk_mig_read_cb,
                                            blk);

                blk_mig_lock();
                block_mig_state.submitted++;
                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
                blk_mig_unlock();
            } else {
                ret = blk_pread(bmds->blk, sector * BDRV_SECTOR_SIZE,
                                nr_sectors * BDRV_SECTOR_SIZE, blk->buf, 0);
                if (ret < 0) {
                    goto error;
                }
                blk_send(f, blk);

                g_free(blk->buf);
                g_free(blk);
            }

            sector += nr_sectors;
            bmds->cur_dirty = sector;
            break;
        }

        bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap);
        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        bmds->cur_dirty = sector;
    }

    return (bmds->cur_dirty >= bmds->total_sectors);

error:
    trace_migration_block_save_device_dirty(sector);
    g_free(blk->buf);
    g_free(blk);
    return ret;
}
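/*
 * Note on mig_save_device_dirty() above (summary, not new behaviour): during
 * iteration it is called with is_async=1, so dirty chunks are read with
 * blk_aio_preadv() and handed to flush_blks() once blk_mig_read_cb() fires;
 * at completion time it runs with is_async=0 and falls back to a synchronous
 * blk_pread() + blk_send() so no I/O is left in flight when the section ends.
 */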
/* Called with the BQL taken.
 *
 * return value:
 * 0: too much data for max_downtime
 * 1: little enough data for max_downtime
 */
static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;
    int ret = 1;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        ret = mig_save_device_dirty(f, bmds, is_async);
        if (ret <= 0) {
            break;
        }
    }

    return ret;
}

/* Called with no locks taken.  */

static int flush_blks(QEMUFile *f)
{
    BlkMigBlock *blk;
    int ret = 0;

    trace_migration_block_flush_blks("Enter", block_mig_state.submitted,
                                     block_mig_state.read_done,
                                     block_mig_state.transferred);

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        if (migration_rate_exceeded(f)) {
            break;
        }
        if (blk->ret < 0) {
            ret = blk->ret;
            break;
        }

        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        blk_mig_unlock();
        blk_send(f, blk);
        blk_mig_lock();

        g_free(blk->buf);
        g_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }
    blk_mig_unlock();

    trace_migration_block_flush_blks("Exit", block_mig_state.submitted,
                                     block_mig_state.read_done,
                                     block_mig_state.transferred);
    return ret;
}

/* Called with the BQL taken.  */

static int64_t get_remaining_dirty(void)
{
    BlkMigDevState *bmds;
    int64_t dirty = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bdrv_dirty_bitmap_lock(bmds->dirty_bitmap);
        dirty += bdrv_get_dirty_count(bmds->dirty_bitmap);
        bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap);
    }

    return dirty;
}



/* Called with the BQL taken.  */
static void block_migration_cleanup_bmds(void)
{
    BlkMigDevState *bmds;
    BlockDriverState *bs;

    unset_dirty_tracking();

    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);

        bs = blk_bs(bmds->blk);
        if (bs) {
            bdrv_op_unblock_all(bs, bmds->blocker);
        }
        error_free(bmds->blocker);
        blk_unref(bmds->blk);
        g_free(bmds->blk_name);
        g_free(bmds->aio_bitmap);
        g_free(bmds);
    }
}
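/*
 * Cleanup note (summary of the functions around this comment):
 * block_save_complete() calls block_migration_cleanup_bmds() directly so the
 * BlockBackends are dropped before the block layer inactivates its nodes,
 * while block_migration_cleanup() below is the .save_cleanup hook and
 * additionally frees any read buffers still sitting on blk_list.
 */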
/* Called with the BQL taken.  */
static void block_migration_cleanup(void *opaque)
{
    BlkMigBlock *blk;

    bdrv_drain_all();

    block_migration_cleanup_bmds();

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        g_free(blk->buf);
        g_free(blk);
    }
    blk_mig_unlock();
}

static int block_save_setup(QEMUFile *f, void *opaque, Error **errp)
{
    int ret;

    trace_migration_block_save("setup", block_mig_state.submitted,
                               block_mig_state.transferred);

    warn_report("block migration is deprecated;"
                " use blockdev-mirror with NBD instead");

    ret = init_blk_migration(f, errp);
    if (ret < 0) {
        return ret;
    }

    /* start tracking dirty blocks */
    ret = set_dirty_tracking();
    if (ret) {
        error_setg_errno(errp, -ret, "Failed to start block dirty tracking");
        return ret;
    }

    ret = flush_blks(f);
    if (ret) {
        error_setg_errno(errp, -ret, "Flushing block failed");
        return ret;
    }
    blk_mig_reset_dirty_cursor();
    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ret;
}

static int block_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    uint64_t last_bytes = qemu_file_transferred(f);

    trace_migration_block_save("iterate", block_mig_state.submitted,
                               block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* control the rate of transfer */
    blk_mig_lock();
    while (block_mig_state.read_done * BLK_MIG_BLOCK_SIZE <
           migration_rate_get() &&
           block_mig_state.submitted < MAX_PARALLEL_IO &&
           (block_mig_state.submitted + block_mig_state.read_done) <
           MAX_IO_BUFFERS) {
        blk_mig_unlock();
        if (block_mig_state.bulk_completed == 0) {
            /* first finish the bulk phase */
            if (blk_mig_save_bulked_block(f) == 0) {
                /* finished saving bulk on all devices */
                block_mig_state.bulk_completed = 1;
            }
            ret = 0;
        } else {
            /* Always called with the BQL taken for simplicity;
             * block_save_complete also calls it that way.
             */
            bql_lock();
            ret = blk_mig_save_dirty_block(f, 1);
            bql_unlock();
        }
        if (ret < 0) {
            return ret;
        }
        blk_mig_lock();
        if (ret != 0) {
            /* no more dirty blocks */
            break;
        }
    }
    blk_mig_unlock();

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);
    uint64_t delta_bytes = qemu_file_transferred(f) - last_bytes;
    return (delta_bytes > 0);
}
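/*
 * Back-pressure note (summary of the loop in block_save_iterate() above): new
 * reads are only submitted while the data already read but not yet sent stays
 * under the migration bandwidth limit, at most MAX_PARALLEL_IO requests are
 * in flight, and submitted + read_done buffers stay below MAX_IO_BUFFERS, so
 * the iteration never holds more than 512 one-megabyte buffers at once.
 */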
/* Called with the BQL taken.  */

static int block_save_complete(QEMUFile *f, void *opaque)
{
    int ret;

    trace_migration_block_save("complete", block_mig_state.submitted,
                               block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* we know for sure that the bulk save is completed and
       all async reads have completed */
    blk_mig_lock();
    assert(block_mig_state.submitted == 0);
    blk_mig_unlock();

    do {
        ret = blk_mig_save_dirty_block(f, 0);
        if (ret < 0) {
            return ret;
        }
    } while (ret == 0);

    /* report completion */
    qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

    trace_migration_block_save_complete();

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    /* Make sure that our BlockBackends are gone, so that the block driver
     * nodes can be inactivated.  */
    block_migration_cleanup_bmds();

    return 0;
}

static void block_state_pending(void *opaque, uint64_t *must_precopy,
                                uint64_t *can_postcopy)
{
    /* Estimate pending number of bytes to send */
    uint64_t pending;

    bql_lock();
    pending = get_remaining_dirty();
    bql_unlock();

    blk_mig_lock();
    pending += block_mig_state.submitted * BLK_MIG_BLOCK_SIZE +
               block_mig_state.read_done * BLK_MIG_BLOCK_SIZE;
    blk_mig_unlock();

    /* Report at least one block pending during bulk phase */
    if (!pending && !block_mig_state.bulk_completed) {
        pending = BLK_MIG_BLOCK_SIZE;
    }

    trace_migration_block_state_pending(pending);
    /* We don't do postcopy */
    *must_precopy += pending;
}
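/*
 * Load-path note (summary of block_load() below): for each device chunk the
 * destination looks up the BlockBackend by name, activates it, and queries
 * bdrv_get_info().  If the image reports a cluster size that evenly divides
 * BLK_MIG_BLOCK_SIZE, the received 1 MiB buffer is written cluster by cluster
 * and all-zero clusters go through blk_pwrite_zeroes(BDRV_REQ_MAY_UNMAP), so
 * sparseness is preserved even when the source did not send zero-block flags.
 */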
static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    static int banner_printed;
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockBackend *blk, *blk_prev = NULL;
    Error *local_err = NULL;
    uint8_t *buf;
    int64_t total_sectors = 0;
    int nr_sectors;
    int ret;
    BlockDriverInfo bdi;
    int cluster_size = BLK_MIG_BLOCK_SIZE;

    do {
        addr = qemu_get_be64(f);

        flags = addr & (BDRV_SECTOR_SIZE - 1);
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            blk = blk_by_name(device_name);
            if (!blk) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }

            if (blk != blk_prev) {
                blk_prev = blk;
                total_sectors = blk_nb_sectors(blk);
                if (total_sectors <= 0) {
                    error_report("Error getting length of block device %s",
                                 device_name);
                    return -EINVAL;
                }

                blk_activate(blk, &local_err);
                if (local_err) {
                    error_report_err(local_err);
                    return -EINVAL;
                }

                ret = bdrv_get_info(blk_bs(blk), &bdi);
                if (ret == 0 && bdi.cluster_size > 0 &&
                    bdi.cluster_size <= BLK_MIG_BLOCK_SIZE &&
                    BLK_MIG_BLOCK_SIZE % bdi.cluster_size == 0) {
                    cluster_size = bdi.cluster_size;
                } else {
                    cluster_size = BLK_MIG_BLOCK_SIZE;
                }
            }

            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - addr;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }

            if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
                ret = blk_pwrite_zeroes(blk, addr * BDRV_SECTOR_SIZE,
                                        nr_sectors * BDRV_SECTOR_SIZE,
                                        BDRV_REQ_MAY_UNMAP);
            } else {
                int i;
                int64_t cur_addr;
                uint8_t *cur_buf;

                buf = g_malloc(BLK_MIG_BLOCK_SIZE);
                qemu_get_buffer(f, buf, BLK_MIG_BLOCK_SIZE);
                for (i = 0; i < BLK_MIG_BLOCK_SIZE / cluster_size; i++) {
                    cur_addr = addr * BDRV_SECTOR_SIZE + i * cluster_size;
                    cur_buf = buf + i * cluster_size;

                    if ((!block_mig_state.zero_blocks ||
                        cluster_size < BLK_MIG_BLOCK_SIZE) &&
                        buffer_is_zero(cur_buf, cluster_size)) {
                        ret = blk_pwrite_zeroes(blk, cur_addr,
                                                cluster_size,
                                                BDRV_REQ_MAY_UNMAP);
                    } else {
                        ret = blk_pwrite(blk, cur_addr, cluster_size, cur_buf,
                                         0);
                    }
                    if (ret < 0) {
                        break;
                    }
                }
                g_free(buf);
            }

            if (ret < 0) {
                return ret;
            }
        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
            if (!banner_printed) {
                printf("Receiving block device images\n");
                banner_printed = 1;
            }
            printf("Completed %d %%%c", (int)addr,
                   (addr == 100) ? '\n' : '\r');
            fflush(stdout);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown block migration flags: 0x%x\n", flags);
            return -EINVAL;
        }
        ret = qemu_file_get_error(f);
        if (ret != 0) {
            return ret;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

static bool block_is_active(void *opaque)
{
    return migrate_block();
}

static SaveVMHandlers savevm_block_handlers = {
    .save_setup = block_save_setup,
    .save_live_iterate = block_save_iterate,
    .save_live_complete_precopy = block_save_complete,
    .state_pending_exact = block_state_pending,
    .state_pending_estimate = block_state_pending,
    .load_state = block_load,
    .save_cleanup = block_migration_cleanup,
    .is_active = block_is_active,
};

void blk_mig_init(void)
{
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);
    qemu_mutex_init(&block_mig_state.lock);

    register_savevm_live("block", 0, 1, &savevm_block_handlers,
                         &block_mig_state);
}