/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/cutils.h"
#include "qemu/queue.h"
#include "block.h"
#include "block/dirty-bitmap.h"
#include "migration/misc.h"
#include "migration.h"
#include "migration/register.h"
#include "qemu-file.h"
#include "migration/vmstate.h"
#include "sysemu/block-backend.h"
#include "trace.h"

#define BLK_MIG_BLOCK_SIZE           (1ULL << 20)
#define BDRV_SECTORS_PER_DIRTY_CHUNK (BLK_MIG_BLOCK_SIZE >> BDRV_SECTOR_BITS)

#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02
#define BLK_MIG_FLAG_PROGRESS           0x04
#define BLK_MIG_FLAG_ZERO_BLOCK         0x08

#define MAX_IS_ALLOCATED_SEARCH (65536 * BDRV_SECTOR_SIZE)

#define MAX_IO_BUFFERS 512
#define MAX_PARALLEL_IO 16

typedef struct BlkMigDevState {
    /* Written during setup phase.  Can be read without a lock.  */
    BlockBackend *blk;
    char *blk_name;
    int shared_base;
    int64_t total_sectors;
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;
    Error *blocker;

    /* Only used by migration thread.  Does not need a lock.  */
    int bulk_completed;
    int64_t cur_sector;
    int64_t cur_dirty;

    /* Data in the aio_bitmap is protected by block migration lock.
     * Allocation and free happen during setup and cleanup respectively.
     */
    unsigned long *aio_bitmap;

    /* Protected by block migration lock.  */
    int64_t completed_sectors;

    /* During migration this is protected by iothread lock / AioContext.
     * Allocation and free happen during setup and cleanup respectively.
     */
    BdrvDirtyBitmap *dirty_bitmap;
} BlkMigDevState;

typedef struct BlkMigBlock {
    /* Only used by migration thread.  */
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    int nr_sectors;
    QEMUIOVector qiov;
    BlockAIOCB *aiocb;

    /* Protected by block migration lock.  */
    int ret;
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
} BlkMigBlock;

typedef struct BlkMigState {
    QSIMPLEQ_HEAD(, BlkMigDevState) bmds_list;
    int64_t total_sector_sum;
    bool zero_blocks;

    /* Protected by lock.  */
    QSIMPLEQ_HEAD(, BlkMigBlock) blk_list;
    int submitted;
    int read_done;

    /* Only used by migration thread.  Does not need a lock.  */
    int transferred;
    int prev_progress;
    int bulk_completed;

    /* Lock must be taken _inside_ the iothread lock and any AioContexts.  */
    QemuMutex lock;
} BlkMigState;

static BlkMigState block_mig_state;

static void blk_mig_lock(void)
{
    qemu_mutex_lock(&block_mig_state.lock);
}

static void blk_mig_unlock(void)
{
    qemu_mutex_unlock(&block_mig_state.lock);
}
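/* On-the-wire format of one device block, as emitted by blk_send() below
 * (a summary derived from the code, for readability):
 *
 *   8 bytes   (sector << BDRV_SECTOR_BITS) | flags
 *   1 byte    length of the device name
 *   n bytes   device name, not NUL-terminated
 *   1 MiB     block payload (omitted when BLK_MIG_FLAG_ZERO_BLOCK is set)
 */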
/* Must run outside of the iothread lock during the bulk phase,
 * or the VM will stall.
 */

static void blk_send(QEMUFile *f, BlkMigBlock *blk)
{
    int len;
    uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK;

    if (block_mig_state.zero_blocks &&
        buffer_is_zero(blk->buf, BLK_MIG_BLOCK_SIZE)) {
        flags |= BLK_MIG_FLAG_ZERO_BLOCK;
    }

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | flags);

    /* device name */
    len = strlen(blk->bmds->blk_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *) blk->bmds->blk_name, len);

    /* If a block is zero we need to flush here since the network
     * bandwidth is now a lot higher than the storage device bandwidth.
     * Thus, if we queue zero blocks we slow down the migration.  */
    if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
        qemu_fflush(f);
        return;
    }

    qemu_put_buffer(f, blk->buf, BLK_MIG_BLOCK_SIZE);
}

int blk_mig_active(void)
{
    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
}

int blk_mig_bulk_active(void)
{
    return blk_mig_active() && !block_mig_state.bulk_completed;
}

uint64_t blk_mig_bytes_transferred(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    blk_mig_lock();
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->completed_sectors;
    }
    blk_mig_unlock();
    return sum << BDRV_SECTOR_BITS;
}

uint64_t blk_mig_bytes_remaining(void)
{
    return blk_mig_bytes_total() - blk_mig_bytes_transferred();
}

uint64_t blk_mig_bytes_total(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->total_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}


/* Called with migration lock held.  */

static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (sector < blk_nb_sectors(bmds->blk)) {
        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}

/* Called with migration lock held.  */

static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
                                  int nb_sectors, int set)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bmds->aio_bitmap[idx];
        if (set) {
            val |= 1UL << bit;
        } else {
            val &= ~(1UL << bit);
        }
        bmds->aio_bitmap[idx] = val;
    }
}

static void alloc_aio_bitmap(BlkMigDevState *bmds)
{
    BlockBackend *bb = bmds->blk;
    int64_t bitmap_size;

    bitmap_size = blk_nb_sectors(bb) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

    bmds->aio_bitmap = g_malloc0(bitmap_size);
}
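/* Sizing example for the bitmap allocated above: a 4 GiB device has
 * 8388608 sectors, i.e. 4096 chunks of BDRV_SECTORS_PER_DIRTY_CHUNK
 * (2048) sectors each, so at one bit per chunk the bitmap occupies
 * 512 bytes.  */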
/* Never hold migration lock when yielding to the main loop!  */

static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk_mig_lock();
    blk->ret = ret;

    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
    blk_mig_unlock();
}

/* Called with no lock taken.  */

static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockBackend *bb = bmds->blk;
    BlkMigBlock *blk;
    int nr_sectors;
    int64_t count;

    if (bmds->shared_base) {
        qemu_mutex_lock_iothread();
        aio_context_acquire(blk_get_aio_context(bb));
        /* Skip unallocated sectors; intentionally treats failure or
         * partial sector as an allocated sector */
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(blk_bs(bb), cur_sector * BDRV_SECTOR_SIZE,
                                  MAX_IS_ALLOCATED_SEARCH, &count)) {
            if (count < BDRV_SECTOR_SIZE) {
                break;
            }
            cur_sector += count >> BDRV_SECTOR_BITS;
        }
        aio_context_release(blk_get_aio_context(bb));
        qemu_mutex_unlock_iothread();
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = g_new(BlkMigBlock, 1);
    blk->buf = g_malloc(BLK_MIG_BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;
    blk->nr_sectors = nr_sectors;

    qemu_iovec_init_buf(&blk->qiov, blk->buf, nr_sectors * BDRV_SECTOR_SIZE);

    blk_mig_lock();
    block_mig_state.submitted++;
    blk_mig_unlock();

    /* We do not know if bs is under the main thread (and thus does
     * not acquire the AioContext when doing AIO) or rather under
     * dataplane.  Thus acquire both the iothread mutex and the
     * AioContext.
     *
     * This is ugly and will disappear when we make bdrv_* thread-safe,
     * without the need to acquire the AioContext.
     */
    qemu_mutex_lock_iothread();
    aio_context_acquire(blk_get_aio_context(bmds->blk));
    bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, cur_sector * BDRV_SECTOR_SIZE,
                            nr_sectors * BDRV_SECTOR_SIZE);
    blk->aiocb = blk_aio_preadv(bb, cur_sector * BDRV_SECTOR_SIZE, &blk->qiov,
                                0, blk_mig_read_cb, blk);
    aio_context_release(blk_get_aio_context(bmds->blk));
    qemu_mutex_unlock_iothread();

    bmds->cur_sector = cur_sector + nr_sectors;
    return (bmds->cur_sector >= total_sectors);
}
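/* The dirty bitmaps created by set_dirty_tracking() below use
 * BLK_MIG_BLOCK_SIZE granularity, so a single dirty bit corresponds to
 * exactly one migration chunk.  */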
/* Called with iothread lock taken.  */

static int set_dirty_tracking(void)
{
    BlkMigDevState *bmds;
    int ret;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->dirty_bitmap = bdrv_create_dirty_bitmap(blk_bs(bmds->blk),
                                                      BLK_MIG_BLOCK_SIZE,
                                                      NULL, NULL);
        if (!bmds->dirty_bitmap) {
            ret = -errno;
            goto fail;
        }
    }
    return 0;

fail:
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->dirty_bitmap) {
            bdrv_release_dirty_bitmap(bmds->dirty_bitmap);
        }
    }
    return ret;
}

/* Called with iothread lock taken.  */

static void unset_dirty_tracking(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bdrv_release_dirty_bitmap(bmds->dirty_bitmap);
    }
}

static int init_blk_migration(QEMUFile *f)
{
    BlockDriverState *bs;
    BlkMigDevState *bmds;
    int64_t sectors;
    BdrvNextIterator it;
    int i, num_bs = 0;
    struct {
        BlkMigDevState *bmds;
        BlockDriverState *bs;
    } *bmds_bs;
    Error *local_err = NULL;
    int ret;

    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.prev_progress = -1;
    block_mig_state.bulk_completed = 0;
    block_mig_state.zero_blocks = migrate_zero_blocks();

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        num_bs++;
    }
    bmds_bs = g_malloc0(num_bs * sizeof(*bmds_bs));

    for (i = 0, bs = bdrv_first(&it); bs; bs = bdrv_next(&it), i++) {
        if (bdrv_is_read_only(bs)) {
            continue;
        }

        sectors = bdrv_nb_sectors(bs);
        if (sectors <= 0) {
            ret = sectors;
            bdrv_next_cleanup(&it);
            goto out;
        }

        bmds = g_new0(BlkMigDevState, 1);
        bmds->blk = blk_new(qemu_get_aio_context(),
                            BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
        bmds->blk_name = g_strdup(bdrv_get_device_name(bs));
        bmds->bulk_completed = 0;
        bmds->total_sectors = sectors;
        bmds->completed_sectors = 0;
        bmds->shared_base = migrate_use_block_incremental();

        assert(i < num_bs);
        bmds_bs[i].bmds = bmds;
        bmds_bs[i].bs = bs;

        block_mig_state.total_sector_sum += sectors;

        if (bmds->shared_base) {
            trace_migration_block_init_shared(bdrv_get_device_name(bs));
        } else {
            trace_migration_block_init_full(bdrv_get_device_name(bs));
        }

        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
    }

    /* Can only insert new BDSes now because doing so while iterating block
     * devices may end up in a deadlock (iterating the new BDSes, too). */
    for (i = 0; i < num_bs; i++) {
        BlkMigDevState *bmds = bmds_bs[i].bmds;
        BlockDriverState *bs = bmds_bs[i].bs;

        if (bmds) {
            ret = blk_insert_bs(bmds->blk, bs, &local_err);
            if (ret < 0) {
                error_report_err(local_err);
                goto out;
            }

            alloc_aio_bitmap(bmds);
            error_setg(&bmds->blocker, "block device is in use by migration");
            bdrv_op_block_all(bs, bmds->blocker);
        }
    }

    ret = 0;
out:
    g_free(bmds_bs);
    return ret;
}
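/* Progress travels in the stream as its own message,
 * (percentage << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS, emitted by
 * blk_mig_save_bulked_block() below and printed by block_load() on the
 * destination.  */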
/* Called with no lock taken.  */

static int blk_mig_save_bulked_block(QEMUFile *f)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int progress;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(f, bmds) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        } else {
            completed_sector_sum += bmds->completed_sectors;
        }
    }

    if (block_mig_state.total_sector_sum != 0) {
        progress = completed_sector_sum * 100 /
                   block_mig_state.total_sector_sum;
    } else {
        progress = 100;
    }
    if (progress != block_mig_state.prev_progress) {
        block_mig_state.prev_progress = progress;
        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
                         | BLK_MIG_FLAG_PROGRESS);
        trace_migration_block_progression(progress);
    }

    return ret;
}

static void blk_mig_reset_dirty_cursor(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->cur_dirty = 0;
    }
}

/* Called with iothread lock and AioContext taken.  */

static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
                                 int is_async)
{
    BlkMigBlock *blk;
    int64_t total_sectors = bmds->total_sectors;
    int64_t sector;
    int nr_sectors;
    int ret = -EIO;

    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
        blk_mig_lock();
        if (bmds_aio_inflight(bmds, sector)) {
            blk_mig_unlock();
            blk_drain(bmds->blk);
        } else {
            blk_mig_unlock();
        }
        bdrv_dirty_bitmap_lock(bmds->dirty_bitmap);
        if (bdrv_dirty_bitmap_get_locked(bmds->dirty_bitmap,
                                         sector * BDRV_SECTOR_SIZE)) {
            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - sector;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }
            bdrv_reset_dirty_bitmap_locked(bmds->dirty_bitmap,
                                           sector * BDRV_SECTOR_SIZE,
                                           nr_sectors * BDRV_SECTOR_SIZE);
            bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap);

            blk = g_new(BlkMigBlock, 1);
            blk->buf = g_malloc(BLK_MIG_BLOCK_SIZE);
            blk->bmds = bmds;
            blk->sector = sector;
            blk->nr_sectors = nr_sectors;

            if (is_async) {
                qemu_iovec_init_buf(&blk->qiov, blk->buf,
                                    nr_sectors * BDRV_SECTOR_SIZE);

                blk->aiocb = blk_aio_preadv(bmds->blk,
                                            sector * BDRV_SECTOR_SIZE,
                                            &blk->qiov, 0, blk_mig_read_cb,
                                            blk);

                blk_mig_lock();
                block_mig_state.submitted++;
                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
                blk_mig_unlock();
            } else {
                ret = blk_pread(bmds->blk, sector * BDRV_SECTOR_SIZE,
                                nr_sectors * BDRV_SECTOR_SIZE, blk->buf, 0);
                if (ret < 0) {
                    goto error;
                }
                blk_send(f, blk);

                g_free(blk->buf);
                g_free(blk);
            }

            sector += nr_sectors;
            bmds->cur_dirty = sector;
            break;
        }

        bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap);
        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        bmds->cur_dirty = sector;
    }

    return (bmds->cur_dirty >= bmds->total_sectors);

error:
    trace_migration_block_save_device_dirty(sector);
    g_free(blk->buf);
    g_free(blk);
    return ret;
}
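/* Note that mig_save_device_dirty() above transfers at most one dirty
 * chunk per call: the loop breaks as soon as a chunk has been queued
 * (async) or sent (sync), and bmds->cur_dirty remembers where to resume.  */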
/* Called with iothread lock taken.
 *
 * return value:
 * 0: too much data for max_downtime
 * 1: little enough data for max_downtime
 */
static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;
    int ret = 1;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        aio_context_acquire(blk_get_aio_context(bmds->blk));
        ret = mig_save_device_dirty(f, bmds, is_async);
        aio_context_release(blk_get_aio_context(bmds->blk));
        if (ret <= 0) {
            break;
        }
    }

    return ret;
}

/* Called with no locks taken.  */

static int flush_blks(QEMUFile *f)
{
    BlkMigBlock *blk;
    int ret = 0;

    trace_migration_block_flush_blks("Enter", block_mig_state.submitted,
                                     block_mig_state.read_done,
                                     block_mig_state.transferred);

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        if (qemu_file_rate_limit(f)) {
            break;
        }
        if (blk->ret < 0) {
            ret = blk->ret;
            break;
        }

        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        blk_mig_unlock();
        blk_send(f, blk);
        blk_mig_lock();

        g_free(blk->buf);
        g_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }
    blk_mig_unlock();

    trace_migration_block_flush_blks("Exit", block_mig_state.submitted,
                                     block_mig_state.read_done,
                                     block_mig_state.transferred);
    return ret;
}

/* Called with iothread lock taken.  */

static int64_t get_remaining_dirty(void)
{
    BlkMigDevState *bmds;
    int64_t dirty = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        aio_context_acquire(blk_get_aio_context(bmds->blk));
        dirty += bdrv_get_dirty_count(bmds->dirty_bitmap);
        aio_context_release(blk_get_aio_context(bmds->blk));
    }

    return dirty;
}

/* Called with iothread lock taken.  */
static void block_migration_cleanup_bmds(void)
{
    BlkMigDevState *bmds;
    AioContext *ctx;

    unset_dirty_tracking();

    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
        bdrv_op_unblock_all(blk_bs(bmds->blk), bmds->blocker);
        error_free(bmds->blocker);

        /* Save ctx, because bmds->blk can disappear during blk_unref.  */
        ctx = blk_get_aio_context(bmds->blk);
        aio_context_acquire(ctx);
        blk_unref(bmds->blk);
        aio_context_release(ctx);

        g_free(bmds->blk_name);
        g_free(bmds->aio_bitmap);
        g_free(bmds);
    }
}
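/* The full cleanup below must also free any read buffers still queued on
 * blk_list; the initial bdrv_drain_all() ensures no read callback is
 * still pending while the list is torn down.  */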
/* Called with iothread lock taken.  */
static void block_migration_cleanup(void *opaque)
{
    BlkMigBlock *blk;

    bdrv_drain_all();

    block_migration_cleanup_bmds();

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        g_free(blk->buf);
        g_free(blk);
    }
    blk_mig_unlock();
}

static int block_save_setup(QEMUFile *f, void *opaque)
{
    int ret;

    trace_migration_block_save("setup", block_mig_state.submitted,
                               block_mig_state.transferred);

    qemu_mutex_lock_iothread();
    ret = init_blk_migration(f);
    if (ret < 0) {
        qemu_mutex_unlock_iothread();
        return ret;
    }

    /* start tracking dirty blocks */
    ret = set_dirty_tracking();

    qemu_mutex_unlock_iothread();

    if (ret) {
        return ret;
    }

    ret = flush_blks(f);
    blk_mig_reset_dirty_cursor();
    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ret;
}

static int block_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    int64_t last_bytes = qemu_file_total_transferred(f);
    int64_t delta_bytes;

    trace_migration_block_save("iterate", block_mig_state.submitted,
                               block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* control the rate of transfer */
    blk_mig_lock();
    while (block_mig_state.read_done * BLK_MIG_BLOCK_SIZE <
           qemu_file_get_rate_limit(f) &&
           block_mig_state.submitted < MAX_PARALLEL_IO &&
           (block_mig_state.submitted + block_mig_state.read_done) <
           MAX_IO_BUFFERS) {
        blk_mig_unlock();
        if (block_mig_state.bulk_completed == 0) {
            /* first finish the bulk phase */
            if (blk_mig_save_bulked_block(f) == 0) {
                /* finished saving bulk on all devices */
                block_mig_state.bulk_completed = 1;
            }
            ret = 0;
        } else {
            /* Always called with iothread lock taken for
             * simplicity; block_save_complete also calls it.
             */
            qemu_mutex_lock_iothread();
            ret = blk_mig_save_dirty_block(f, 1);
            qemu_mutex_unlock_iothread();
        }
        if (ret < 0) {
            return ret;
        }
        blk_mig_lock();
        if (ret != 0) {
            /* no more dirty blocks */
            break;
        }
    }
    blk_mig_unlock();

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);
    delta_bytes = qemu_file_total_transferred(f) - last_bytes;
    if (delta_bytes > 0) {
        return 1;
    } else if (delta_bytes < 0) {
        return -1;
    } else {
        return 0;
    }
}
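/* The rate-control loop in block_save_iterate() above throttles on three
 * independent limits: completed reads may not exceed the QEMUFile rate
 * limit, at most MAX_PARALLEL_IO (16) AIO requests may be in flight, and
 * no more than MAX_IO_BUFFERS (512) one-megabyte buffers may be allocated
 * at once.  */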
/* Called with iothread lock taken.  */

static int block_save_complete(QEMUFile *f, void *opaque)
{
    int ret;

    trace_migration_block_save("complete", block_mig_state.submitted,
                               block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* We know for sure that the bulk save has completed and that all
     * asynchronous reads have finished.  */
    blk_mig_lock();
    assert(block_mig_state.submitted == 0);
    blk_mig_unlock();

    do {
        ret = blk_mig_save_dirty_block(f, 0);
        if (ret < 0) {
            return ret;
        }
    } while (ret == 0);

    /* report completion */
    qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

    trace_migration_block_save_complete();

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    /* Make sure that our BlockBackends are gone, so that the block driver
     * nodes can be inactivated.  */
    block_migration_cleanup_bmds();

    return 0;
}

static void block_state_pending(void *opaque, uint64_t *must_precopy,
                                uint64_t *can_postcopy)
{
    /* Estimate pending number of bytes to send */
    uint64_t pending;

    qemu_mutex_lock_iothread();
    pending = get_remaining_dirty();
    qemu_mutex_unlock_iothread();

    blk_mig_lock();
    pending += block_mig_state.submitted * BLK_MIG_BLOCK_SIZE +
               block_mig_state.read_done * BLK_MIG_BLOCK_SIZE;
    blk_mig_unlock();

    /* Report at least one block pending during bulk phase */
    if (!pending && !block_mig_state.bulk_completed) {
        pending = BLK_MIG_BLOCK_SIZE;
    }

    trace_migration_block_state_pending(pending);
    /* We don't do postcopy */
    *must_precopy += pending;
}

static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    static int banner_printed;
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockBackend *blk, *blk_prev = NULL;
    Error *local_err = NULL;
    uint8_t *buf;
    int64_t total_sectors = 0;
    int nr_sectors;
    int ret;
    BlockDriverInfo bdi;
    int cluster_size = BLK_MIG_BLOCK_SIZE;

    do {
        addr = qemu_get_be64(f);

        flags = addr & (BDRV_SECTOR_SIZE - 1);
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            blk = blk_by_name(device_name);
            if (!blk) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }

            if (blk != blk_prev) {
                blk_prev = blk;
                total_sectors = blk_nb_sectors(blk);
                if (total_sectors <= 0) {
                    error_report("Error getting length of block device %s",
                                 device_name);
                    return -EINVAL;
                }

                blk_activate(blk, &local_err);
                if (local_err) {
                    error_report_err(local_err);
                    return -EINVAL;
                }

                ret = bdrv_get_info(blk_bs(blk), &bdi);
                if (ret == 0 && bdi.cluster_size > 0 &&
                    bdi.cluster_size <= BLK_MIG_BLOCK_SIZE &&
                    BLK_MIG_BLOCK_SIZE % bdi.cluster_size == 0) {
                    cluster_size = bdi.cluster_size;
                } else {
                    cluster_size = BLK_MIG_BLOCK_SIZE;
                }
            }

            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - addr;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }

            if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
                ret = blk_pwrite_zeroes(blk, addr * BDRV_SECTOR_SIZE,
                                        nr_sectors * BDRV_SECTOR_SIZE,
                                        BDRV_REQ_MAY_UNMAP);
            } else {
                int i;
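                /* A non-zero block arrives as one full 1 MiB buffer; it is
                 * written out cluster by cluster below so that any all-zero
                 * clusters can still be punched with blk_pwrite_zeroes().  */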
                int64_t cur_addr;
                uint8_t *cur_buf;

                buf = g_malloc(BLK_MIG_BLOCK_SIZE);
                qemu_get_buffer(f, buf, BLK_MIG_BLOCK_SIZE);
                for (i = 0; i < BLK_MIG_BLOCK_SIZE / cluster_size; i++) {
                    cur_addr = addr * BDRV_SECTOR_SIZE + i * cluster_size;
                    cur_buf = buf + i * cluster_size;

                    if ((!block_mig_state.zero_blocks ||
                        cluster_size < BLK_MIG_BLOCK_SIZE) &&
                        buffer_is_zero(cur_buf, cluster_size)) {
                        ret = blk_pwrite_zeroes(blk, cur_addr,
                                                cluster_size,
                                                BDRV_REQ_MAY_UNMAP);
                    } else {
                        ret = blk_pwrite(blk, cur_addr, cluster_size, cur_buf,
                                         0);
                    }
                    if (ret < 0) {
                        break;
                    }
                }
                g_free(buf);
            }

            if (ret < 0) {
                return ret;
            }
        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
            if (!banner_printed) {
                printf("Receiving block device images\n");
                banner_printed = 1;
            }
            printf("Completed %d %%%c", (int)addr,
                   (addr == 100) ? '\n' : '\r');
            fflush(stdout);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown block migration flags: 0x%x\n", flags);
            return -EINVAL;
        }
        ret = qemu_file_get_error(f);
        if (ret != 0) {
            return ret;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

static bool block_is_active(void *opaque)
{
    return migrate_use_block();
}

static SaveVMHandlers savevm_block_handlers = {
    .save_setup = block_save_setup,
    .save_live_iterate = block_save_iterate,
    .save_live_complete_precopy = block_save_complete,
    .state_pending_exact = block_state_pending,
    .state_pending_estimate = block_state_pending,
    .load_state = block_load,
    .save_cleanup = block_migration_cleanup,
    .is_active = block_is_active,
};

void blk_mig_init(void)
{
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);
    qemu_mutex_init(&block_mig_state.lock);

    register_savevm_live("block", 0, 1, &savevm_block_handlers,
                         &block_mig_state);
}