/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/cutils.h"
#include "qemu/queue.h"
#include "block.h"
#include "block/dirty-bitmap.h"
#include "migration/misc.h"
#include "migration.h"
#include "migration/register.h"
#include "qemu-file.h"
#include "migration/vmstate.h"
#include "sysemu/block-backend.h"
#include "trace.h"

#define BLK_MIG_BLOCK_SIZE           (1ULL << 20)
#define BDRV_SECTORS_PER_DIRTY_CHUNK (BLK_MIG_BLOCK_SIZE >> BDRV_SECTOR_BITS)

#define BLK_MIG_FLAG_DEVICE_BLOCK    0x01
#define BLK_MIG_FLAG_EOS             0x02
#define BLK_MIG_FLAG_PROGRESS        0x04
#define BLK_MIG_FLAG_ZERO_BLOCK      0x08

#define MAX_IS_ALLOCATED_SEARCH (65536 * BDRV_SECTOR_SIZE)

#define MAX_IO_BUFFERS 512
#define MAX_PARALLEL_IO 16

typedef struct BlkMigDevState {
    /* Written during setup phase.  Can be read without a lock.  */
    BlockBackend *blk;
    char *blk_name;
    int shared_base;
    int64_t total_sectors;
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;
    Error *blocker;

    /* Only used by migration thread.  Does not need a lock.  */
    int bulk_completed;
    int64_t cur_sector;
    int64_t cur_dirty;

    /* Data in the aio_bitmap is protected by block migration lock.
     * Allocation and free happen during setup and cleanup respectively.
     */
    unsigned long *aio_bitmap;

    /* Protected by block migration lock.  */
    int64_t completed_sectors;

    /* During migration this is protected by iothread lock / AioContext.
     * Allocation and free happen during setup and cleanup respectively.
     */
    BdrvDirtyBitmap *dirty_bitmap;
} BlkMigDevState;

typedef struct BlkMigBlock {
    /* Only used by migration thread.  */
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    int nr_sectors;
    QEMUIOVector qiov;
    BlockAIOCB *aiocb;

    /* Protected by block migration lock.  */
    int ret;
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
} BlkMigBlock;

typedef struct BlkMigState {
    QSIMPLEQ_HEAD(, BlkMigDevState) bmds_list;
    int64_t total_sector_sum;
    bool zero_blocks;

    /* Protected by lock.  */
    QSIMPLEQ_HEAD(, BlkMigBlock) blk_list;
    int submitted;
    int read_done;

    /* Only used by migration thread.  Does not need a lock.  */
    int transferred;
    int prev_progress;
    int bulk_completed;

    /* Lock must be taken _inside_ the iothread lock and any AioContexts.  */
    QemuMutex lock;
} BlkMigState;

static BlkMigState block_mig_state;

static void blk_mig_lock(void)
{
    qemu_mutex_lock(&block_mig_state.lock);
}

static void blk_mig_unlock(void)
{
    qemu_mutex_unlock(&block_mig_state.lock);
}

/* Must run outside of the iothread lock during the bulk phase,
 * or the VM will stall.
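 *
 * Sends a single chunk on the wire: a be64 packing the flag bits into the
 * low bits of the byte offset (the offset is sector-aligned, so the low
 * nine bits are free), the device name as a length-prefixed string, and
 * then the BLK_MIG_BLOCK_SIZE payload, which is elided entirely when the
 * chunk is all zeroes and zero-block detection is enabled.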
 */

static void blk_send(QEMUFile *f, BlkMigBlock *blk)
{
    int len;
    uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK;

    if (block_mig_state.zero_blocks &&
        buffer_is_zero(blk->buf, BLK_MIG_BLOCK_SIZE)) {
        flags |= BLK_MIG_FLAG_ZERO_BLOCK;
    }

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | flags);

    /* device name */
    len = strlen(blk->bmds->blk_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *) blk->bmds->blk_name, len);

    /* If a block is zero we need to flush here since the network
     * bandwidth is now a lot higher than the storage device bandwidth.
     * Thus, if we queue zero blocks we slow down the migration.
     */
    if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
        qemu_fflush(f);
        return;
    }

    qemu_put_buffer(f, blk->buf, BLK_MIG_BLOCK_SIZE);
}

int blk_mig_active(void)
{
    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
}

int blk_mig_bulk_active(void)
{
    return blk_mig_active() && !block_mig_state.bulk_completed;
}

uint64_t blk_mig_bytes_transferred(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    blk_mig_lock();
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->completed_sectors;
    }
    blk_mig_unlock();
    return sum << BDRV_SECTOR_BITS;
}

uint64_t blk_mig_bytes_remaining(void)
{
    return blk_mig_bytes_total() - blk_mig_bytes_transferred();
}

uint64_t blk_mig_bytes_total(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->total_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}


/* Called with migration lock held.  */

static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (sector < bmds->total_sectors) {
        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}

/* Called with migration lock held.  */

static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
                                  int nb_sectors, int set)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bmds->aio_bitmap[idx];
        if (set) {
            val |= 1UL << bit;
        } else {
            val &= ~(1UL << bit);
        }
        bmds->aio_bitmap[idx] = val;
    }
}

static void alloc_aio_bitmap(BlkMigDevState *bmds)
{
    int64_t bitmap_size;

    bitmap_size = bmds->total_sectors + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

    bmds->aio_bitmap = g_malloc0(bitmap_size);
}

/* Never hold migration lock when yielding to the main loop!
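 *
 * The completion callback below runs in the main loop (or the device's
 * AioContext) and takes the migration lock itself; since the migration
 * lock must be taken inside the iothread lock and any AioContexts,
 * yielding while holding it can deadlock against the callback.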
 */

static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk_mig_lock();
    blk->ret = ret;

    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
    blk_mig_unlock();
}

/* Called with no lock taken.  */

static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockBackend *bb = bmds->blk;
    BlkMigBlock *blk;
    int nr_sectors;
    int64_t count;

    if (bmds->shared_base) {
        qemu_mutex_lock_iothread();
        aio_context_acquire(blk_get_aio_context(bb));
        /* Skip unallocated sectors; intentionally treats failure or
         * partial sector as an allocated sector */
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(blk_bs(bb), cur_sector * BDRV_SECTOR_SIZE,
                                  MAX_IS_ALLOCATED_SEARCH, &count)) {
            if (count < BDRV_SECTOR_SIZE) {
                break;
            }
            cur_sector += count >> BDRV_SECTOR_BITS;
        }
        aio_context_release(blk_get_aio_context(bb));
        qemu_mutex_unlock_iothread();
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = g_new(BlkMigBlock, 1);
    blk->buf = g_malloc(BLK_MIG_BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;
    blk->nr_sectors = nr_sectors;

    qemu_iovec_init_buf(&blk->qiov, blk->buf, nr_sectors * BDRV_SECTOR_SIZE);

    blk_mig_lock();
    block_mig_state.submitted++;
    blk_mig_unlock();

    /* We do not know if bs is under the main thread (and thus does
     * not acquire the AioContext when doing AIO) or rather under
     * dataplane.  Thus acquire both the iothread mutex and the
     * AioContext.
     *
     * This is ugly and will disappear when we make bdrv_* thread-safe,
     * without the need to acquire the AioContext.
     */
    qemu_mutex_lock_iothread();
    aio_context_acquire(blk_get_aio_context(bmds->blk));
    bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, cur_sector * BDRV_SECTOR_SIZE,
                            nr_sectors * BDRV_SECTOR_SIZE);
    blk->aiocb = blk_aio_preadv(bb, cur_sector * BDRV_SECTOR_SIZE, &blk->qiov,
                                0, blk_mig_read_cb, blk);
    aio_context_release(blk_get_aio_context(bmds->blk));
    qemu_mutex_unlock_iothread();

    bmds->cur_sector = cur_sector + nr_sectors;
    return (bmds->cur_sector >= total_sectors);
}

/* Called with iothread lock taken.
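 *
 * Creates one dirty bitmap per device, with BLK_MIG_BLOCK_SIZE
 * granularity, so that guest writes issued while the bulk phase is still
 * copying are recorded and re-sent by the dirty phase.  On failure, any
 * bitmaps created so far are released again.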
 */

static int set_dirty_tracking(void)
{
    BlkMigDevState *bmds;
    int ret;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->dirty_bitmap = bdrv_create_dirty_bitmap(blk_bs(bmds->blk),
                                                      BLK_MIG_BLOCK_SIZE,
                                                      NULL, NULL);
        if (!bmds->dirty_bitmap) {
            ret = -errno;
            goto fail;
        }
    }
    return 0;

fail:
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->dirty_bitmap) {
            bdrv_release_dirty_bitmap(bmds->dirty_bitmap);
        }
    }
    return ret;
}

/* Called with iothread lock taken.  */

static void unset_dirty_tracking(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bdrv_release_dirty_bitmap(bmds->dirty_bitmap);
    }
}

static int init_blk_migration(QEMUFile *f)
{
    BlockDriverState *bs;
    BlkMigDevState *bmds;
    int64_t sectors;
    BdrvNextIterator it;
    int i, num_bs = 0;
    struct {
        BlkMigDevState *bmds;
        BlockDriverState *bs;
    } *bmds_bs;
    Error *local_err = NULL;
    int ret;

    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.prev_progress = -1;
    block_mig_state.bulk_completed = 0;
    block_mig_state.zero_blocks = migrate_zero_blocks();

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        num_bs++;
    }
    bmds_bs = g_malloc0(num_bs * sizeof(*bmds_bs));

    for (i = 0, bs = bdrv_first(&it); bs; bs = bdrv_next(&it), i++) {
        if (bdrv_is_read_only(bs)) {
            continue;
        }

        sectors = bdrv_nb_sectors(bs);
        if (sectors <= 0) {
            ret = sectors;
            bdrv_next_cleanup(&it);
            goto out;
        }

        bmds = g_new0(BlkMigDevState, 1);
        bmds->blk = blk_new(qemu_get_aio_context(),
                            BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
        bmds->blk_name = g_strdup(bdrv_get_device_name(bs));
        bmds->bulk_completed = 0;
        bmds->total_sectors = sectors;
        bmds->completed_sectors = 0;
        bmds->shared_base = migrate_use_block_incremental();

        assert(i < num_bs);
        bmds_bs[i].bmds = bmds;
        bmds_bs[i].bs = bs;

        block_mig_state.total_sector_sum += sectors;

        if (bmds->shared_base) {
            trace_migration_block_init_shared(bdrv_get_device_name(bs));
        } else {
            trace_migration_block_init_full(bdrv_get_device_name(bs));
        }

        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
    }

    /* Can only insert new BDSes now because doing so while iterating block
     * devices may end up in a deadlock (iterating the new BDSes, too). */
    for (i = 0; i < num_bs; i++) {
        BlkMigDevState *bmds = bmds_bs[i].bmds;
        BlockDriverState *bs = bmds_bs[i].bs;

        if (bmds) {
            ret = blk_insert_bs(bmds->blk, bs, &local_err);
            if (ret < 0) {
                error_report_err(local_err);
                goto out;
            }

            alloc_aio_bitmap(bmds);
            error_setg(&bmds->blocker, "block device is in use by migration");
            bdrv_op_block_all(bs, bmds->blocker);
        }
    }

    ret = 0;
out:
    g_free(bmds_bs);
    return ret;
}

/* Called with no lock taken.
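 *
 * Saves one bulk chunk from the first device whose bulk phase is not yet
 * complete, then emits a progress marker when the percentage changes.
 * Returns 0 only once every device has completed its bulk phase;
 * otherwise it processes one chunk and returns 1.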
 */

static int blk_mig_save_bulked_block(QEMUFile *f)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int progress;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(f, bmds) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        } else {
            completed_sector_sum += bmds->completed_sectors;
        }
    }

    if (block_mig_state.total_sector_sum != 0) {
        progress = completed_sector_sum * 100 /
                   block_mig_state.total_sector_sum;
    } else {
        progress = 100;
    }
    if (progress != block_mig_state.prev_progress) {
        block_mig_state.prev_progress = progress;
        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
                         | BLK_MIG_FLAG_PROGRESS);
        trace_migration_block_progression(progress);
    }

    return ret;
}

static void blk_mig_reset_dirty_cursor(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->cur_dirty = 0;
    }
}

/* Called with iothread lock and AioContext taken.  */

static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
                                 int is_async)
{
    BlkMigBlock *blk;
    int64_t total_sectors = bmds->total_sectors;
    int64_t sector;
    int nr_sectors;
    int ret = -EIO;

    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
        blk_mig_lock();
        if (bmds_aio_inflight(bmds, sector)) {
            blk_mig_unlock();
            blk_drain(bmds->blk);
        } else {
            blk_mig_unlock();
        }
        bdrv_dirty_bitmap_lock(bmds->dirty_bitmap);
        if (bdrv_dirty_bitmap_get_locked(bmds->dirty_bitmap,
                                         sector * BDRV_SECTOR_SIZE)) {
            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - sector;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }
            bdrv_reset_dirty_bitmap_locked(bmds->dirty_bitmap,
                                           sector * BDRV_SECTOR_SIZE,
                                           nr_sectors * BDRV_SECTOR_SIZE);
            bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap);

            blk = g_new(BlkMigBlock, 1);
            blk->buf = g_malloc(BLK_MIG_BLOCK_SIZE);
            blk->bmds = bmds;
            blk->sector = sector;
            blk->nr_sectors = nr_sectors;

            if (is_async) {
                qemu_iovec_init_buf(&blk->qiov, blk->buf,
                                    nr_sectors * BDRV_SECTOR_SIZE);

                blk->aiocb = blk_aio_preadv(bmds->blk,
                                            sector * BDRV_SECTOR_SIZE,
                                            &blk->qiov, 0, blk_mig_read_cb,
                                            blk);

                blk_mig_lock();
                block_mig_state.submitted++;
                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
                blk_mig_unlock();
            } else {
                ret = blk_pread(bmds->blk, sector * BDRV_SECTOR_SIZE,
                                nr_sectors * BDRV_SECTOR_SIZE, blk->buf, 0);
                if (ret < 0) {
                    goto error;
                }
                blk_send(f, blk);

                g_free(blk->buf);
                g_free(blk);
            }

            sector += nr_sectors;
            bmds->cur_dirty = sector;
            break;
        }

        bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap);
        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        bmds->cur_dirty = sector;
    }

    return (bmds->cur_dirty >= bmds->total_sectors);

error:
    trace_migration_block_save_device_dirty(sector);
    g_free(blk->buf);
    g_free(blk);
    return ret;
}

/* Called with iothread lock taken.
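 * Acquires each device's AioContext while its dirty blocks are saved.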
 *
 * return value:
 *   0: too much data remains for max_downtime
 *   1: little enough data remains for max_downtime
 * < 0: error
 */
static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;
    int ret = 1;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        aio_context_acquire(blk_get_aio_context(bmds->blk));
        ret = mig_save_device_dirty(f, bmds, is_async);
        aio_context_release(blk_get_aio_context(bmds->blk));
        if (ret <= 0) {
            break;
        }
    }

    return ret;
}

/* Called with no locks taken.  */

static int flush_blks(QEMUFile *f)
{
    BlkMigBlock *blk;
    int ret = 0;

    trace_migration_block_flush_blks("Enter", block_mig_state.submitted,
                                     block_mig_state.read_done,
                                     block_mig_state.transferred);

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        if (qemu_file_rate_limit(f)) {
            break;
        }
        if (blk->ret < 0) {
            ret = blk->ret;
            break;
        }

        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        blk_mig_unlock();
        blk_send(f, blk);
        blk_mig_lock();

        g_free(blk->buf);
        g_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }
    blk_mig_unlock();

    trace_migration_block_flush_blks("Exit", block_mig_state.submitted,
                                     block_mig_state.read_done,
                                     block_mig_state.transferred);
    return ret;
}

/* Called with iothread lock taken.  */

static int64_t get_remaining_dirty(void)
{
    BlkMigDevState *bmds;
    int64_t dirty = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        aio_context_acquire(blk_get_aio_context(bmds->blk));
        dirty += bdrv_get_dirty_count(bmds->dirty_bitmap);
        aio_context_release(blk_get_aio_context(bmds->blk));
    }

    return dirty;
}


/* Called with iothread lock taken.  */
static void block_migration_cleanup_bmds(void)
{
    BlkMigDevState *bmds;
    AioContext *ctx;

    unset_dirty_tracking();

    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
        bdrv_op_unblock_all(blk_bs(bmds->blk), bmds->blocker);
        error_free(bmds->blocker);

        /* Save ctx, because bmds->blk can disappear during blk_unref.  */
        ctx = blk_get_aio_context(bmds->blk);
        aio_context_acquire(ctx);
        blk_unref(bmds->blk);
        aio_context_release(ctx);

        g_free(bmds->blk_name);
        g_free(bmds->aio_bitmap);
        g_free(bmds);
    }
}

/* Called with iothread lock taken.
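 *
 * Registered as the save_cleanup hook: drains all outstanding I/O,
 * releases the per-device migration state, and frees any read buffers
 * still queued on blk_list.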
 */
static void block_migration_cleanup(void *opaque)
{
    BlkMigBlock *blk;

    bdrv_drain_all();

    block_migration_cleanup_bmds();

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        g_free(blk->buf);
        g_free(blk);
    }
    blk_mig_unlock();
}

static int block_save_setup(QEMUFile *f, void *opaque)
{
    int ret;

    trace_migration_block_save("setup", block_mig_state.submitted,
                               block_mig_state.transferred);

    qemu_mutex_lock_iothread();
    ret = init_blk_migration(f);
    if (ret < 0) {
        qemu_mutex_unlock_iothread();
        return ret;
    }

    /* start tracking dirty blocks */
    ret = set_dirty_tracking();

    qemu_mutex_unlock_iothread();

    if (ret) {
        return ret;
    }

    ret = flush_blks(f);
    blk_mig_reset_dirty_cursor();
    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ret;
}

static int block_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    int64_t last_bytes = qemu_file_total_transferred(f);
    int64_t delta_bytes;

    trace_migration_block_save("iterate", block_mig_state.submitted,
                               block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* control the rate of transfer */
    blk_mig_lock();
    while (block_mig_state.read_done * BLK_MIG_BLOCK_SIZE <
           qemu_file_get_rate_limit(f) &&
           block_mig_state.submitted < MAX_PARALLEL_IO &&
           (block_mig_state.submitted + block_mig_state.read_done) <
           MAX_IO_BUFFERS) {
        blk_mig_unlock();
        if (block_mig_state.bulk_completed == 0) {
            /* first finish the bulk phase */
            if (blk_mig_save_bulked_block(f) == 0) {
                /* finished saving bulk on all devices */
                block_mig_state.bulk_completed = 1;
            }
            ret = 0;
        } else {
            /* Always called with iothread lock taken for
             * simplicity, block_save_complete also calls it.
             */
            qemu_mutex_lock_iothread();
            ret = blk_mig_save_dirty_block(f, 1);
            qemu_mutex_unlock_iothread();
        }
        if (ret < 0) {
            return ret;
        }
        blk_mig_lock();
        if (ret != 0) {
            /* no more dirty blocks */
            break;
        }
    }
    blk_mig_unlock();

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);
    delta_bytes = qemu_file_total_transferred(f) - last_bytes;
    if (delta_bytes > 0) {
        return 1;
    } else if (delta_bytes < 0) {
        return -1;
    } else {
        return 0;
    }
}

/* Called with iothread lock taken.
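 *
 * Runs in the completion phase, once the source VM has been stopped, so
 * the dirty bitmaps can no longer grow: the remaining dirty blocks are
 * read synchronously and sent, followed by the 100% progress marker,
 * the final EOS, and the release of our BlockBackends.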
 */

static int block_save_complete(QEMUFile *f, void *opaque)
{
    int ret;

    trace_migration_block_save("complete", block_mig_state.submitted,
                               block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* We know for sure that the bulk phase is completed and that
       all async reads have completed */
    blk_mig_lock();
    assert(block_mig_state.submitted == 0);
    blk_mig_unlock();

    do {
        ret = blk_mig_save_dirty_block(f, 0);
        if (ret < 0) {
            return ret;
        }
    } while (ret == 0);

    /* report completion */
    qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

    trace_migration_block_save_complete();

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    /* Make sure that our BlockBackends are gone, so that the block driver
     * nodes can be inactivated.  */
    block_migration_cleanup_bmds();

    return 0;
}

static void block_state_pending(void *opaque, uint64_t *must_precopy,
                                uint64_t *can_postcopy)
{
    /* Estimate pending number of bytes to send */
    uint64_t pending;

    qemu_mutex_lock_iothread();
    pending = get_remaining_dirty();
    qemu_mutex_unlock_iothread();

    blk_mig_lock();
    pending += block_mig_state.submitted * BLK_MIG_BLOCK_SIZE +
               block_mig_state.read_done * BLK_MIG_BLOCK_SIZE;
    blk_mig_unlock();

    /* Report at least one block pending during bulk phase */
    if (!pending && !block_mig_state.bulk_completed) {
        pending = BLK_MIG_BLOCK_SIZE;
    }

    trace_migration_block_state_pending(pending);
    /* We don't do postcopy */
    *must_precopy += pending;
}

static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    static int banner_printed;
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockBackend *blk, *blk_prev = NULL;
    Error *local_err = NULL;
    uint8_t *buf;
    int64_t total_sectors = 0;
    int nr_sectors;
    int ret;
    BlockDriverInfo bdi;
    int cluster_size = BLK_MIG_BLOCK_SIZE;

    do {
        addr = qemu_get_be64(f);

        flags = addr & (BDRV_SECTOR_SIZE - 1);
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            blk = blk_by_name(device_name);
            if (!blk) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }

            if (blk != blk_prev) {
                blk_prev = blk;
                total_sectors = blk_nb_sectors(blk);
                if (total_sectors <= 0) {
                    error_report("Error getting length of block device %s",
                                 device_name);
                    return -EINVAL;
                }

                blk_activate(blk, &local_err);
                if (local_err) {
                    error_report_err(local_err);
                    return -EINVAL;
                }

                ret = bdrv_get_info(blk_bs(blk), &bdi);
                if (ret == 0 && bdi.cluster_size > 0 &&
                    bdi.cluster_size <= BLK_MIG_BLOCK_SIZE &&
                    BLK_MIG_BLOCK_SIZE % bdi.cluster_size == 0) {
                    cluster_size = bdi.cluster_size;
                } else {
                    cluster_size = BLK_MIG_BLOCK_SIZE;
                }
            }

            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - addr;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }

            if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
                ret = blk_pwrite_zeroes(blk, addr * BDRV_SECTOR_SIZE,
                                        nr_sectors * BDRV_SECTOR_SIZE,
                                        BDRV_REQ_MAY_UNMAP);
            } else {
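                /* Writing the received block cluster by cluster lets
                 * all-zero clusters go through blk_pwrite_zeroes(), so the
                 * destination image can stay sparse even when the source
                 * did not mark the whole block as zero.
                 */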
                int i;
                int64_t cur_addr;
                uint8_t *cur_buf;

                buf = g_malloc(BLK_MIG_BLOCK_SIZE);
                qemu_get_buffer(f, buf, BLK_MIG_BLOCK_SIZE);
                for (i = 0; i < BLK_MIG_BLOCK_SIZE / cluster_size; i++) {
                    cur_addr = addr * BDRV_SECTOR_SIZE + i * cluster_size;
                    cur_buf = buf + i * cluster_size;

                    if ((!block_mig_state.zero_blocks ||
                        cluster_size < BLK_MIG_BLOCK_SIZE) &&
                        buffer_is_zero(cur_buf, cluster_size)) {
                        ret = blk_pwrite_zeroes(blk, cur_addr,
                                                cluster_size,
                                                BDRV_REQ_MAY_UNMAP);
                    } else {
                        ret = blk_pwrite(blk, cur_addr, cluster_size, cur_buf,
                                         0);
                    }
                    if (ret < 0) {
                        break;
                    }
                }
                g_free(buf);
            }

            if (ret < 0) {
                return ret;
            }
        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
            if (!banner_printed) {
                printf("Receiving block device images\n");
                banner_printed = 1;
            }
            printf("Completed %d %%%c", (int)addr,
                   (addr == 100) ? '\n' : '\r');
            fflush(stdout);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown block migration flags: 0x%x\n", flags);
            return -EINVAL;
        }
        ret = qemu_file_get_error(f);
        if (ret != 0) {
            return ret;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

static bool block_is_active(void *opaque)
{
    return migrate_use_block();
}

static SaveVMHandlers savevm_block_handlers = {
    .save_setup = block_save_setup,
    .save_live_iterate = block_save_iterate,
    .save_live_complete_precopy = block_save_complete,
    .state_pending_exact = block_state_pending,
    .state_pending_estimate = block_state_pending,
    .load_state = block_load,
    .save_cleanup = block_migration_cleanup,
    .is_active = block_is_active,
};

void blk_mig_init(void)
{
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);
    qemu_mutex_init(&block_mig_state.lock);

    register_savevm_live("block", 0, 1, &savevm_block_handlers,
                         &block_mig_state);
}