/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/cutils.h"
#include "qemu/queue.h"
#include "block.h"
#include "block/dirty-bitmap.h"
#include "migration/misc.h"
#include "migration.h"
#include "migration/register.h"
#include "qemu-file.h"
#include "migration/vmstate.h"
#include "sysemu/block-backend.h"
#include "trace.h"

#define BLK_MIG_BLOCK_SIZE           (1ULL << 20)
#define BDRV_SECTORS_PER_DIRTY_CHUNK (BLK_MIG_BLOCK_SIZE >> BDRV_SECTOR_BITS)

#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02
#define BLK_MIG_FLAG_PROGRESS           0x04
#define BLK_MIG_FLAG_ZERO_BLOCK         0x08

#define MAX_IS_ALLOCATED_SEARCH (65536 * BDRV_SECTOR_SIZE)

#define MAX_IO_BUFFERS 512
#define MAX_PARALLEL_IO 16

/* #define DEBUG_BLK_MIGRATION */

#ifdef DEBUG_BLK_MIGRATION
#define DPRINTF(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

typedef struct BlkMigDevState {
    /* Written during setup phase.  Can be read without a lock.  */
    BlockBackend *blk;
    char *blk_name;
    int shared_base;
    int64_t total_sectors;
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;
    Error *blocker;

    /* Only used by migration thread.  Does not need a lock.  */
    int bulk_completed;
    int64_t cur_sector;
    int64_t cur_dirty;

    /* Data in the aio_bitmap is protected by block migration lock.
     * Allocation and free happen during setup and cleanup respectively.
     */
    unsigned long *aio_bitmap;

    /* Protected by block migration lock.  */
    int64_t completed_sectors;

    /* During migration this is protected by iothread lock / AioContext.
     * Allocation and free happen during setup and cleanup respectively.
     */
    BdrvDirtyBitmap *dirty_bitmap;
} BlkMigDevState;

typedef struct BlkMigBlock {
    /* Only used by migration thread.  */
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    int nr_sectors;
    QEMUIOVector qiov;
    BlockAIOCB *aiocb;

    /* Protected by block migration lock.  */
    int ret;
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
} BlkMigBlock;

typedef struct BlkMigState {
    QSIMPLEQ_HEAD(, BlkMigDevState) bmds_list;
    int64_t total_sector_sum;
    bool zero_blocks;

    /* Protected by lock.  */
    QSIMPLEQ_HEAD(, BlkMigBlock) blk_list;
    int submitted;
    int read_done;

    /* Only used by migration thread.  Does not need a lock.  */
    int transferred;
    int prev_progress;
    int bulk_completed;

    /* Lock must be taken _inside_ the iothread lock and any AioContexts.  */
    QemuMutex lock;
} BlkMigState;

static BlkMigState block_mig_state;

static void blk_mig_lock(void)
{
    qemu_mutex_lock(&block_mig_state.lock);
}

static void blk_mig_unlock(void)
{
    qemu_mutex_unlock(&block_mig_state.lock);
}
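
/* Stream format (as produced by blk_send() below and parsed by block_load()):
 * each device block is a big-endian 64-bit word carrying
 * (sector << BDRV_SECTOR_BITS) | flags, followed by a one-byte device name
 * length, the device name, and BLK_MIG_BLOCK_SIZE bytes of payload unless
 * BLK_MIG_FLAG_ZERO_BLOCK is set.  PROGRESS and EOS markers consist of the
 * 64-bit word alone.
 */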

/* Must run outside of the iothread lock during the bulk phase,
 * or the VM will stall.
 */

static void blk_send(QEMUFile *f, BlkMigBlock * blk)
{
    int len;
    uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK;

    if (block_mig_state.zero_blocks &&
        buffer_is_zero(blk->buf, BLK_MIG_BLOCK_SIZE)) {
        flags |= BLK_MIG_FLAG_ZERO_BLOCK;
    }

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | flags);

    /* device name */
    len = strlen(blk->bmds->blk_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *) blk->bmds->blk_name, len);

    /* if a block is zero we need to flush here since the network
     * bandwidth is now a lot higher than the storage device bandwidth.
     * thus if we queue zero blocks we slow down the migration */
    if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
        qemu_fflush(f);
        return;
    }

    qemu_put_buffer(f, blk->buf, BLK_MIG_BLOCK_SIZE);
}

int blk_mig_active(void)
{
    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
}

int blk_mig_bulk_active(void)
{
    return blk_mig_active() && !block_mig_state.bulk_completed;
}

uint64_t blk_mig_bytes_transferred(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    blk_mig_lock();
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->completed_sectors;
    }
    blk_mig_unlock();
    return sum << BDRV_SECTOR_BITS;
}

uint64_t blk_mig_bytes_remaining(void)
{
    return blk_mig_bytes_total() - blk_mig_bytes_transferred();
}

uint64_t blk_mig_bytes_total(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->total_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}


/* Called with migration lock held.  */

static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (sector < blk_nb_sectors(bmds->blk)) {
        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}

/* Called with migration lock held.  */

static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
                                  int nb_sectors, int set)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bmds->aio_bitmap[idx];
        if (set) {
            val |= 1UL << bit;
        } else {
            val &= ~(1UL << bit);
        }
        bmds->aio_bitmap[idx] = val;
    }
}

static void alloc_aio_bitmap(BlkMigDevState *bmds)
{
    BlockBackend *bb = bmds->blk;
    int64_t bitmap_size;

    bitmap_size = blk_nb_sectors(bb) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

    bmds->aio_bitmap = g_malloc0(bitmap_size);
}
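
/* The aio_bitmap holds one bit per BDRV_SECTORS_PER_DIRTY_CHUNK-sector chunk,
 * packed into unsigned longs.  mig_save_device_dirty() sets a chunk's bit
 * while an asynchronous read for it is in flight; blk_mig_read_cb() clears
 * the bit again and moves the finished request from "submitted" to
 * "read_done", so that flush_blks() can put it on the wire.
 */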

/* Never hold migration lock when yielding to the main loop!  */

static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk_mig_lock();
    blk->ret = ret;

    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
    blk_mig_unlock();
}

/* Called with no lock taken.  */

static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockBackend *bb = bmds->blk;
    BlkMigBlock *blk;
    int nr_sectors;
    int64_t count;

    if (bmds->shared_base) {
        qemu_mutex_lock_iothread();
        aio_context_acquire(blk_get_aio_context(bb));
        /* Skip unallocated sectors; intentionally treats failure or
         * partial sector as an allocated sector */
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(blk_bs(bb), cur_sector * BDRV_SECTOR_SIZE,
                                  MAX_IS_ALLOCATED_SEARCH, &count)) {
            if (count < BDRV_SECTOR_SIZE) {
                break;
            }
            cur_sector += count >> BDRV_SECTOR_BITS;
        }
        aio_context_release(blk_get_aio_context(bb));
        qemu_mutex_unlock_iothread();
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = g_new(BlkMigBlock, 1);
    blk->buf = g_malloc(BLK_MIG_BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;
    blk->nr_sectors = nr_sectors;

    qemu_iovec_init_buf(&blk->qiov, blk->buf, nr_sectors * BDRV_SECTOR_SIZE);

    blk_mig_lock();
    block_mig_state.submitted++;
    blk_mig_unlock();

    /* We do not know if bs is under the main thread (and thus does
     * not acquire the AioContext when doing AIO) or rather under
     * dataplane.  Thus acquire both the iothread mutex and the
     * AioContext.
     *
     * This is ugly and will disappear when we make bdrv_* thread-safe,
     * without the need to acquire the AioContext.
     */
    qemu_mutex_lock_iothread();
    aio_context_acquire(blk_get_aio_context(bmds->blk));
    bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, cur_sector * BDRV_SECTOR_SIZE,
                            nr_sectors * BDRV_SECTOR_SIZE);
    blk->aiocb = blk_aio_preadv(bb, cur_sector * BDRV_SECTOR_SIZE, &blk->qiov,
                                0, blk_mig_read_cb, blk);
    aio_context_release(blk_get_aio_context(bmds->blk));
    qemu_mutex_unlock_iothread();

    bmds->cur_sector = cur_sector + nr_sectors;
    return (bmds->cur_sector >= total_sectors);
}
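
/* The per-device dirty bitmap is created with BLK_MIG_BLOCK_SIZE granularity,
 * so each dirty bit corresponds to exactly one migration chunk of
 * BDRV_SECTORS_PER_DIRTY_CHUNK sectors.
 */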

/* Called with iothread lock taken.  */

static int set_dirty_tracking(void)
{
    BlkMigDevState *bmds;
    int ret;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->dirty_bitmap = bdrv_create_dirty_bitmap(blk_bs(bmds->blk),
                                                      BLK_MIG_BLOCK_SIZE,
                                                      NULL, NULL);
        if (!bmds->dirty_bitmap) {
            ret = -errno;
            goto fail;
        }
    }
    return 0;

fail:
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->dirty_bitmap) {
            bdrv_release_dirty_bitmap(bmds->dirty_bitmap);
        }
    }
    return ret;
}

/* Called with iothread lock taken.  */

static void unset_dirty_tracking(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bdrv_release_dirty_bitmap(bmds->dirty_bitmap);
    }
}

static int init_blk_migration(QEMUFile *f)
{
    BlockDriverState *bs;
    BlkMigDevState *bmds;
    int64_t sectors;
    BdrvNextIterator it;
    int i, num_bs = 0;
    struct {
        BlkMigDevState *bmds;
        BlockDriverState *bs;
    } *bmds_bs;
    Error *local_err = NULL;
    int ret;

    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.prev_progress = -1;
    block_mig_state.bulk_completed = 0;
    block_mig_state.zero_blocks = migrate_zero_blocks();

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        num_bs++;
    }
    bmds_bs = g_malloc0(num_bs * sizeof(*bmds_bs));

    for (i = 0, bs = bdrv_first(&it); bs; bs = bdrv_next(&it), i++) {
        if (bdrv_is_read_only(bs)) {
            continue;
        }

        sectors = bdrv_nb_sectors(bs);
        if (sectors <= 0) {
            ret = sectors;
            bdrv_next_cleanup(&it);
            goto out;
        }

        bmds = g_new0(BlkMigDevState, 1);
        bmds->blk = blk_new(qemu_get_aio_context(),
                            BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
        bmds->blk_name = g_strdup(bdrv_get_device_name(bs));
        bmds->bulk_completed = 0;
        bmds->total_sectors = sectors;
        bmds->completed_sectors = 0;
        bmds->shared_base = migrate_use_block_incremental();

        assert(i < num_bs);
        bmds_bs[i].bmds = bmds;
        bmds_bs[i].bs = bs;

        block_mig_state.total_sector_sum += sectors;

        if (bmds->shared_base) {
            trace_migration_block_init_shared(bdrv_get_device_name(bs));
        } else {
            trace_migration_block_init_full(bdrv_get_device_name(bs));
        }

        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
    }

    /* Can only insert new BDSes now because doing so while iterating block
     * devices may end up in a deadlock (iterating the new BDSes, too). */
    for (i = 0; i < num_bs; i++) {
        BlkMigDevState *bmds = bmds_bs[i].bmds;
        BlockDriverState *bs = bmds_bs[i].bs;

        if (bmds) {
            ret = blk_insert_bs(bmds->blk, bs, &local_err);
            if (ret < 0) {
                error_report_err(local_err);
                goto out;
            }

            alloc_aio_bitmap(bmds);
            error_setg(&bmds->blocker, "block device is in use by migration");
            bdrv_op_block_all(bs, bmds->blocker);
        }
    }

    ret = 0;
out:
    g_free(bmds_bs);
    return ret;
}
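
/* The bulk phase submits a read for one chunk per call, for the first device
 * whose bulk transfer is not yet complete.  Progress is reported as a
 * percentage, encoded in the stream as
 * (progress << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS whenever it changes.
 */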

/* Called with no lock taken.  */

static int blk_mig_save_bulked_block(QEMUFile *f)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int progress;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(f, bmds) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        } else {
            completed_sector_sum += bmds->completed_sectors;
        }
    }

    if (block_mig_state.total_sector_sum != 0) {
        progress = completed_sector_sum * 100 /
                   block_mig_state.total_sector_sum;
    } else {
        progress = 100;
    }
    if (progress != block_mig_state.prev_progress) {
        block_mig_state.prev_progress = progress;
        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
                         | BLK_MIG_FLAG_PROGRESS);
        DPRINTF("Completed %d %%\r", progress);
    }

    return ret;
}

static void blk_mig_reset_dirty_cursor(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->cur_dirty = 0;
    }
}

/* Called with iothread lock and AioContext taken.  */

static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
                                 int is_async)
{
    BlkMigBlock *blk;
    int64_t total_sectors = bmds->total_sectors;
    int64_t sector;
    int nr_sectors;
    int ret = -EIO;

    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
        blk_mig_lock();
        if (bmds_aio_inflight(bmds, sector)) {
            blk_mig_unlock();
            blk_drain(bmds->blk);
        } else {
            blk_mig_unlock();
        }
        bdrv_dirty_bitmap_lock(bmds->dirty_bitmap);
        if (bdrv_dirty_bitmap_get_locked(bmds->dirty_bitmap,
                                         sector * BDRV_SECTOR_SIZE)) {
            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - sector;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }
            bdrv_reset_dirty_bitmap_locked(bmds->dirty_bitmap,
                                           sector * BDRV_SECTOR_SIZE,
                                           nr_sectors * BDRV_SECTOR_SIZE);
            bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap);

            blk = g_new(BlkMigBlock, 1);
            blk->buf = g_malloc(BLK_MIG_BLOCK_SIZE);
            blk->bmds = bmds;
            blk->sector = sector;
            blk->nr_sectors = nr_sectors;

            if (is_async) {
                qemu_iovec_init_buf(&blk->qiov, blk->buf,
                                    nr_sectors * BDRV_SECTOR_SIZE);

                blk->aiocb = blk_aio_preadv(bmds->blk,
                                            sector * BDRV_SECTOR_SIZE,
                                            &blk->qiov, 0, blk_mig_read_cb,
                                            blk);

                blk_mig_lock();
                block_mig_state.submitted++;
                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
                blk_mig_unlock();
            } else {
                ret = blk_pread(bmds->blk, sector * BDRV_SECTOR_SIZE,
                                nr_sectors * BDRV_SECTOR_SIZE, blk->buf, 0);
                if (ret < 0) {
                    goto error;
                }
                blk_send(f, blk);

                g_free(blk->buf);
                g_free(blk);
            }

            sector += nr_sectors;
            bmds->cur_dirty = sector;
            break;
        }

        bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap);
        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        bmds->cur_dirty = sector;
    }

    return (bmds->cur_dirty >= bmds->total_sectors);

error:
    trace_migration_block_save_device_dirty(sector);
    g_free(blk->buf);
    g_free(blk);
    return ret;
}
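
/* mig_save_device_dirty() is used in two modes: asynchronously (is_async=1)
 * from block_save_iterate() while migration is still iterating, and
 * synchronously (is_async=0) from block_save_complete(), which reads and
 * sends each remaining dirty chunk inline.
 */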

/* Called with iothread lock taken.
 *
 * return value:
 * 0: too much data for max_downtime
 * 1: few enough data for max_downtime
 */
static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;
    int ret = 1;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        aio_context_acquire(blk_get_aio_context(bmds->blk));
        ret = mig_save_device_dirty(f, bmds, is_async);
        aio_context_release(blk_get_aio_context(bmds->blk));
        if (ret <= 0) {
            break;
        }
    }

    return ret;
}

/* Called with no locks taken.  */

static int flush_blks(QEMUFile *f)
{
    BlkMigBlock *blk;
    int ret = 0;

    trace_migration_block_flush_blks("Enter", block_mig_state.submitted,
                                     block_mig_state.read_done,
                                     block_mig_state.transferred);

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        if (qemu_file_rate_limit(f)) {
            break;
        }
        if (blk->ret < 0) {
            ret = blk->ret;
            break;
        }

        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        blk_mig_unlock();
        blk_send(f, blk);
        blk_mig_lock();

        g_free(blk->buf);
        g_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }
    blk_mig_unlock();

    trace_migration_block_flush_blks("Exit", block_mig_state.submitted,
                                     block_mig_state.read_done,
                                     block_mig_state.transferred);
    return ret;
}

/* Called with iothread lock taken.  */

static int64_t get_remaining_dirty(void)
{
    BlkMigDevState *bmds;
    int64_t dirty = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        aio_context_acquire(blk_get_aio_context(bmds->blk));
        dirty += bdrv_get_dirty_count(bmds->dirty_bitmap);
        aio_context_release(blk_get_aio_context(bmds->blk));
    }

    return dirty;
}


/* Called with iothread lock taken.  */
static void block_migration_cleanup_bmds(void)
{
    BlkMigDevState *bmds;
    AioContext *ctx;

    unset_dirty_tracking();

    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
        bdrv_op_unblock_all(blk_bs(bmds->blk), bmds->blocker);
        error_free(bmds->blocker);

        /* Save ctx, because bmds->blk can disappear during blk_unref.  */
        ctx = blk_get_aio_context(bmds->blk);
        aio_context_acquire(ctx);
        blk_unref(bmds->blk);
        aio_context_release(ctx);

        g_free(bmds->blk_name);
        g_free(bmds->aio_bitmap);
        g_free(bmds);
    }
}
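
/* Cleanup drains all outstanding AIO first, then releases the per-device
 * state, and finally frees any read blocks that were still queued for
 * sending.
 */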

/* Called with iothread lock taken.  */
static void block_migration_cleanup(void *opaque)
{
    BlkMigBlock *blk;

    bdrv_drain_all();

    block_migration_cleanup_bmds();

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        g_free(blk->buf);
        g_free(blk);
    }
    blk_mig_unlock();
}

static int block_save_setup(QEMUFile *f, void *opaque)
{
    int ret;

    trace_migration_block_save("setup", block_mig_state.submitted,
                               block_mig_state.transferred);

    qemu_mutex_lock_iothread();
    ret = init_blk_migration(f);
    if (ret < 0) {
        qemu_mutex_unlock_iothread();
        return ret;
    }

    /* start tracking dirty blocks */
    ret = set_dirty_tracking();

    qemu_mutex_unlock_iothread();

    if (ret) {
        return ret;
    }

    ret = flush_blks(f);
    blk_mig_reset_dirty_cursor();
    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ret;
}

static int block_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    int64_t last_bytes = qemu_file_total_transferred(f);
    int64_t delta_bytes;

    trace_migration_block_save("iterate", block_mig_state.submitted,
                               block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* control the rate of transfer */
    blk_mig_lock();
    while (block_mig_state.read_done * BLK_MIG_BLOCK_SIZE <
           qemu_file_get_rate_limit(f) &&
           block_mig_state.submitted < MAX_PARALLEL_IO &&
           (block_mig_state.submitted + block_mig_state.read_done) <
           MAX_IO_BUFFERS) {
        blk_mig_unlock();
        if (block_mig_state.bulk_completed == 0) {
            /* first finish the bulk phase */
            if (blk_mig_save_bulked_block(f) == 0) {
                /* finished saving bulk on all devices */
                block_mig_state.bulk_completed = 1;
            }
            ret = 0;
        } else {
            /* Always called with iothread lock taken for
             * simplicity, block_save_complete also calls it.
             */
            qemu_mutex_lock_iothread();
            ret = blk_mig_save_dirty_block(f, 1);
            qemu_mutex_unlock_iothread();
        }
        if (ret < 0) {
            return ret;
        }
        blk_mig_lock();
        if (ret != 0) {
            /* no more dirty blocks */
            break;
        }
    }
    blk_mig_unlock();

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);
    delta_bytes = qemu_file_total_transferred(f) - last_bytes;
    if (delta_bytes > 0) {
        return 1;
    } else if (delta_bytes < 0) {
        return -1;
    } else {
        return 0;
    }
}
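
/* At completion time the bulk phase is over and flush_blks() has been run,
 * so block_save_complete() asserts that no reads are still submitted and
 * then sends the remaining dirty chunks synchronously via
 * blk_mig_save_dirty_block(f, 0).
 */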

/* Called with iothread lock taken.  */

static int block_save_complete(QEMUFile *f, void *opaque)
{
    int ret;

    trace_migration_block_save("complete", block_mig_state.submitted,
                               block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* we know for sure that save bulk is completed and
       all async read completed */
    blk_mig_lock();
    assert(block_mig_state.submitted == 0);
    blk_mig_unlock();

    do {
        ret = blk_mig_save_dirty_block(f, 0);
        if (ret < 0) {
            return ret;
        }
    } while (ret == 0);

    /* report completion */
    qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

    trace_migration_block_save_complete();

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    /* Make sure that our BlockBackends are gone, so that the block driver
     * nodes can be inactivated.  */
    block_migration_cleanup_bmds();

    return 0;
}

static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                               uint64_t *res_precopy_only,
                               uint64_t *res_compatible,
                               uint64_t *res_postcopy_only)
{
    /* Estimate pending number of bytes to send */
    uint64_t pending;

    qemu_mutex_lock_iothread();
    pending = get_remaining_dirty();
    qemu_mutex_unlock_iothread();

    blk_mig_lock();
    pending += block_mig_state.submitted * BLK_MIG_BLOCK_SIZE +
               block_mig_state.read_done * BLK_MIG_BLOCK_SIZE;
    blk_mig_unlock();

    /* Report at least one block pending during bulk phase */
    if (!pending && !block_mig_state.bulk_completed) {
        pending = BLK_MIG_BLOCK_SIZE;
    }

    trace_migration_block_save_pending(pending);
    /* We don't do postcopy */
    *res_precopy_only += pending;
}
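
/* Destination side: parse the stream produced by blk_send(), look up the
 * named BlockBackend and write each chunk at its sector offset.  Writes are
 * split into cluster_size pieces so that all-zero clusters can still be
 * written with blk_pwrite_zeroes() when the source did not mark the whole
 * block with BLK_MIG_FLAG_ZERO_BLOCK.
 */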

static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    static int banner_printed;
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockBackend *blk, *blk_prev = NULL;
    Error *local_err = NULL;
    uint8_t *buf;
    int64_t total_sectors = 0;
    int nr_sectors;
    int ret;
    BlockDriverInfo bdi;
    int cluster_size = BLK_MIG_BLOCK_SIZE;

    do {
        addr = qemu_get_be64(f);

        flags = addr & (BDRV_SECTOR_SIZE - 1);
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            blk = blk_by_name(device_name);
            if (!blk) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }

            if (blk != blk_prev) {
                blk_prev = blk;
                total_sectors = blk_nb_sectors(blk);
                if (total_sectors <= 0) {
                    error_report("Error getting length of block device %s",
                                 device_name);
                    return -EINVAL;
                }

                blk_activate(blk, &local_err);
                if (local_err) {
                    error_report_err(local_err);
                    return -EINVAL;
                }

                ret = bdrv_get_info(blk_bs(blk), &bdi);
                if (ret == 0 && bdi.cluster_size > 0 &&
                    bdi.cluster_size <= BLK_MIG_BLOCK_SIZE &&
                    BLK_MIG_BLOCK_SIZE % bdi.cluster_size == 0) {
                    cluster_size = bdi.cluster_size;
                } else {
                    cluster_size = BLK_MIG_BLOCK_SIZE;
                }
            }

            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - addr;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }

            if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
                ret = blk_pwrite_zeroes(blk, addr * BDRV_SECTOR_SIZE,
                                        nr_sectors * BDRV_SECTOR_SIZE,
                                        BDRV_REQ_MAY_UNMAP);
            } else {
                int i;
                int64_t cur_addr;
                uint8_t *cur_buf;

                buf = g_malloc(BLK_MIG_BLOCK_SIZE);
                qemu_get_buffer(f, buf, BLK_MIG_BLOCK_SIZE);
                for (i = 0; i < BLK_MIG_BLOCK_SIZE / cluster_size; i++) {
                    cur_addr = addr * BDRV_SECTOR_SIZE + i * cluster_size;
                    cur_buf = buf + i * cluster_size;

                    if ((!block_mig_state.zero_blocks ||
                         cluster_size < BLK_MIG_BLOCK_SIZE) &&
                        buffer_is_zero(cur_buf, cluster_size)) {
                        ret = blk_pwrite_zeroes(blk, cur_addr,
                                                cluster_size,
                                                BDRV_REQ_MAY_UNMAP);
                    } else {
                        ret = blk_pwrite(blk, cur_addr, cluster_size, cur_buf,
                                         0);
                    }
                    if (ret < 0) {
                        break;
                    }
                }
                g_free(buf);
            }

            if (ret < 0) {
                return ret;
            }
        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
            if (!banner_printed) {
                printf("Receiving block device images\n");
                banner_printed = 1;
            }
            printf("Completed %d %%%c", (int)addr,
                   (addr == 100) ? '\n' : '\r');
            fflush(stdout);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown block migration flags: 0x%x\n", flags);
            return -EINVAL;
        }
        ret = qemu_file_get_error(f);
        if (ret != 0) {
            return ret;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

static bool block_is_active(void *opaque)
{
    return migrate_use_block();
}

static SaveVMHandlers savevm_block_handlers = {
    .save_setup = block_save_setup,
    .save_live_iterate = block_save_iterate,
    .save_live_complete_precopy = block_save_complete,
    .save_live_pending = block_save_pending,
    .load_state = block_load,
    .save_cleanup = block_migration_cleanup,
    .is_active = block_is_active,
};

void blk_mig_init(void)
{
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);
    qemu_mutex_init(&block_mig_state.lock);

    register_savevm_live("block", 0, 1, &savevm_block_handlers,
                         &block_mig_state);
}