/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qemu-common.h"
#include "block/block.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "hw/hw.h"
#include "qemu/queue.h"
#include "qemu/timer.h"
#include "migration/block.h"
#include "migration/migration.h"
#include "sysemu/blockdev.h"
#include "sysemu/block-backend.h"

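/* Data is transferred in chunks of BLOCK_SIZE bytes (1 MiB); with
 * 512-byte sectors that is 2048 sectors per dirty chunk. */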
#define BLOCK_SIZE                       (1 << 20)
#define BDRV_SECTORS_PER_DIRTY_CHUNK     (BLOCK_SIZE >> BDRV_SECTOR_BITS)

#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02
#define BLK_MIG_FLAG_PROGRESS           0x04
#define BLK_MIG_FLAG_ZERO_BLOCK         0x08
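/* Each chunk in the stream begins with a 64-bit word: the start sector
 * shifted left by BDRV_SECTOR_BITS, with the flags above in the low bits
 * (see blk_send() and block_load()).  A DEVICE_BLOCK chunk is followed by
 * the device name (one length byte plus the name) and, unless ZERO_BLOCK
 * is also set, BLOCK_SIZE bytes of data; a PROGRESS chunk carries the
 * completion percentage in the sector field. */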

#define MAX_IS_ALLOCATED_SEARCH 65536

#define MAX_INFLIGHT_IO 512

//#define DEBUG_BLK_MIGRATION

#ifdef DEBUG_BLK_MIGRATION
#define DPRINTF(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

typedef struct BlkMigDevState {
    /* Written during setup phase.  Can be read without a lock.  */
    BlockDriverState *bs;
    int shared_base;
    int64_t total_sectors;
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;

    /* Only used by migration thread.  Does not need a lock.  */
    int bulk_completed;
    int64_t cur_sector;
    int64_t cur_dirty;

    /* Protected by block migration lock.  */
    unsigned long *aio_bitmap;
    int64_t completed_sectors;
    BdrvDirtyBitmap *dirty_bitmap;
    Error *blocker;
} BlkMigDevState;

typedef struct BlkMigBlock {
    /* Only used by migration thread.  */
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    int nr_sectors;
    struct iovec iov;
    QEMUIOVector qiov;
    BlockAIOCB *aiocb;

    /* Protected by block migration lock.  */
    int ret;
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
} BlkMigBlock;

typedef struct BlkMigState {
    /* Written during setup phase.  Can be read without a lock.  */
    int blk_enable;
    int shared_base;
    QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list;
    int64_t total_sector_sum;
    bool zero_blocks;

    /* Protected by lock.  */
    QSIMPLEQ_HEAD(blk_list, BlkMigBlock) blk_list;
    int submitted;
    int read_done;

    /* Only used by migration thread.  Does not need a lock.  */
    int transferred;
    int prev_progress;
    int bulk_completed;

    /* Lock must be taken _inside_ the iothread lock.  */
    QemuMutex lock;
} BlkMigState;

static BlkMigState block_mig_state;

static void blk_mig_lock(void)
{
    qemu_mutex_lock(&block_mig_state.lock);
}

static void blk_mig_unlock(void)
{
    qemu_mutex_unlock(&block_mig_state.lock);
}

/* Must run outside of the iothread lock during the bulk phase,
 * or the VM will stall.
 */

static void blk_send(QEMUFile *f, BlkMigBlock *blk)
{
    int len;
    uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK;

    if (block_mig_state.zero_blocks &&
        buffer_is_zero(blk->buf, BLOCK_SIZE)) {
        flags |= BLK_MIG_FLAG_ZERO_BLOCK;
    }

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | flags);

    /* device name */
    len = strlen(bdrv_get_device_name(blk->bmds->bs));
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)bdrv_get_device_name(blk->bmds->bs), len);

    /* If a block is all zeroes we need to flush here, since the network
     * bandwidth is now a lot higher than the storage device bandwidth.
     * Thus, queueing zero blocks without flushing would slow down the
     * migration. */
    if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
        qemu_fflush(f);
        return;
    }

    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}

int blk_mig_active(void)
{
    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
}

uint64_t blk_mig_bytes_transferred(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    blk_mig_lock();
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->completed_sectors;
    }
    blk_mig_unlock();
    return sum << BDRV_SECTOR_BITS;
}

uint64_t blk_mig_bytes_remaining(void)
{
    return blk_mig_bytes_total() - blk_mig_bytes_transferred();
}

uint64_t blk_mig_bytes_total(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->total_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}


/* Called with migration lock held.  */

static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (sector < bdrv_nb_sectors(bmds->bs)) {
        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}

/* Called with migration lock held.  */

static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
                             int nb_sectors, int set)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bmds->aio_bitmap[idx];
        if (set) {
            val |= 1UL << bit;
        } else {
            val &= ~(1UL << bit);
        }
        bmds->aio_bitmap[idx] = val;
    }
}

static void alloc_aio_bitmap(BlkMigDevState *bmds)
{
    BlockDriverState *bs = bmds->bs;
    int64_t bitmap_size;

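    /* One bit per BDRV_SECTORS_PER_DIRTY_CHUNK sectors, rounded up to a
     * whole number of bytes. */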
    bitmap_size = bdrv_nb_sectors(bs) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

    bmds->aio_bitmap = g_malloc0(bitmap_size);
}

/* Never hold migration lock when yielding to the main loop!  */

static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk_mig_lock();
    blk->ret = ret;

    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
    blk_mig_unlock();
}

/* Called with no lock taken.  */

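/* Read the next bulk chunk of @bmds and queue it for sending; returns 1
 * once the whole device has been submitted, 0 otherwise. */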
static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockDriverState *bs = bmds->bs;
    BlkMigBlock *blk;
    int nr_sectors;

    if (bmds->shared_base) {
        qemu_mutex_lock_iothread();
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH,
                                  &nr_sectors)) {
            cur_sector += nr_sectors;
        }
        qemu_mutex_unlock_iothread();
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = g_new(BlkMigBlock, 1);
    blk->buf = g_malloc(BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;
    blk->nr_sectors = nr_sectors;

    blk->iov.iov_base = blk->buf;
    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

    blk_mig_lock();
    block_mig_state.submitted++;
    blk_mig_unlock();

    qemu_mutex_lock_iothread();
    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                nr_sectors, blk_mig_read_cb, blk);

    bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, cur_sector, nr_sectors);
    qemu_mutex_unlock_iothread();

    bmds->cur_sector = cur_sector + nr_sectors;
    return (bmds->cur_sector >= total_sectors);
}

/* Called with iothread lock taken.  */

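/* Create one dirty bitmap per device, with BLOCK_SIZE granularity, so that
 * guest writes issued while the migration runs are tracked per chunk. */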
static int set_dirty_tracking(void)
{
    BlkMigDevState *bmds;
    int ret;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->dirty_bitmap = bdrv_create_dirty_bitmap(bmds->bs, BLOCK_SIZE,
                                                      NULL, NULL);
        if (!bmds->dirty_bitmap) {
            ret = -errno;
            goto fail;
        }
    }
    return 0;

fail:
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->dirty_bitmap) {
            bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap);
        }
    }
    return ret;
}

static void unset_dirty_tracking(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap);
    }
}

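/* Enumerate all writable block devices, set up their per-device migration
 * state, and block concurrent operations on them for the duration of the
 * migration. */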
static void init_blk_migration(QEMUFile *f)
{
    BlockDriverState *bs;
    BlkMigDevState *bmds;
    int64_t sectors;

    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.prev_progress = -1;
    block_mig_state.bulk_completed = 0;
    block_mig_state.zero_blocks = migrate_zero_blocks();

    for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) {
        if (bdrv_is_read_only(bs)) {
            continue;
        }

        sectors = bdrv_nb_sectors(bs);
        if (sectors <= 0) {
            return;
        }

        bmds = g_new0(BlkMigDevState, 1);
        bmds->bs = bs;
        bmds->bulk_completed = 0;
        bmds->total_sectors = sectors;
        bmds->completed_sectors = 0;
        bmds->shared_base = block_mig_state.shared_base;
        alloc_aio_bitmap(bmds);
        error_setg(&bmds->blocker, "block device is in use by migration");
        bdrv_op_block_all(bs, bmds->blocker);
        bdrv_ref(bs);

        block_mig_state.total_sector_sum += sectors;

        if (bmds->shared_base) {
            DPRINTF("Start migration for %s with shared base image\n",
                    bdrv_get_device_name(bs));
        } else {
            DPRINTF("Start full migration for %s\n", bdrv_get_device_name(bs));
        }

        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
    }
}

/* Called with no lock taken.  */

static int blk_mig_save_bulked_block(QEMUFile *f)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int progress;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(f, bmds) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        } else {
            completed_sector_sum += bmds->completed_sectors;
        }
    }

    if (block_mig_state.total_sector_sum != 0) {
        progress = completed_sector_sum * 100 /
                   block_mig_state.total_sector_sum;
    } else {
        progress = 100;
    }
    if (progress != block_mig_state.prev_progress) {
        block_mig_state.prev_progress = progress;
        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
                         | BLK_MIG_FLAG_PROGRESS);
        DPRINTF("Completed %d %%\r", progress);
    }

    return ret;
}

static void blk_mig_reset_dirty_cursor(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->cur_dirty = 0;
    }
}

/* Called with iothread lock taken.  */

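/* Scan forward from bmds->cur_dirty for the first dirty chunk and send it:
 * asynchronously while iterating (is_async != 0), synchronously during the
 * final pass.  Returns 1 when the whole device has been scanned, 0 if a
 * chunk was transferred, or a negative value on a read error. */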
static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
                                 int is_async)
{
    BlkMigBlock *blk;
    int64_t total_sectors = bmds->total_sectors;
    int64_t sector;
    int nr_sectors;
    int ret = -EIO;

    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
        blk_mig_lock();
        if (bmds_aio_inflight(bmds, sector)) {
            blk_mig_unlock();
            bdrv_drain(bmds->bs);
        } else {
            blk_mig_unlock();
        }
        if (bdrv_get_dirty(bmds->bs, bmds->dirty_bitmap, sector)) {

            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - sector;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }
            blk = g_new(BlkMigBlock, 1);
            blk->buf = g_malloc(BLOCK_SIZE);
            blk->bmds = bmds;
            blk->sector = sector;
            blk->nr_sectors = nr_sectors;

            if (is_async) {
                blk->iov.iov_base = blk->buf;
                blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
                qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

                blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
                                            nr_sectors, blk_mig_read_cb, blk);

                blk_mig_lock();
                block_mig_state.submitted++;
                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
                blk_mig_unlock();
            } else {
                ret = bdrv_read(bmds->bs, sector, blk->buf, nr_sectors);
                if (ret < 0) {
                    goto error;
                }
                blk_send(f, blk);

                g_free(blk->buf);
                g_free(blk);
            }

            bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, sector, nr_sectors);
            break;
        }
        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        bmds->cur_dirty = sector;
    }

    return (bmds->cur_dirty >= bmds->total_sectors);

error:
    DPRINTF("Error reading sector %" PRId64 "\n", sector);
    g_free(blk->buf);
    g_free(blk);
    return ret;
}

/* Called with iothread lock taken.
 *
 * return value:
 * 0: too much data for max_downtime
 * 1: little enough data for max_downtime
 * < 0: error
 */
static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;
    int ret = 1;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        ret = mig_save_device_dirty(f, bmds, is_async);
        if (ret <= 0) {
            break;
        }
    }

    return ret;
}

/* Called with no locks taken.  */

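/* Move completed reads from blk_list into the migration stream, stopping
 * early if the rate limit is reached or a read has failed. */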
static int flush_blks(QEMUFile *f)
{
    BlkMigBlock *blk;
    int ret = 0;

    DPRINTF("%s Enter submitted %d read_done %d transferred %d\n",
            __func__, block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        if (qemu_file_rate_limit(f)) {
            break;
        }
        if (blk->ret < 0) {
            ret = blk->ret;
            break;
        }

        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        blk_mig_unlock();
        blk_send(f, blk);
        blk_mig_lock();

        g_free(blk->buf);
        g_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }
    blk_mig_unlock();

    DPRINTF("%s Exit submitted %d read_done %d transferred %d\n", __func__,
            block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);
    return ret;
}

/* Called with iothread lock taken.  */

static int64_t get_remaining_dirty(void)
{
    BlkMigDevState *bmds;
    int64_t dirty = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        dirty += bdrv_get_dirty_count(bmds->dirty_bitmap);
    }

    return dirty << BDRV_SECTOR_BITS;
}

/* Called with iothread lock taken.  */

static void block_migration_cleanup(void *opaque)
{
    BlkMigDevState *bmds;
    BlkMigBlock *blk;

    bdrv_drain_all();

    unset_dirty_tracking();

    blk_mig_lock();
    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
        bdrv_op_unblock_all(bmds->bs, bmds->blocker);
        error_free(bmds->blocker);
        bdrv_unref(bmds->bs);
        g_free(bmds->aio_bitmap);
        g_free(bmds);
    }

    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        g_free(blk->buf);
        g_free(blk);
    }
    blk_mig_unlock();
}

static int block_save_setup(QEMUFile *f, void *opaque)
{
    int ret;

    DPRINTF("Enter save live setup submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    qemu_mutex_lock_iothread();
    init_blk_migration(f);

    /* start tracking dirty blocks */
    ret = set_dirty_tracking();

    if (ret) {
        qemu_mutex_unlock_iothread();
        return ret;
    }

    qemu_mutex_unlock_iothread();

    ret = flush_blks(f);
    blk_mig_reset_dirty_cursor();
    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ret;
}

static int block_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    int64_t last_ftell = qemu_ftell(f);
    int64_t delta_ftell;

    DPRINTF("Enter save live iterate submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* control the rate of transfer */
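    /* Keep the amount of data buffered for sending below the bandwidth
     * limit, and cap the number of buffered chunks at MAX_INFLIGHT_IO. */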
    blk_mig_lock();
    while ((block_mig_state.submitted +
            block_mig_state.read_done) * BLOCK_SIZE <
           qemu_file_get_rate_limit(f) &&
           (block_mig_state.submitted +
            block_mig_state.read_done) <
           MAX_INFLIGHT_IO) {
        blk_mig_unlock();
        if (block_mig_state.bulk_completed == 0) {
            /* first finish the bulk phase */
            if (blk_mig_save_bulked_block(f) == 0) {
                /* finished saving bulk on all devices */
                block_mig_state.bulk_completed = 1;
            }
            ret = 0;
        } else {
            /* Always called with iothread lock taken for
             * simplicity, block_save_complete also calls it.
             */
            qemu_mutex_lock_iothread();
            ret = blk_mig_save_dirty_block(f, 1);
            qemu_mutex_unlock_iothread();
        }
        if (ret < 0) {
            return ret;
        }
        blk_mig_lock();
        if (ret != 0) {
            /* no more dirty blocks */
            break;
        }
    }
    blk_mig_unlock();

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);
    delta_ftell = qemu_ftell(f) - last_ftell;
    if (delta_ftell > 0) {
        return 1;
    } else if (delta_ftell < 0) {
        return -1;
    } else {
        return 0;
    }
}

/* Called with iothread lock taken.  */

static int block_save_complete(QEMUFile *f, void *opaque)
{
    int ret;

    DPRINTF("Enter save live complete submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* we know for sure that the bulk save has completed and
       that all asynchronous reads have completed */
    blk_mig_lock();
    assert(block_mig_state.submitted == 0);
    blk_mig_unlock();

    do {
        ret = blk_mig_save_dirty_block(f, 0);
        if (ret < 0) {
            return ret;
        }
    } while (ret == 0);

    /* report completion */
    qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

    DPRINTF("Block migration completed\n");

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return 0;
}

static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                               uint64_t *non_postcopiable_pending,
                               uint64_t *postcopiable_pending)
{
    /* Estimate pending number of bytes to send */
    uint64_t pending;

    qemu_mutex_lock_iothread();
    blk_mig_lock();
    pending = get_remaining_dirty() +
                       block_mig_state.submitted * BLOCK_SIZE +
                       block_mig_state.read_done * BLOCK_SIZE;

    /* Report at least one block pending during the bulk phase, so the
     * migration cannot complete before the bulk transfer has finished */
    if (pending <= max_size && !block_mig_state.bulk_completed) {
        pending = max_size + BLOCK_SIZE;
    }
    blk_mig_unlock();
    qemu_mutex_unlock_iothread();

    DPRINTF("Enter save live pending  %" PRIu64 "\n", pending);
    /* We don't do postcopy */
    *non_postcopiable_pending += pending;
}

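/* Incoming side: replay the stream produced by the save functions above.
 * Each chunk starts with a 64-bit word combining the target sector and the
 * BLK_MIG_FLAG_* bits; the loop runs until an EOS chunk is seen. */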
static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    static int banner_printed;
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockDriverState *bs, *bs_prev = NULL;
    BlockBackend *blk;
    uint8_t *buf;
    int64_t total_sectors = 0;
    int nr_sectors;
    int ret;

    do {
        addr = qemu_get_be64(f);

        flags = addr & ~BDRV_SECTOR_MASK;
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            blk = blk_by_name(device_name);
            if (!blk) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }
            bs = blk_bs(blk);
            if (!bs) {
                fprintf(stderr, "Block device %s has no medium\n",
                        device_name);
                return -EINVAL;
            }

            if (bs != bs_prev) {
                bs_prev = bs;
                total_sectors = bdrv_nb_sectors(bs);
                if (total_sectors <= 0) {
                    error_report("Error getting length of block device %s",
                                 device_name);
                    return -EINVAL;
                }
            }

            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - addr;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }

            if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
                ret = bdrv_write_zeroes(bs, addr, nr_sectors,
                                        BDRV_REQ_MAY_UNMAP);
            } else {
                buf = g_malloc(BLOCK_SIZE);
                qemu_get_buffer(f, buf, BLOCK_SIZE);
                ret = bdrv_write(bs, addr, buf, nr_sectors);
                g_free(buf);
            }

            if (ret < 0) {
                return ret;
            }
        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
            if (!banner_printed) {
                printf("Receiving block device images\n");
                banner_printed = 1;
            }
            printf("Completed %d %%%c", (int)addr,
                   (addr == 100) ? '\n' : '\r');
            fflush(stdout);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown block migration flags: %#x\n", flags);
            return -EINVAL;
        }
        ret = qemu_file_get_error(f);
        if (ret != 0) {
            return ret;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

static void block_set_params(const MigrationParams *params, void *opaque)
{
    block_mig_state.blk_enable = params->blk;
    block_mig_state.shared_base = params->shared;

    /* shared base means that blk_enable = 1 */
    block_mig_state.blk_enable |= params->shared;
}

static bool block_is_active(void *opaque)
{
    return block_mig_state.blk_enable == 1;
}

static SaveVMHandlers savevm_block_handlers = {
    .set_params = block_set_params,
    .save_live_setup = block_save_setup,
    .save_live_iterate = block_save_iterate,
    .save_live_complete_precopy = block_save_complete,
    .save_live_pending = block_save_pending,
    .load_state = block_load,
    .cleanup = block_migration_cleanup,
    .is_active = block_is_active,
};

void blk_mig_init(void)
{
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);
    qemu_mutex_init(&block_mig_state.lock);

    register_savevm_live(NULL, "block", 0, 1, &savevm_block_handlers,
                         &block_mig_state);
903