xref: /qemu/migration/migration.c (revision 814bb12a)
1 /*
2  * QEMU live migration
3  *
4  * Copyright IBM, Corp. 2008
5  *
6  * Authors:
7  *  Anthony Liguori   <aliguori@us.ibm.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  * Contributions after 2012-01-13 are licensed under the terms of the
13  * GNU GPL, version 2 or (at your option) any later version.
14  */
15 
16 #include "qemu/osdep.h"
17 #include "qemu/cutils.h"
18 #include "qemu/error-report.h"
19 #include "qemu/main-loop.h"
20 #include "migration/migration.h"
21 #include "migration/qemu-file.h"
22 #include "sysemu/sysemu.h"
23 #include "block/block.h"
24 #include "qapi/qmp/qerror.h"
25 #include "qapi/util.h"
26 #include "qemu/sockets.h"
27 #include "qemu/rcu.h"
28 #include "migration/block.h"
29 #include "migration/postcopy-ram.h"
30 #include "qemu/thread.h"
31 #include "qmp-commands.h"
32 #include "trace.h"
33 #include "qapi-event.h"
34 #include "qom/cpu.h"
35 #include "exec/memory.h"
36 #include "exec/address-spaces.h"
37 #include "io/channel-buffer.h"
38 #include "io/channel-tls.h"
39 
/* Default migration transfer speed throttle (used as the initial
 * max_bandwidth parameter). */
#define MAX_THROTTLE  (32 << 20)

/* Amount of time to allocate to each "chunk" of bandwidth-throttled
 * data. */
#define BUFFER_DELAY     100
/* Divisor applied to max_bandwidth to get the per-chunk rate limit. */
#define XFER_LIMIT_RATIO (1000 / BUFFER_DELAY)

/* Time in milliseconds we are allowed to stop the source
 * while sending the last part. */
#define DEFAULT_MIGRATE_SET_DOWNTIME 300

/* Default compression thread count. */
#define DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT 8
/* Default decompression thread count; decompression is usually at
 * least 4 times as fast as compression. */
#define DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT 2
/* Compression level: 0 means no compression, 1 is best speed, ...,
 * 9 is best compression ratio. */
#define DEFAULT_MIGRATE_COMPRESS_LEVEL 1
/* Default auto-converge CPU throttle migration parameters. */
#define DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL 20
#define DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT 10

/* Default XBZRLE cache size for migration. */
#define DEFAULT_MIGRATE_CACHE_SIZE (64 * 1024 * 1024)
64 
65 static NotifierList migration_state_notifiers =
66     NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);
67 
68 static bool deferred_incoming;
69 
70 /*
71  * Current state of incoming postcopy; note this is not part of
72  * MigrationIncomingState since it's state is used during cleanup
73  * at the end as MIS is being freed.
74  */
75 static PostcopyState incoming_postcopy_state;
76 
77 /* When we add fault tolerance, we could have several
78    migrations at once.  For now we don't need to add
79    dynamic creation of migration */
80 
81 /* For outgoing */
82 MigrationState *migrate_get_current(void)
83 {
84     static bool once;
85     static MigrationState current_migration = {
86         .state = MIGRATION_STATUS_NONE,
87         .xbzrle_cache_size = DEFAULT_MIGRATE_CACHE_SIZE,
88         .mbps = -1,
89         .parameters = {
90             .compress_level = DEFAULT_MIGRATE_COMPRESS_LEVEL,
91             .compress_threads = DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT,
92             .decompress_threads = DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT,
93             .cpu_throttle_initial = DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL,
94             .cpu_throttle_increment = DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT,
95             .max_bandwidth = MAX_THROTTLE,
96             .downtime_limit = DEFAULT_MIGRATE_SET_DOWNTIME,
97         },
98     };
99 
100     if (!once) {
101         qemu_mutex_init(&current_migration.src_page_req_mutex);
102         once = true;
103     }
104     return &current_migration;
105 }
106 
107 /* For incoming */
108 static MigrationIncomingState *mis_current;
109 
110 MigrationIncomingState *migration_incoming_get_current(void)
111 {
112     return mis_current;
113 }
114 
115 MigrationIncomingState *migration_incoming_state_new(QEMUFile* f)
116 {
117     mis_current = g_new0(MigrationIncomingState, 1);
118     mis_current->from_src_file = f;
119     mis_current->state = MIGRATION_STATUS_NONE;
120     QLIST_INIT(&mis_current->loadvm_handlers);
121     qemu_mutex_init(&mis_current->rp_mutex);
122     qemu_event_init(&mis_current->main_thread_load_event, false);
123 
124     return mis_current;
125 }
126 
127 void migration_incoming_state_destroy(void)
128 {
129     qemu_event_destroy(&mis_current->main_thread_load_event);
130     loadvm_free_handlers(mis_current);
131     g_free(mis_current);
132     mis_current = NULL;
133 }
134 
135 
136 typedef struct {
137     bool optional;
138     uint32_t size;
139     uint8_t runstate[100];
140     RunState state;
141     bool received;
142 } GlobalState;
143 
144 static GlobalState global_state;
145 
146 int global_state_store(void)
147 {
148     if (!runstate_store((char *)global_state.runstate,
149                         sizeof(global_state.runstate))) {
150         error_report("runstate name too big: %s", global_state.runstate);
151         trace_migrate_state_too_big();
152         return -EINVAL;
153     }
154     return 0;
155 }
156 
157 void global_state_store_running(void)
158 {
159     const char *state = RunState_lookup[RUN_STATE_RUNNING];
160     strncpy((char *)global_state.runstate,
161            state, sizeof(global_state.runstate));
162 }
163 
164 static bool global_state_received(void)
165 {
166     return global_state.received;
167 }
168 
169 static RunState global_state_get_runstate(void)
170 {
171     return global_state.state;
172 }
173 
174 void global_state_set_optional(void)
175 {
176     global_state.optional = true;
177 }
178 
179 static bool global_state_needed(void *opaque)
180 {
181     GlobalState *s = opaque;
182     char *runstate = (char *)s->runstate;
183 
184     /* If it is not optional, it is mandatory */
185 
186     if (s->optional == false) {
187         return true;
188     }
189 
190     /* If state is running or paused, it is not needed */
191 
192     if (strcmp(runstate, "running") == 0 ||
193         strcmp(runstate, "paused") == 0) {
194         return false;
195     }
196 
197     /* for any other state it is needed */
198     return true;
199 }
200 
201 static int global_state_post_load(void *opaque, int version_id)
202 {
203     GlobalState *s = opaque;
204     Error *local_err = NULL;
205     int r;
206     char *runstate = (char *)s->runstate;
207 
208     s->received = true;
209     trace_migrate_global_state_post_load(runstate);
210 
211     r = qapi_enum_parse(RunState_lookup, runstate, RUN_STATE__MAX,
212                                 -1, &local_err);
213 
214     if (r == -1) {
215         if (local_err) {
216             error_report_err(local_err);
217         }
218         return -EINVAL;
219     }
220     s->state = r;
221 
222     return 0;
223 }
224 
225 static void global_state_pre_save(void *opaque)
226 {
227     GlobalState *s = opaque;
228 
229     trace_migrate_global_state_pre_save((char *)s->runstate);
230     s->size = strlen((char *)s->runstate) + 1;
231 }
232 
233 static const VMStateDescription vmstate_globalstate = {
234     .name = "globalstate",
235     .version_id = 1,
236     .minimum_version_id = 1,
237     .post_load = global_state_post_load,
238     .pre_save = global_state_pre_save,
239     .needed = global_state_needed,
240     .fields = (VMStateField[]) {
241         VMSTATE_UINT32(size, GlobalState),
242         VMSTATE_BUFFER(runstate, GlobalState),
243         VMSTATE_END_OF_LIST()
244     },
245 };
246 
247 void register_global_state(void)
248 {
249     /* We would use it independently that we receive it */
250     strcpy((char *)&global_state.runstate, "");
251     global_state.received = false;
252     vmstate_register(NULL, 0, &vmstate_globalstate, &global_state);
253 }
254 
255 static void migrate_generate_event(int new_state)
256 {
257     if (migrate_use_events()) {
258         qapi_event_send_migration(new_state, &error_abort);
259     }
260 }
261 
262 /*
263  * Called on -incoming with a defer: uri.
264  * The migration can be started later after any parameters have been
265  * changed.
266  */
267 static void deferred_incoming_migration(Error **errp)
268 {
269     if (deferred_incoming) {
270         error_setg(errp, "Incoming migration already deferred");
271     }
272     deferred_incoming = true;
273 }
274 
275 /* Request a range of pages from the source VM at the given
276  * start address.
277  *   rbname: Name of the RAMBlock to request the page in, if NULL it's the same
278  *           as the last request (a name must have been given previously)
279  *   Start: Address offset within the RB
280  *   Len: Length in bytes required - must be a multiple of pagesize
281  */
282 void migrate_send_rp_req_pages(MigrationIncomingState *mis, const char *rbname,
283                                ram_addr_t start, size_t len)
284 {
285     uint8_t bufc[12 + 1 + 255]; /* start (8), len (4), rbname up to 256 */
286     size_t msglen = 12; /* start + len */
287 
288     *(uint64_t *)bufc = cpu_to_be64((uint64_t)start);
289     *(uint32_t *)(bufc + 8) = cpu_to_be32((uint32_t)len);
290 
291     if (rbname) {
292         int rbname_len = strlen(rbname);
293         assert(rbname_len < 256);
294 
295         bufc[msglen++] = rbname_len;
296         memcpy(bufc + msglen, rbname, rbname_len);
297         msglen += rbname_len;
298         migrate_send_rp_message(mis, MIG_RP_MSG_REQ_PAGES_ID, msglen, bufc);
299     } else {
300         migrate_send_rp_message(mis, MIG_RP_MSG_REQ_PAGES, msglen, bufc);
301     }
302 }
303 
304 void qemu_start_incoming_migration(const char *uri, Error **errp)
305 {
306     const char *p;
307 
308     qapi_event_send_migration(MIGRATION_STATUS_SETUP, &error_abort);
309     if (!strcmp(uri, "defer")) {
310         deferred_incoming_migration(errp);
311     } else if (strstart(uri, "tcp:", &p)) {
312         tcp_start_incoming_migration(p, errp);
313 #ifdef CONFIG_RDMA
314     } else if (strstart(uri, "rdma:", &p)) {
315         rdma_start_incoming_migration(p, errp);
316 #endif
317     } else if (strstart(uri, "exec:", &p)) {
318         exec_start_incoming_migration(p, errp);
319     } else if (strstart(uri, "unix:", &p)) {
320         unix_start_incoming_migration(p, errp);
321     } else if (strstart(uri, "fd:", &p)) {
322         fd_start_incoming_migration(p, errp);
323     } else {
324         error_setg(errp, "unknown migration protocol: %s", uri);
325     }
326 }
327 
328 static void process_incoming_migration_bh(void *opaque)
329 {
330     Error *local_err = NULL;
331     MigrationIncomingState *mis = opaque;
332 
333     /* Make sure all file formats flush their mutable metadata */
334     bdrv_invalidate_cache_all(&local_err);
335     if (local_err) {
336         migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
337                           MIGRATION_STATUS_FAILED);
338         error_report_err(local_err);
339         migrate_decompress_threads_join();
340         exit(EXIT_FAILURE);
341     }
342 
343     /*
344      * This must happen after all error conditions are dealt with and
345      * we're sure the VM is going to be running on this host.
346      */
347     qemu_announce_self();
348 
349     /* If global state section was not received or we are in running
350        state, we need to obey autostart. Any other state is set with
351        runstate_set. */
352 
353     if (!global_state_received() ||
354         global_state_get_runstate() == RUN_STATE_RUNNING) {
355         if (autostart) {
356             vm_start();
357         } else {
358             runstate_set(RUN_STATE_PAUSED);
359         }
360     } else {
361         runstate_set(global_state_get_runstate());
362     }
363     migrate_decompress_threads_join();
364     /*
365      * This must happen after any state changes since as soon as an external
366      * observer sees this event they might start to prod at the VM assuming
367      * it's ready to use.
368      */
369     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
370                       MIGRATION_STATUS_COMPLETED);
371     qemu_bh_delete(mis->bh);
372     migration_incoming_state_destroy();
373 }
374 
375 static void process_incoming_migration_co(void *opaque)
376 {
377     QEMUFile *f = opaque;
378     MigrationIncomingState *mis;
379     PostcopyState ps;
380     int ret;
381 
382     mis = migration_incoming_state_new(f);
383     postcopy_state_set(POSTCOPY_INCOMING_NONE);
384     migrate_set_state(&mis->state, MIGRATION_STATUS_NONE,
385                       MIGRATION_STATUS_ACTIVE);
386     ret = qemu_loadvm_state(f);
387 
388     ps = postcopy_state_get();
389     trace_process_incoming_migration_co_end(ret, ps);
390     if (ps != POSTCOPY_INCOMING_NONE) {
391         if (ps == POSTCOPY_INCOMING_ADVISE) {
392             /*
393              * Where a migration had postcopy enabled (and thus went to advise)
394              * but managed to complete within the precopy period, we can use
395              * the normal exit.
396              */
397             postcopy_ram_incoming_cleanup(mis);
398         } else if (ret >= 0) {
399             /*
400              * Postcopy was started, cleanup should happen at the end of the
401              * postcopy thread.
402              */
403             trace_process_incoming_migration_co_postcopy_end_main();
404             return;
405         }
406         /* Else if something went wrong then just fall out of the normal exit */
407     }
408 
409     qemu_fclose(f);
410     free_xbzrle_decoded_buf();
411 
412     if (ret < 0) {
413         migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
414                           MIGRATION_STATUS_FAILED);
415         error_report("load of migration failed: %s", strerror(-ret));
416         migrate_decompress_threads_join();
417         exit(EXIT_FAILURE);
418     }
419 
420     mis->bh = qemu_bh_new(process_incoming_migration_bh, mis);
421     qemu_bh_schedule(mis->bh);
422 }
423 
424 void migration_fd_process_incoming(QEMUFile *f)
425 {
426     Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, f);
427 
428     migrate_decompress_threads_create();
429     qemu_file_set_blocking(f, false);
430     qemu_coroutine_enter(co);
431 }
432 
433 
434 void migration_channel_process_incoming(MigrationState *s,
435                                         QIOChannel *ioc)
436 {
437     trace_migration_set_incoming_channel(
438         ioc, object_get_typename(OBJECT(ioc)));
439 
440     if (s->parameters.tls_creds &&
441         !object_dynamic_cast(OBJECT(ioc),
442                              TYPE_QIO_CHANNEL_TLS)) {
443         Error *local_err = NULL;
444         migration_tls_channel_process_incoming(s, ioc, &local_err);
445         if (local_err) {
446             error_report_err(local_err);
447         }
448     } else {
449         QEMUFile *f = qemu_fopen_channel_input(ioc);
450         migration_fd_process_incoming(f);
451     }
452 }
453 
454 
455 void migration_channel_connect(MigrationState *s,
456                                QIOChannel *ioc,
457                                const char *hostname)
458 {
459     trace_migration_set_outgoing_channel(
460         ioc, object_get_typename(OBJECT(ioc)), hostname);
461 
462     if (s->parameters.tls_creds &&
463         !object_dynamic_cast(OBJECT(ioc),
464                              TYPE_QIO_CHANNEL_TLS)) {
465         Error *local_err = NULL;
466         migration_tls_channel_connect(s, ioc, hostname, &local_err);
467         if (local_err) {
468             migrate_fd_error(s, local_err);
469             error_free(local_err);
470         }
471     } else {
472         QEMUFile *f = qemu_fopen_channel_output(ioc);
473 
474         s->to_dst_file = f;
475 
476         migrate_fd_connect(s);
477     }
478 }
479 
480 
481 /*
482  * Send a message on the return channel back to the source
483  * of the migration.
484  */
485 void migrate_send_rp_message(MigrationIncomingState *mis,
486                              enum mig_rp_message_type message_type,
487                              uint16_t len, void *data)
488 {
489     trace_migrate_send_rp_message((int)message_type, len);
490     qemu_mutex_lock(&mis->rp_mutex);
491     qemu_put_be16(mis->to_src_file, (unsigned int)message_type);
492     qemu_put_be16(mis->to_src_file, len);
493     qemu_put_buffer(mis->to_src_file, data, len);
494     qemu_fflush(mis->to_src_file);
495     qemu_mutex_unlock(&mis->rp_mutex);
496 }
497 
498 /*
499  * Send a 'SHUT' message on the return channel with the given value
500  * to indicate that we've finished with the RP.  Non-0 value indicates
501  * error.
502  */
503 void migrate_send_rp_shut(MigrationIncomingState *mis,
504                           uint32_t value)
505 {
506     uint32_t buf;
507 
508     buf = cpu_to_be32(value);
509     migrate_send_rp_message(mis, MIG_RP_MSG_SHUT, sizeof(buf), &buf);
510 }
511 
512 /*
513  * Send a 'PONG' message on the return channel with the given value
514  * (normally in response to a 'PING')
515  */
516 void migrate_send_rp_pong(MigrationIncomingState *mis,
517                           uint32_t value)
518 {
519     uint32_t buf;
520 
521     buf = cpu_to_be32(value);
522     migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf);
523 }
524 
525 MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp)
526 {
527     MigrationCapabilityStatusList *head = NULL;
528     MigrationCapabilityStatusList *caps;
529     MigrationState *s = migrate_get_current();
530     int i;
531 
532     caps = NULL; /* silence compiler warning */
533     for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
534         if (head == NULL) {
535             head = g_malloc0(sizeof(*caps));
536             caps = head;
537         } else {
538             caps->next = g_malloc0(sizeof(*caps));
539             caps = caps->next;
540         }
541         caps->value =
542             g_malloc(sizeof(*caps->value));
543         caps->value->capability = i;
544         caps->value->state = s->enabled_capabilities[i];
545     }
546 
547     return head;
548 }
549 
550 MigrationParameters *qmp_query_migrate_parameters(Error **errp)
551 {
552     MigrationParameters *params;
553     MigrationState *s = migrate_get_current();
554 
555     params = g_malloc0(sizeof(*params));
556     params->has_compress_level = true;
557     params->compress_level = s->parameters.compress_level;
558     params->has_compress_threads = true;
559     params->compress_threads = s->parameters.compress_threads;
560     params->has_decompress_threads = true;
561     params->decompress_threads = s->parameters.decompress_threads;
562     params->has_cpu_throttle_initial = true;
563     params->cpu_throttle_initial = s->parameters.cpu_throttle_initial;
564     params->has_cpu_throttle_increment = true;
565     params->cpu_throttle_increment = s->parameters.cpu_throttle_increment;
566     params->has_tls_creds = !!s->parameters.tls_creds;
567     params->tls_creds = g_strdup(s->parameters.tls_creds);
568     params->has_tls_hostname = !!s->parameters.tls_hostname;
569     params->tls_hostname = g_strdup(s->parameters.tls_hostname);
570     params->has_max_bandwidth = true;
571     params->max_bandwidth = s->parameters.max_bandwidth;
572     params->has_downtime_limit = true;
573     params->downtime_limit = s->parameters.downtime_limit;
574 
575     return params;
576 }
577 
578 /*
579  * Return true if we're already in the middle of a migration
580  * (i.e. any of the active or setup states)
581  */
582 static bool migration_is_setup_or_active(int state)
583 {
584     switch (state) {
585     case MIGRATION_STATUS_ACTIVE:
586     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
587     case MIGRATION_STATUS_SETUP:
588         return true;
589 
590     default:
591         return false;
592 
593     }
594 }
595 
596 static void get_xbzrle_cache_stats(MigrationInfo *info)
597 {
598     if (migrate_use_xbzrle()) {
599         info->has_xbzrle_cache = true;
600         info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache));
601         info->xbzrle_cache->cache_size = migrate_xbzrle_cache_size();
602         info->xbzrle_cache->bytes = xbzrle_mig_bytes_transferred();
603         info->xbzrle_cache->pages = xbzrle_mig_pages_transferred();
604         info->xbzrle_cache->cache_miss = xbzrle_mig_pages_cache_miss();
605         info->xbzrle_cache->cache_miss_rate = xbzrle_mig_cache_miss_rate();
606         info->xbzrle_cache->overflow = xbzrle_mig_pages_overflow();
607     }
608 }
609 
610 static void populate_ram_info(MigrationInfo *info, MigrationState *s)
611 {
612     info->has_ram = true;
613     info->ram = g_malloc0(sizeof(*info->ram));
614     info->ram->transferred = ram_bytes_transferred();
615     info->ram->total = ram_bytes_total();
616     info->ram->duplicate = dup_mig_pages_transferred();
617     info->ram->skipped = skipped_mig_pages_transferred();
618     info->ram->normal = norm_mig_pages_transferred();
619     info->ram->normal_bytes = norm_mig_bytes_transferred();
620     info->ram->mbps = s->mbps;
621     info->ram->dirty_sync_count = s->dirty_sync_count;
622     info->ram->postcopy_requests = s->postcopy_requests;
623 
624     if (s->state != MIGRATION_STATUS_COMPLETED) {
625         info->ram->remaining = ram_bytes_remaining();
626         info->ram->dirty_pages_rate = s->dirty_pages_rate;
627     }
628 }
629 
630 MigrationInfo *qmp_query_migrate(Error **errp)
631 {
632     MigrationInfo *info = g_malloc0(sizeof(*info));
633     MigrationState *s = migrate_get_current();
634 
635     switch (s->state) {
636     case MIGRATION_STATUS_NONE:
637         /* no migration has happened ever */
638         break;
639     case MIGRATION_STATUS_SETUP:
640         info->has_status = true;
641         info->has_total_time = false;
642         break;
643     case MIGRATION_STATUS_ACTIVE:
644     case MIGRATION_STATUS_CANCELLING:
645         info->has_status = true;
646         info->has_total_time = true;
647         info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME)
648             - s->total_time;
649         info->has_expected_downtime = true;
650         info->expected_downtime = s->expected_downtime;
651         info->has_setup_time = true;
652         info->setup_time = s->setup_time;
653 
654         populate_ram_info(info, s);
655 
656         if (blk_mig_active()) {
657             info->has_disk = true;
658             info->disk = g_malloc0(sizeof(*info->disk));
659             info->disk->transferred = blk_mig_bytes_transferred();
660             info->disk->remaining = blk_mig_bytes_remaining();
661             info->disk->total = blk_mig_bytes_total();
662         }
663 
664         if (cpu_throttle_active()) {
665             info->has_cpu_throttle_percentage = true;
666             info->cpu_throttle_percentage = cpu_throttle_get_percentage();
667         }
668 
669         get_xbzrle_cache_stats(info);
670         break;
671     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
672         /* Mostly the same as active; TODO add some postcopy stats */
673         info->has_status = true;
674         info->has_total_time = true;
675         info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME)
676             - s->total_time;
677         info->has_expected_downtime = true;
678         info->expected_downtime = s->expected_downtime;
679         info->has_setup_time = true;
680         info->setup_time = s->setup_time;
681 
682         populate_ram_info(info, s);
683 
684         if (blk_mig_active()) {
685             info->has_disk = true;
686             info->disk = g_malloc0(sizeof(*info->disk));
687             info->disk->transferred = blk_mig_bytes_transferred();
688             info->disk->remaining = blk_mig_bytes_remaining();
689             info->disk->total = blk_mig_bytes_total();
690         }
691 
692         get_xbzrle_cache_stats(info);
693         break;
694     case MIGRATION_STATUS_COMPLETED:
695         get_xbzrle_cache_stats(info);
696 
697         info->has_status = true;
698         info->has_total_time = true;
699         info->total_time = s->total_time;
700         info->has_downtime = true;
701         info->downtime = s->downtime;
702         info->has_setup_time = true;
703         info->setup_time = s->setup_time;
704 
705         populate_ram_info(info, s);
706         break;
707     case MIGRATION_STATUS_FAILED:
708         info->has_status = true;
709         if (s->error) {
710             info->has_error_desc = true;
711             info->error_desc = g_strdup(error_get_pretty(s->error));
712         }
713         break;
714     case MIGRATION_STATUS_CANCELLED:
715         info->has_status = true;
716         break;
717     }
718     info->status = s->state;
719 
720     return info;
721 }
722 
723 void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
724                                   Error **errp)
725 {
726     MigrationState *s = migrate_get_current();
727     MigrationCapabilityStatusList *cap;
728     bool old_postcopy_cap = migrate_postcopy_ram();
729 
730     if (migration_is_setup_or_active(s->state)) {
731         error_setg(errp, QERR_MIGRATION_ACTIVE);
732         return;
733     }
734 
735     for (cap = params; cap; cap = cap->next) {
736         s->enabled_capabilities[cap->value->capability] = cap->value->state;
737     }
738 
739     if (migrate_postcopy_ram()) {
740         if (migrate_use_compression()) {
741             /* The decompression threads asynchronously write into RAM
742              * rather than use the atomic copies needed to avoid
743              * userfaulting.  It should be possible to fix the decompression
744              * threads for compatibility in future.
745              */
746             error_report("Postcopy is not currently compatible with "
747                          "compression");
748             s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM] =
749                 false;
750         }
751         /* This check is reasonably expensive, so only when it's being
752          * set the first time, also it's only the destination that needs
753          * special support.
754          */
755         if (!old_postcopy_cap && runstate_check(RUN_STATE_INMIGRATE) &&
756             !postcopy_ram_supported_by_host()) {
757             /* postcopy_ram_supported_by_host will have emitted a more
758              * detailed message
759              */
760             error_report("Postcopy is not supported");
761             s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM] =
762                 false;
763         }
764     }
765 }
766 
767 void qmp_migrate_set_parameters(MigrationParameters *params, Error **errp)
768 {
769     MigrationState *s = migrate_get_current();
770 
771     if (params->has_compress_level &&
772         (params->compress_level < 0 || params->compress_level > 9)) {
773         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level",
774                    "is invalid, it should be in the range of 0 to 9");
775         return;
776     }
777     if (params->has_compress_threads &&
778         (params->compress_threads < 1 || params->compress_threads > 255)) {
779         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
780                    "compress_threads",
781                    "is invalid, it should be in the range of 1 to 255");
782         return;
783     }
784     if (params->has_decompress_threads &&
785         (params->decompress_threads < 1 || params->decompress_threads > 255)) {
786         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
787                    "decompress_threads",
788                    "is invalid, it should be in the range of 1 to 255");
789         return;
790     }
791     if (params->has_cpu_throttle_initial &&
792         (params->cpu_throttle_initial < 1 ||
793          params->cpu_throttle_initial > 99)) {
794         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
795                    "cpu_throttle_initial",
796                    "an integer in the range of 1 to 99");
797         return;
798     }
799     if (params->has_cpu_throttle_increment &&
800         (params->cpu_throttle_increment < 1 ||
801          params->cpu_throttle_increment > 99)) {
802         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
803                    "cpu_throttle_increment",
804                    "an integer in the range of 1 to 99");
805         return;
806     }
807     if (params->has_max_bandwidth &&
808         (params->max_bandwidth < 0 || params->max_bandwidth > SIZE_MAX)) {
809         error_setg(errp, "Parameter 'max_bandwidth' expects an integer in the"
810                          " range of 0 to %zu bytes/second", SIZE_MAX);
811         return;
812     }
813     if (params->has_downtime_limit &&
814         (params->downtime_limit < 0 || params->downtime_limit > 2000000)) {
815         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
816                    "downtime_limit",
817                    "an integer in the range of 0 to 2000000 milliseconds");
818         return;
819     }
820 
821     if (params->has_compress_level) {
822         s->parameters.compress_level = params->compress_level;
823     }
824     if (params->has_compress_threads) {
825         s->parameters.compress_threads = params->compress_threads;
826     }
827     if (params->has_decompress_threads) {
828         s->parameters.decompress_threads = params->decompress_threads;
829     }
830     if (params->has_cpu_throttle_initial) {
831         s->parameters.cpu_throttle_initial = params->cpu_throttle_initial;
832     }
833     if (params->has_cpu_throttle_increment) {
834         s->parameters.cpu_throttle_increment = params->cpu_throttle_increment;
835     }
836     if (params->has_tls_creds) {
837         g_free(s->parameters.tls_creds);
838         s->parameters.tls_creds = g_strdup(params->tls_creds);
839     }
840     if (params->has_tls_hostname) {
841         g_free(s->parameters.tls_hostname);
842         s->parameters.tls_hostname = g_strdup(params->tls_hostname);
843     }
844     if (params->has_max_bandwidth) {
845         s->parameters.max_bandwidth = params->max_bandwidth;
846         if (s->to_dst_file) {
847             qemu_file_set_rate_limit(s->to_dst_file,
848                                 s->parameters.max_bandwidth / XFER_LIMIT_RATIO);
849         }
850     }
851     if (params->has_downtime_limit) {
852         s->parameters.downtime_limit = params->downtime_limit;
853     }
854 }
855 
856 
857 void qmp_migrate_start_postcopy(Error **errp)
858 {
859     MigrationState *s = migrate_get_current();
860 
861     if (!migrate_postcopy_ram()) {
862         error_setg(errp, "Enable postcopy with migrate_set_capability before"
863                          " the start of migration");
864         return;
865     }
866 
867     if (s->state == MIGRATION_STATUS_NONE) {
868         error_setg(errp, "Postcopy must be started after migration has been"
869                          " started");
870         return;
871     }
872     /*
873      * we don't error if migration has finished since that would be racy
874      * with issuing this command.
875      */
876     atomic_set(&s->start_postcopy, true);
877 }
878 
/* shared migration helpers */

/*
 * Atomically change *@state from @old_state to @new_state.
 *
 * The trace point and migration event are emitted only when the
 * compare-and-swap actually performed the transition; if *@state was
 * not @old_state nothing happens.
 */
void migrate_set_state(int *state, int old_state, int new_state)
{
    if (atomic_cmpxchg(state, old_state, new_state) != old_state) {
        return;
    }

    trace_migrate_set_state(new_state);
    migrate_generate_event(new_state);
}
888 
889 static void migrate_fd_cleanup(void *opaque)
890 {
891     MigrationState *s = opaque;
892 
893     qemu_bh_delete(s->cleanup_bh);
894     s->cleanup_bh = NULL;
895 
896     flush_page_queue(s);
897 
898     if (s->to_dst_file) {
899         trace_migrate_fd_cleanup();
900         qemu_mutex_unlock_iothread();
901         if (s->migration_thread_running) {
902             qemu_thread_join(&s->thread);
903             s->migration_thread_running = false;
904         }
905         qemu_mutex_lock_iothread();
906 
907         migrate_compress_threads_join();
908         qemu_fclose(s->to_dst_file);
909         s->to_dst_file = NULL;
910     }
911 
912     assert((s->state != MIGRATION_STATUS_ACTIVE) &&
913            (s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE));
914 
915     if (s->state == MIGRATION_STATUS_CANCELLING) {
916         migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING,
917                           MIGRATION_STATUS_CANCELLED);
918     }
919 
920     notifier_list_notify(&migration_state_notifiers, s);
921 }
922 
923 void migrate_fd_error(MigrationState *s, const Error *error)
924 {
925     trace_migrate_fd_error(error_get_pretty(error));
926     assert(s->to_dst_file == NULL);
927     migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
928                       MIGRATION_STATUS_FAILED);
929     if (!s->error) {
930         s->error = error_copy(error);
931     }
932     notifier_list_notify(&migration_state_notifiers, s);
933 }
934 
935 static void migrate_fd_cancel(MigrationState *s)
936 {
937     int old_state ;
938     QEMUFile *f = migrate_get_current()->to_dst_file;
939     trace_migrate_fd_cancel();
940 
941     if (s->rp_state.from_dst_file) {
942         /* shutdown the rp socket, so causing the rp thread to shutdown */
943         qemu_file_shutdown(s->rp_state.from_dst_file);
944     }
945 
946     do {
947         old_state = s->state;
948         if (!migration_is_setup_or_active(old_state)) {
949             break;
950         }
951         migrate_set_state(&s->state, old_state, MIGRATION_STATUS_CANCELLING);
952     } while (s->state != MIGRATION_STATUS_CANCELLING);
953 
954     /*
955      * If we're unlucky the migration code might be stuck somewhere in a
956      * send/write while the network has failed and is waiting to timeout;
957      * if we've got shutdown(2) available then we can force it to quit.
958      * The outgoing qemu file gets closed in migrate_fd_cleanup that is
959      * called in a bh, so there is no race against this cancel.
960      */
961     if (s->state == MIGRATION_STATUS_CANCELLING && f) {
962         qemu_file_shutdown(f);
963     }
964 }
965 
966 void add_migration_state_change_notifier(Notifier *notify)
967 {
968     notifier_list_add(&migration_state_notifiers, notify);
969 }
970 
971 void remove_migration_state_change_notifier(Notifier *notify)
972 {
973     notifier_remove(notify);
974 }
975 
976 bool migration_in_setup(MigrationState *s)
977 {
978     return s->state == MIGRATION_STATUS_SETUP;
979 }
980 
981 bool migration_has_finished(MigrationState *s)
982 {
983     return s->state == MIGRATION_STATUS_COMPLETED;
984 }
985 
986 bool migration_has_failed(MigrationState *s)
987 {
988     return (s->state == MIGRATION_STATUS_CANCELLED ||
989             s->state == MIGRATION_STATUS_FAILED);
990 }
991 
992 bool migration_in_postcopy(MigrationState *s)
993 {
994     return (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
995 }
996 
997 bool migration_in_postcopy_after_devices(MigrationState *s)
998 {
999     return migration_in_postcopy(s) && s->postcopy_after_devices;
1000 }
1001 
1002 MigrationState *migrate_init(const MigrationParams *params)
1003 {
1004     MigrationState *s = migrate_get_current();
1005 
1006     /*
1007      * Reinitialise all migration state, except
1008      * parameters/capabilities that the user set, and
1009      * locks.
1010      */
1011     s->bytes_xfer = 0;
1012     s->xfer_limit = 0;
1013     s->cleanup_bh = 0;
1014     s->to_dst_file = NULL;
1015     s->state = MIGRATION_STATUS_NONE;
1016     s->params = *params;
1017     s->rp_state.from_dst_file = NULL;
1018     s->rp_state.error = false;
1019     s->mbps = 0.0;
1020     s->downtime = 0;
1021     s->expected_downtime = 0;
1022     s->dirty_pages_rate = 0;
1023     s->dirty_bytes_rate = 0;
1024     s->setup_time = 0;
1025     s->dirty_sync_count = 0;
1026     s->start_postcopy = false;
1027     s->postcopy_after_devices = false;
1028     s->postcopy_requests = 0;
1029     s->migration_thread_running = false;
1030     s->last_req_rb = NULL;
1031     error_free(s->error);
1032     s->error = NULL;
1033 
1034     migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);
1035 
1036     QSIMPLEQ_INIT(&s->src_page_requests);
1037 
1038     s->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1039     return s;
1040 }
1041 
1042 static GSList *migration_blockers;
1043 
1044 void migrate_add_blocker(Error *reason)
1045 {
1046     migration_blockers = g_slist_prepend(migration_blockers, reason);
1047 }
1048 
1049 void migrate_del_blocker(Error *reason)
1050 {
1051     migration_blockers = g_slist_remove(migration_blockers, reason);
1052 }
1053 
1054 void qmp_migrate_incoming(const char *uri, Error **errp)
1055 {
1056     Error *local_err = NULL;
1057     static bool once = true;
1058 
1059     if (!deferred_incoming) {
1060         error_setg(errp, "For use with '-incoming defer'");
1061         return;
1062     }
1063     if (!once) {
1064         error_setg(errp, "The incoming migration has already been started");
1065     }
1066 
1067     qemu_start_incoming_migration(uri, &local_err);
1068 
1069     if (local_err) {
1070         error_propagate(errp, local_err);
1071         return;
1072     }
1073 
1074     once = false;
1075 }
1076 
1077 bool migration_is_blocked(Error **errp)
1078 {
1079     if (qemu_savevm_state_blocked(errp)) {
1080         return true;
1081     }
1082 
1083     if (migration_blockers) {
1084         *errp = error_copy(migration_blockers->data);
1085         return true;
1086     }
1087 
1088     return false;
1089 }
1090 
1091 void qmp_migrate(const char *uri, bool has_blk, bool blk,
1092                  bool has_inc, bool inc, bool has_detach, bool detach,
1093                  Error **errp)
1094 {
1095     Error *local_err = NULL;
1096     MigrationState *s = migrate_get_current();
1097     MigrationParams params;
1098     const char *p;
1099 
1100     params.blk = has_blk && blk;
1101     params.shared = has_inc && inc;
1102 
1103     if (migration_is_setup_or_active(s->state) ||
1104         s->state == MIGRATION_STATUS_CANCELLING) {
1105         error_setg(errp, QERR_MIGRATION_ACTIVE);
1106         return;
1107     }
1108     if (runstate_check(RUN_STATE_INMIGRATE)) {
1109         error_setg(errp, "Guest is waiting for an incoming migration");
1110         return;
1111     }
1112 
1113     if (migration_is_blocked(errp)) {
1114         return;
1115     }
1116 
1117     s = migrate_init(&params);
1118 
1119     if (strstart(uri, "tcp:", &p)) {
1120         tcp_start_outgoing_migration(s, p, &local_err);
1121 #ifdef CONFIG_RDMA
1122     } else if (strstart(uri, "rdma:", &p)) {
1123         rdma_start_outgoing_migration(s, p, &local_err);
1124 #endif
1125     } else if (strstart(uri, "exec:", &p)) {
1126         exec_start_outgoing_migration(s, p, &local_err);
1127     } else if (strstart(uri, "unix:", &p)) {
1128         unix_start_outgoing_migration(s, p, &local_err);
1129     } else if (strstart(uri, "fd:", &p)) {
1130         fd_start_outgoing_migration(s, p, &local_err);
1131     } else {
1132         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "uri",
1133                    "a valid migration protocol");
1134         migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
1135                           MIGRATION_STATUS_FAILED);
1136         return;
1137     }
1138 
1139     if (local_err) {
1140         migrate_fd_error(s, local_err);
1141         error_propagate(errp, local_err);
1142         return;
1143     }
1144 }
1145 
1146 void qmp_migrate_cancel(Error **errp)
1147 {
1148     migrate_fd_cancel(migrate_get_current());
1149 }
1150 
1151 void qmp_migrate_set_cache_size(int64_t value, Error **errp)
1152 {
1153     MigrationState *s = migrate_get_current();
1154     int64_t new_size;
1155 
1156     /* Check for truncation */
1157     if (value != (size_t)value) {
1158         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
1159                    "exceeding address space");
1160         return;
1161     }
1162 
1163     /* Cache should not be larger than guest ram size */
1164     if (value > ram_bytes_total()) {
1165         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
1166                    "exceeds guest ram size ");
1167         return;
1168     }
1169 
1170     new_size = xbzrle_cache_resize(value);
1171     if (new_size < 0) {
1172         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
1173                    "is smaller than page size");
1174         return;
1175     }
1176 
1177     s->xbzrle_cache_size = new_size;
1178 }
1179 
1180 int64_t qmp_query_migrate_cache_size(Error **errp)
1181 {
1182     return migrate_xbzrle_cache_size();
1183 }
1184 
1185 void qmp_migrate_set_speed(int64_t value, Error **errp)
1186 {
1187     MigrationParameters p = {
1188         .has_max_bandwidth = true,
1189         .max_bandwidth = value,
1190     };
1191 
1192     qmp_migrate_set_parameters(&p, errp);
1193 }
1194 
1195 void qmp_migrate_set_downtime(double value, Error **errp)
1196 {
1197     value *= 1000; /* Convert to milliseconds */
1198     value = MAX(0, MIN(INT64_MAX, value));
1199 
1200     MigrationParameters p = {
1201         .has_downtime_limit = true,
1202         .downtime_limit = value,
1203     };
1204 
1205     qmp_migrate_set_parameters(&p, errp);
1206 }
1207 
1208 bool migrate_postcopy_ram(void)
1209 {
1210     MigrationState *s;
1211 
1212     s = migrate_get_current();
1213 
1214     return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM];
1215 }
1216 
1217 bool migrate_auto_converge(void)
1218 {
1219     MigrationState *s;
1220 
1221     s = migrate_get_current();
1222 
1223     return s->enabled_capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE];
1224 }
1225 
1226 bool migrate_zero_blocks(void)
1227 {
1228     MigrationState *s;
1229 
1230     s = migrate_get_current();
1231 
1232     return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS];
1233 }
1234 
1235 bool migrate_use_compression(void)
1236 {
1237     MigrationState *s;
1238 
1239     s = migrate_get_current();
1240 
1241     return s->enabled_capabilities[MIGRATION_CAPABILITY_COMPRESS];
1242 }
1243 
1244 int migrate_compress_level(void)
1245 {
1246     MigrationState *s;
1247 
1248     s = migrate_get_current();
1249 
1250     return s->parameters.compress_level;
1251 }
1252 
1253 int migrate_compress_threads(void)
1254 {
1255     MigrationState *s;
1256 
1257     s = migrate_get_current();
1258 
1259     return s->parameters.compress_threads;
1260 }
1261 
1262 int migrate_decompress_threads(void)
1263 {
1264     MigrationState *s;
1265 
1266     s = migrate_get_current();
1267 
1268     return s->parameters.decompress_threads;
1269 }
1270 
1271 bool migrate_use_events(void)
1272 {
1273     MigrationState *s;
1274 
1275     s = migrate_get_current();
1276 
1277     return s->enabled_capabilities[MIGRATION_CAPABILITY_EVENTS];
1278 }
1279 
1280 int migrate_use_xbzrle(void)
1281 {
1282     MigrationState *s;
1283 
1284     s = migrate_get_current();
1285 
1286     return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE];
1287 }
1288 
1289 int64_t migrate_xbzrle_cache_size(void)
1290 {
1291     MigrationState *s;
1292 
1293     s = migrate_get_current();
1294 
1295     return s->xbzrle_cache_size;
1296 }
1297 
1298 /* migration thread support */
1299 /*
1300  * Something bad happened to the RP stream, mark an error
1301  * The caller shall print or trace something to indicate why
1302  */
1303 static void mark_source_rp_bad(MigrationState *s)
1304 {
1305     s->rp_state.error = true;
1306 }
1307 
1308 static struct rp_cmd_args {
1309     ssize_t     len; /* -1 = variable */
1310     const char *name;
1311 } rp_cmd_args[] = {
1312     [MIG_RP_MSG_INVALID]        = { .len = -1, .name = "INVALID" },
1313     [MIG_RP_MSG_SHUT]           = { .len =  4, .name = "SHUT" },
1314     [MIG_RP_MSG_PONG]           = { .len =  4, .name = "PONG" },
1315     [MIG_RP_MSG_REQ_PAGES]      = { .len = 12, .name = "REQ_PAGES" },
1316     [MIG_RP_MSG_REQ_PAGES_ID]   = { .len = -1, .name = "REQ_PAGES_ID" },
1317     [MIG_RP_MSG_MAX]            = { .len = -1, .name = "MAX" },
1318 };
1319 
1320 /*
1321  * Process a request for pages received on the return path,
1322  * We're allowed to send more than requested (e.g. to round to our page size)
1323  * and we don't need to send pages that have already been sent.
1324  */
1325 static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
1326                                        ram_addr_t start, size_t len)
1327 {
1328     long our_host_ps = getpagesize();
1329 
1330     trace_migrate_handle_rp_req_pages(rbname, start, len);
1331 
1332     /*
1333      * Since we currently insist on matching page sizes, just sanity check
1334      * we're being asked for whole host pages.
1335      */
1336     if (start & (our_host_ps-1) ||
1337        (len & (our_host_ps-1))) {
1338         error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT
1339                      " len: %zd", __func__, start, len);
1340         mark_source_rp_bad(ms);
1341         return;
1342     }
1343 
1344     if (ram_save_queue_pages(ms, rbname, start, len)) {
1345         mark_source_rp_bad(ms);
1346     }
1347 }
1348 
1349 /*
1350  * Handles messages sent on the return path towards the source VM
1351  *
1352  */
1353 static void *source_return_path_thread(void *opaque)
1354 {
1355     MigrationState *ms = opaque;
1356     QEMUFile *rp = ms->rp_state.from_dst_file;
1357     uint16_t header_len, header_type;
1358     uint8_t buf[512];
1359     uint32_t tmp32, sibling_error;
1360     ram_addr_t start = 0; /* =0 to silence warning */
1361     size_t  len = 0, expected_len;
1362     int res;
1363 
1364     trace_source_return_path_thread_entry();
1365     while (!ms->rp_state.error && !qemu_file_get_error(rp) &&
1366            migration_is_setup_or_active(ms->state)) {
1367         trace_source_return_path_thread_loop_top();
1368         header_type = qemu_get_be16(rp);
1369         header_len = qemu_get_be16(rp);
1370 
1371         if (header_type >= MIG_RP_MSG_MAX ||
1372             header_type == MIG_RP_MSG_INVALID) {
1373             error_report("RP: Received invalid message 0x%04x length 0x%04x",
1374                     header_type, header_len);
1375             mark_source_rp_bad(ms);
1376             goto out;
1377         }
1378 
1379         if ((rp_cmd_args[header_type].len != -1 &&
1380             header_len != rp_cmd_args[header_type].len) ||
1381             header_len > sizeof(buf)) {
1382             error_report("RP: Received '%s' message (0x%04x) with"
1383                     "incorrect length %d expecting %zu",
1384                     rp_cmd_args[header_type].name, header_type, header_len,
1385                     (size_t)rp_cmd_args[header_type].len);
1386             mark_source_rp_bad(ms);
1387             goto out;
1388         }
1389 
1390         /* We know we've got a valid header by this point */
1391         res = qemu_get_buffer(rp, buf, header_len);
1392         if (res != header_len) {
1393             error_report("RP: Failed reading data for message 0x%04x"
1394                          " read %d expected %d",
1395                          header_type, res, header_len);
1396             mark_source_rp_bad(ms);
1397             goto out;
1398         }
1399 
1400         /* OK, we have the message and the data */
1401         switch (header_type) {
1402         case MIG_RP_MSG_SHUT:
1403             sibling_error = ldl_be_p(buf);
1404             trace_source_return_path_thread_shut(sibling_error);
1405             if (sibling_error) {
1406                 error_report("RP: Sibling indicated error %d", sibling_error);
1407                 mark_source_rp_bad(ms);
1408             }
1409             /*
1410              * We'll let the main thread deal with closing the RP
1411              * we could do a shutdown(2) on it, but we're the only user
1412              * anyway, so there's nothing gained.
1413              */
1414             goto out;
1415 
1416         case MIG_RP_MSG_PONG:
1417             tmp32 = ldl_be_p(buf);
1418             trace_source_return_path_thread_pong(tmp32);
1419             break;
1420 
1421         case MIG_RP_MSG_REQ_PAGES:
1422             start = ldq_be_p(buf);
1423             len = ldl_be_p(buf + 8);
1424             migrate_handle_rp_req_pages(ms, NULL, start, len);
1425             break;
1426 
1427         case MIG_RP_MSG_REQ_PAGES_ID:
1428             expected_len = 12 + 1; /* header + termination */
1429 
1430             if (header_len >= expected_len) {
1431                 start = ldq_be_p(buf);
1432                 len = ldl_be_p(buf + 8);
1433                 /* Now we expect an idstr */
1434                 tmp32 = buf[12]; /* Length of the following idstr */
1435                 buf[13 + tmp32] = '\0';
1436                 expected_len += tmp32;
1437             }
1438             if (header_len != expected_len) {
1439                 error_report("RP: Req_Page_id with length %d expecting %zd",
1440                         header_len, expected_len);
1441                 mark_source_rp_bad(ms);
1442                 goto out;
1443             }
1444             migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len);
1445             break;
1446 
1447         default:
1448             break;
1449         }
1450     }
1451     if (qemu_file_get_error(rp)) {
1452         trace_source_return_path_thread_bad_end();
1453         mark_source_rp_bad(ms);
1454     }
1455 
1456     trace_source_return_path_thread_end();
1457 out:
1458     ms->rp_state.from_dst_file = NULL;
1459     qemu_fclose(rp);
1460     return NULL;
1461 }
1462 
1463 static int open_return_path_on_source(MigrationState *ms)
1464 {
1465 
1466     ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
1467     if (!ms->rp_state.from_dst_file) {
1468         return -1;
1469     }
1470 
1471     trace_open_return_path_on_source();
1472     qemu_thread_create(&ms->rp_state.rp_thread, "return path",
1473                        source_return_path_thread, ms, QEMU_THREAD_JOINABLE);
1474 
1475     trace_open_return_path_on_source_continue();
1476 
1477     return 0;
1478 }
1479 
1480 /* Returns 0 if the RP was ok, otherwise there was an error on the RP */
1481 static int await_return_path_close_on_source(MigrationState *ms)
1482 {
1483     /*
1484      * If this is a normal exit then the destination will send a SHUT and the
1485      * rp_thread will exit, however if there's an error we need to cause
1486      * it to exit.
1487      */
1488     if (qemu_file_get_error(ms->to_dst_file) && ms->rp_state.from_dst_file) {
1489         /*
1490          * shutdown(2), if we have it, will cause it to unblock if it's stuck
1491          * waiting for the destination.
1492          */
1493         qemu_file_shutdown(ms->rp_state.from_dst_file);
1494         mark_source_rp_bad(ms);
1495     }
1496     trace_await_return_path_close_on_source_joining();
1497     qemu_thread_join(&ms->rp_state.rp_thread);
1498     trace_await_return_path_close_on_source_close();
1499     return ms->rp_state.error;
1500 }
1501 
1502 /*
1503  * Switch from normal iteration to postcopy
1504  * Returns non-0 on error
1505  */
1506 static int postcopy_start(MigrationState *ms, bool *old_vm_running)
1507 {
1508     int ret;
1509     QIOChannelBuffer *bioc;
1510     QEMUFile *fb;
1511     int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1512     migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE,
1513                       MIGRATION_STATUS_POSTCOPY_ACTIVE);
1514 
1515     trace_postcopy_start();
1516     qemu_mutex_lock_iothread();
1517     trace_postcopy_start_set_run();
1518 
1519     qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
1520     *old_vm_running = runstate_is_running();
1521     global_state_store();
1522     ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
1523     if (ret < 0) {
1524         goto fail;
1525     }
1526 
1527     ret = bdrv_inactivate_all();
1528     if (ret < 0) {
1529         goto fail;
1530     }
1531 
1532     /*
1533      * Cause any non-postcopiable, but iterative devices to
1534      * send out their final data.
1535      */
1536     qemu_savevm_state_complete_precopy(ms->to_dst_file, true);
1537 
1538     /*
1539      * in Finish migrate and with the io-lock held everything should
1540      * be quiet, but we've potentially still got dirty pages and we
1541      * need to tell the destination to throw any pages it's already received
1542      * that are dirty
1543      */
1544     if (ram_postcopy_send_discard_bitmap(ms)) {
1545         error_report("postcopy send discard bitmap failed");
1546         goto fail;
1547     }
1548 
1549     /*
1550      * send rest of state - note things that are doing postcopy
1551      * will notice we're in POSTCOPY_ACTIVE and not actually
1552      * wrap their state up here
1553      */
1554     qemu_file_set_rate_limit(ms->to_dst_file, INT64_MAX);
1555     /* Ping just for debugging, helps line traces up */
1556     qemu_savevm_send_ping(ms->to_dst_file, 2);
1557 
1558     /*
1559      * While loading the device state we may trigger page transfer
1560      * requests and the fd must be free to process those, and thus
1561      * the destination must read the whole device state off the fd before
1562      * it starts processing it.  Unfortunately the ad-hoc migration format
1563      * doesn't allow the destination to know the size to read without fully
1564      * parsing it through each devices load-state code (especially the open
1565      * coded devices that use get/put).
1566      * So we wrap the device state up in a package with a length at the start;
1567      * to do this we use a qemu_buf to hold the whole of the device state.
1568      */
1569     bioc = qio_channel_buffer_new(4096);
1570     fb = qemu_fopen_channel_output(QIO_CHANNEL(bioc));
1571     object_unref(OBJECT(bioc));
1572 
1573     /*
1574      * Make sure the receiver can get incoming pages before we send the rest
1575      * of the state
1576      */
1577     qemu_savevm_send_postcopy_listen(fb);
1578 
1579     qemu_savevm_state_complete_precopy(fb, false);
1580     qemu_savevm_send_ping(fb, 3);
1581 
1582     qemu_savevm_send_postcopy_run(fb);
1583 
1584     /* <><> end of stuff going into the package */
1585 
1586     /* Now send that blob */
1587     if (qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage)) {
1588         goto fail_closefb;
1589     }
1590     qemu_fclose(fb);
1591 
1592     /* Send a notify to give a chance for anything that needs to happen
1593      * at the transition to postcopy and after the device state; in particular
1594      * spice needs to trigger a transition now
1595      */
1596     ms->postcopy_after_devices = true;
1597     notifier_list_notify(&migration_state_notifiers, ms);
1598 
1599     ms->downtime =  qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;
1600 
1601     qemu_mutex_unlock_iothread();
1602 
1603     /*
1604      * Although this ping is just for debug, it could potentially be
1605      * used for getting a better measurement of downtime at the source.
1606      */
1607     qemu_savevm_send_ping(ms->to_dst_file, 4);
1608 
1609     ret = qemu_file_get_error(ms->to_dst_file);
1610     if (ret) {
1611         error_report("postcopy_start: Migration stream errored");
1612         migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1613                               MIGRATION_STATUS_FAILED);
1614     }
1615 
1616     return ret;
1617 
1618 fail_closefb:
1619     qemu_fclose(fb);
1620 fail:
1621     migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1622                           MIGRATION_STATUS_FAILED);
1623     qemu_mutex_unlock_iothread();
1624     return -1;
1625 }
1626 
1627 /**
1628  * migration_completion: Used by migration_thread when there's not much left.
1629  *   The caller 'breaks' the loop when this returns.
1630  *
1631  * @s: Current migration state
1632  * @current_active_state: The migration state we expect to be in
1633  * @*old_vm_running: Pointer to old_vm_running flag
1634  * @*start_time: Pointer to time to update
1635  */
1636 static void migration_completion(MigrationState *s, int current_active_state,
1637                                  bool *old_vm_running,
1638                                  int64_t *start_time)
1639 {
1640     int ret;
1641 
1642     if (s->state == MIGRATION_STATUS_ACTIVE) {
1643         qemu_mutex_lock_iothread();
1644         *start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1645         qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
1646         *old_vm_running = runstate_is_running();
1647         ret = global_state_store();
1648 
1649         if (!ret) {
1650             ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
1651             if (ret >= 0) {
1652                 ret = bdrv_inactivate_all();
1653             }
1654             if (ret >= 0) {
1655                 qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
1656                 qemu_savevm_state_complete_precopy(s->to_dst_file, false);
1657             }
1658         }
1659         qemu_mutex_unlock_iothread();
1660 
1661         if (ret < 0) {
1662             goto fail;
1663         }
1664     } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
1665         trace_migration_completion_postcopy_end();
1666 
1667         qemu_savevm_state_complete_postcopy(s->to_dst_file);
1668         trace_migration_completion_postcopy_end_after_complete();
1669     }
1670 
1671     /*
1672      * If rp was opened we must clean up the thread before
1673      * cleaning everything else up (since if there are no failures
1674      * it will wait for the destination to send it's status in
1675      * a SHUT command).
1676      * Postcopy opens rp if enabled (even if it's not avtivated)
1677      */
1678     if (migrate_postcopy_ram()) {
1679         int rp_error;
1680         trace_migration_completion_postcopy_end_before_rp();
1681         rp_error = await_return_path_close_on_source(s);
1682         trace_migration_completion_postcopy_end_after_rp(rp_error);
1683         if (rp_error) {
1684             goto fail_invalidate;
1685         }
1686     }
1687 
1688     if (qemu_file_get_error(s->to_dst_file)) {
1689         trace_migration_completion_file_err();
1690         goto fail_invalidate;
1691     }
1692 
1693     migrate_set_state(&s->state, current_active_state,
1694                       MIGRATION_STATUS_COMPLETED);
1695     return;
1696 
1697 fail_invalidate:
1698     /* If not doing postcopy, vm_start() will be called: let's regain
1699      * control on images.
1700      */
1701     if (s->state == MIGRATION_STATUS_ACTIVE) {
1702         Error *local_err = NULL;
1703 
1704         bdrv_invalidate_cache_all(&local_err);
1705         if (local_err) {
1706             error_report_err(local_err);
1707         }
1708     }
1709 
1710 fail:
1711     migrate_set_state(&s->state, current_active_state,
1712                       MIGRATION_STATUS_FAILED);
1713 }
1714 
1715 /*
1716  * Master migration thread on the source VM.
1717  * It drives the migration and pumps the data down the outgoing channel.
1718  */
1719 static void *migration_thread(void *opaque)
1720 {
1721     MigrationState *s = opaque;
1722     /* Used by the bandwidth calcs, updated later */
1723     int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1724     int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
1725     int64_t initial_bytes = 0;
1726     int64_t max_size = 0;
1727     int64_t start_time = initial_time;
1728     int64_t end_time;
1729     bool old_vm_running = false;
1730     bool entered_postcopy = false;
1731     /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
1732     enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
1733 
1734     rcu_register_thread();
1735 
1736     qemu_savevm_state_header(s->to_dst_file);
1737 
1738     if (migrate_postcopy_ram()) {
1739         /* Now tell the dest that it should open its end so it can reply */
1740         qemu_savevm_send_open_return_path(s->to_dst_file);
1741 
1742         /* And do a ping that will make stuff easier to debug */
1743         qemu_savevm_send_ping(s->to_dst_file, 1);
1744 
1745         /*
1746          * Tell the destination that we *might* want to do postcopy later;
1747          * if the other end can't do postcopy it should fail now, nice and
1748          * early.
1749          */
1750         qemu_savevm_send_postcopy_advise(s->to_dst_file);
1751     }
1752 
1753     qemu_savevm_state_begin(s->to_dst_file, &s->params);
1754 
1755     s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
1756     current_active_state = MIGRATION_STATUS_ACTIVE;
1757     migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
1758                       MIGRATION_STATUS_ACTIVE);
1759 
1760     trace_migration_thread_setup_complete();
1761 
1762     while (s->state == MIGRATION_STATUS_ACTIVE ||
1763            s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
1764         int64_t current_time;
1765         uint64_t pending_size;
1766 
1767         if (!qemu_file_rate_limit(s->to_dst_file)) {
1768             uint64_t pend_post, pend_nonpost;
1769 
1770             qemu_savevm_state_pending(s->to_dst_file, max_size, &pend_nonpost,
1771                                       &pend_post);
1772             pending_size = pend_nonpost + pend_post;
1773             trace_migrate_pending(pending_size, max_size,
1774                                   pend_post, pend_nonpost);
1775             if (pending_size && pending_size >= max_size) {
1776                 /* Still a significant amount to transfer */
1777 
1778                 if (migrate_postcopy_ram() &&
1779                     s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE &&
1780                     pend_nonpost <= max_size &&
1781                     atomic_read(&s->start_postcopy)) {
1782 
1783                     if (!postcopy_start(s, &old_vm_running)) {
1784                         current_active_state = MIGRATION_STATUS_POSTCOPY_ACTIVE;
1785                         entered_postcopy = true;
1786                     }
1787 
1788                     continue;
1789                 }
1790                 /* Just another iteration step */
1791                 qemu_savevm_state_iterate(s->to_dst_file, entered_postcopy);
1792             } else {
1793                 trace_migration_thread_low_pending(pending_size);
1794                 migration_completion(s, current_active_state,
1795                                      &old_vm_running, &start_time);
1796                 break;
1797             }
1798         }
1799 
1800         if (qemu_file_get_error(s->to_dst_file)) {
1801             migrate_set_state(&s->state, current_active_state,
1802                               MIGRATION_STATUS_FAILED);
1803             trace_migration_thread_file_err();
1804             break;
1805         }
1806         current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1807         if (current_time >= initial_time + BUFFER_DELAY) {
1808             uint64_t transferred_bytes = qemu_ftell(s->to_dst_file) -
1809                                          initial_bytes;
1810             uint64_t time_spent = current_time - initial_time;
1811             double bandwidth = (double)transferred_bytes / time_spent;
1812             max_size = bandwidth * s->parameters.downtime_limit;
1813 
1814             s->mbps = (((double) transferred_bytes * 8.0) /
1815                     ((double) time_spent / 1000.0)) / 1000.0 / 1000.0;
1816 
1817             trace_migrate_transferred(transferred_bytes, time_spent,
1818                                       bandwidth, max_size);
1819             /* if we haven't sent anything, we don't want to recalculate
1820                10000 is a small enough number for our purposes */
1821             if (s->dirty_bytes_rate && transferred_bytes > 10000) {
1822                 s->expected_downtime = s->dirty_bytes_rate / bandwidth;
1823             }
1824 
1825             qemu_file_reset_rate_limit(s->to_dst_file);
1826             initial_time = current_time;
1827             initial_bytes = qemu_ftell(s->to_dst_file);
1828         }
1829         if (qemu_file_rate_limit(s->to_dst_file)) {
1830             /* usleep expects microseconds */
1831             g_usleep((initial_time + BUFFER_DELAY - current_time)*1000);
1832         }
1833     }
1834 
1835     trace_migration_thread_after_loop();
1836     /* If we enabled cpu throttling for auto-converge, turn it off. */
1837     cpu_throttle_stop();
1838     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1839 
1840     qemu_mutex_lock_iothread();
1841     qemu_savevm_state_cleanup();
1842     if (s->state == MIGRATION_STATUS_COMPLETED) {
1843         uint64_t transferred_bytes = qemu_ftell(s->to_dst_file);
1844         s->total_time = end_time - s->total_time;
1845         if (!entered_postcopy) {
1846             s->downtime = end_time - start_time;
1847         }
1848         if (s->total_time) {
1849             s->mbps = (((double) transferred_bytes * 8.0) /
1850                        ((double) s->total_time)) / 1000;
1851         }
1852         runstate_set(RUN_STATE_POSTMIGRATE);
1853     } else {
1854         if (old_vm_running && !entered_postcopy) {
1855             vm_start();
1856         } else {
1857             if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
1858                 runstate_set(RUN_STATE_POSTMIGRATE);
1859             }
1860         }
1861     }
1862     qemu_bh_schedule(s->cleanup_bh);
1863     qemu_mutex_unlock_iothread();
1864 
1865     rcu_unregister_thread();
1866     return NULL;
1867 }
1868 
1869 void migrate_fd_connect(MigrationState *s)
1870 {
1871     s->expected_downtime = s->parameters.downtime_limit;
1872     s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s);
1873 
1874     qemu_file_set_blocking(s->to_dst_file, true);
1875     qemu_file_set_rate_limit(s->to_dst_file,
1876                              s->parameters.max_bandwidth / XFER_LIMIT_RATIO);
1877 
1878     /* Notify before starting migration thread */
1879     notifier_list_notify(&migration_state_notifiers, s);
1880 
1881     /*
1882      * Open the return path; currently for postcopy but other things might
1883      * also want it.
1884      */
1885     if (migrate_postcopy_ram()) {
1886         if (open_return_path_on_source(s)) {
1887             error_report("Unable to open return-path for postcopy");
1888             migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
1889                               MIGRATION_STATUS_FAILED);
1890             migrate_fd_cleanup(s);
1891             return;
1892         }
1893     }
1894 
1895     migrate_compress_threads_create();
1896     qemu_thread_create(&s->thread, "migration", migration_thread, s,
1897                        QEMU_THREAD_JOINABLE);
1898     s->migration_thread_running = true;
1899 }
1900 
1901 PostcopyState  postcopy_state_get(void)
1902 {
1903     return atomic_mb_read(&incoming_postcopy_state);
1904 }
1905 
1906 /* Set the state and return the old state */
1907 PostcopyState postcopy_state_set(PostcopyState new_state)
1908 {
1909     return atomic_xchg(&incoming_postcopy_state, new_state);
1910 }
1911 
1912