xref: /qemu/migration/ram.c (revision a976ed3f)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include "qemu/cutils.h"
32 #include "qemu/bitops.h"
33 #include "qemu/bitmap.h"
34 #include "qemu/main-loop.h"
35 #include "xbzrle.h"
36 #include "ram.h"
37 #include "migration.h"
38 #include "migration/register.h"
39 #include "migration/misc.h"
40 #include "qemu-file.h"
41 #include "postcopy-ram.h"
42 #include "page_cache.h"
43 #include "qemu/error-report.h"
44 #include "qapi/error.h"
45 #include "qapi/qapi-types-migration.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
48 #include "trace.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
53 #include "block.h"
54 #include "sysemu/sysemu.h"
55 #include "savevm.h"
56 #include "qemu/iov.h"
57 #include "multifd.h"
58 
59 /***********************************************************/
60 /* ram save/restore */
61 
62 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
63  * worked for pages that were filled with the same char.  We switched
64  * it to only search for the zero value.  And to avoid confusion with
65  * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
66  */
67 
68 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
69 #define RAM_SAVE_FLAG_ZERO     0x02
70 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
71 #define RAM_SAVE_FLAG_PAGE     0x08
72 #define RAM_SAVE_FLAG_EOS      0x10
73 #define RAM_SAVE_FLAG_CONTINUE 0x20
74 #define RAM_SAVE_FLAG_XBZRLE   0x40
75 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
76 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
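
/*
 * These flags are OR-ed into the low bits of the 64-bit page offset that
 * save_page_header() writes to the stream, roughly:
 *
 *     qemu_put_be64(f, offset | RAM_SAVE_FLAG_PAGE);
 *
 * Page-aligned offsets leave those low bits free for the flags.
 */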
77 
78 static inline bool is_zero_range(uint8_t *p, uint64_t size)
79 {
80     return buffer_is_zero(p, size);
81 }
82 
83 XBZRLECacheStats xbzrle_counters;
84 
85 /* struct containing the XBZRLE cache and a static page
86    used by the compression */
87 static struct {
88     /* buffer used for XBZRLE encoding */
89     uint8_t *encoded_buf;
90     /* buffer for storing page content */
91     uint8_t *current_buf;
92     /* Cache for XBZRLE, Protected by lock. */
93     PageCache *cache;
94     QemuMutex lock;
95     /* it will store a page full of zeros */
96     uint8_t *zero_target_page;
97     /* buffer used for XBZRLE decoding */
98     uint8_t *decoded_buf;
99 } XBZRLE;
100 
101 static void XBZRLE_cache_lock(void)
102 {
103     if (migrate_use_xbzrle())
104         qemu_mutex_lock(&XBZRLE.lock);
105 }
106 
107 static void XBZRLE_cache_unlock(void)
108 {
109     if (migrate_use_xbzrle())
110         qemu_mutex_unlock(&XBZRLE.lock);
111 }
112 
113 /**
114  * xbzrle_cache_resize: resize the xbzrle cache
115  *
116  * This function is called from qmp_migrate_set_cache_size in the main
117  * thread, possibly while a migration is in progress.  A running
118  * migration may be using the cache and might finish during this call,
119  * hence changes to the cache are protected by XBZRLE.lock.
120  *
121  * Returns 0 for success or -1 for error
122  *
123  * @new_size: new cache size
124  * @errp: set *errp with the reason if the check failed
125  */
126 int xbzrle_cache_resize(int64_t new_size, Error **errp)
127 {
128     PageCache *new_cache;
129     int64_t ret = 0;
130 
131     /* Check for truncation */
132     if (new_size != (size_t)new_size) {
133         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
134                    "exceeding address space");
135         return -1;
136     }
137 
138     if (new_size == migrate_xbzrle_cache_size()) {
139         /* nothing to do */
140         return 0;
141     }
142 
143     XBZRLE_cache_lock();
144 
145     if (XBZRLE.cache != NULL) {
146         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
147         if (!new_cache) {
148             ret = -1;
149             goto out;
150         }
151 
152         cache_fini(XBZRLE.cache);
153         XBZRLE.cache = new_cache;
154     }
155 out:
156     XBZRLE_cache_unlock();
157     return ret;
158 }
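
/*
 * A caller such as the QMP cache-size setter mentioned above is expected
 * to use this roughly as follows (sketch only):
 *
 *     Error *local_err = NULL;
 *
 *     if (xbzrle_cache_resize(new_size, &local_err) < 0) {
 *         error_propagate(errp, local_err);
 *     }
 */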
159 
160 static bool ramblock_is_ignored(RAMBlock *block)
161 {
162     return !qemu_ram_is_migratable(block) ||
163            (migrate_ignore_shared() && qemu_ram_is_shared(block));
164 }
165 
166 /* Should be holding either ram_list.mutex, or the RCU lock. */
167 #define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
168     INTERNAL_RAMBLOCK_FOREACH(block)                   \
169         if (ramblock_is_ignored(block)) {} else
170 
171 #define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
172     INTERNAL_RAMBLOCK_FOREACH(block)                   \
173         if (!qemu_ram_is_migratable(block)) {} else
174 
175 #undef RAMBLOCK_FOREACH
176 
177 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
178 {
179     RAMBlock *block;
180     int ret = 0;
181 
182     RCU_READ_LOCK_GUARD();
183 
184     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
185         ret = func(block, opaque);
186         if (ret) {
187             break;
188         }
189     }
190     return ret;
191 }
192 
193 static void ramblock_recv_map_init(void)
194 {
195     RAMBlock *rb;
196 
197     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
198         assert(!rb->receivedmap);
199         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
200     }
201 }
202 
203 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
204 {
205     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
206                     rb->receivedmap);
207 }
208 
209 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
210 {
211     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
212 }
213 
214 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
215 {
216     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
217 }
218 
219 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
220                                     size_t nr)
221 {
222     bitmap_set_atomic(rb->receivedmap,
223                       ramblock_recv_bitmap_offset(host_addr, rb),
224                       nr);
225 }
226 
227 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
228 
229 /*
230  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
231  *
232  * Returns the number of bytes sent (>0) on success, or <0 on error.
233  */
234 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
235                                   const char *block_name)
236 {
237     RAMBlock *block = qemu_ram_block_by_name(block_name);
238     unsigned long *le_bitmap, nbits;
239     uint64_t size;
240 
241     if (!block) {
242         error_report("%s: invalid block name: %s", __func__, block_name);
243         return -1;
244     }
245 
246     nbits = block->used_length >> TARGET_PAGE_BITS;
247 
248     /*
249      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
250      * machines we may need 4 more bytes for padding (see the comment
251      * below).  So extend it a bit beforehand.
252      */
253     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
254 
255     /*
256      * Always use little endian when sending the bitmap. This is
257      * required when source and destination VMs are not using the
258      * same endianness. (Note: big endian won't work.)
259      */
260     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
261 
262     /* Size of the bitmap, in bytes */
263     size = DIV_ROUND_UP(nbits, 8);
264 
265     /*
266      * size is always aligned to 8 bytes for 64bit machines, but it
267      * may not be true for 32bit machines. We need this padding to
268      * make sure the migration can survive even between 32bit and
269      * 64bit machines.
270      */
271     size = ROUND_UP(size, 8);
272 
273     qemu_put_be64(file, size);
274     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
275     /*
276      * Mark as an end, in case the middle part is screwed up due to
277      * some "misterious" reason.
278      */
279     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
280     qemu_fflush(file);
281 
282     g_free(le_bitmap);
283 
284     if (qemu_file_get_error(file)) {
285         return qemu_file_get_error(file);
286     }
287 
288     return size + sizeof(size);
289 }
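
/*
 * The resulting stream layout, as produced above, is:
 *
 *     be64:       bitmap size in bytes, rounded up to a multiple of 8
 *     size bytes: the receivedmap, converted to little endian
 *     be64:       RAMBLOCK_RECV_BITMAP_ENDING marker
 *
 * and the value returned on success is size + 8 (for the leading be64).
 */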
290 
291 /*
292  * An outstanding page request, on the source, having been received
293  * and queued
294  */
295 struct RAMSrcPageRequest {
296     RAMBlock *rb;
297     hwaddr    offset;
298     hwaddr    len;
299 
300     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
301 };
302 
303 /* State of RAM for migration */
304 struct RAMState {
305     /* QEMUFile used for this migration */
306     QEMUFile *f;
307     /* Last block that we have visited searching for dirty pages */
308     RAMBlock *last_seen_block;
309     /* Last block from where we have sent data */
310     RAMBlock *last_sent_block;
311     /* Last dirty target page we have sent */
312     ram_addr_t last_page;
313     /* last ram version we have seen */
314     uint32_t last_version;
315     /* We are in the first round */
316     bool ram_bulk_stage;
317     /* The free page optimization is enabled */
318     bool fpo_enabled;
319     /* How many times we have dirtied too many pages */
320     int dirty_rate_high_cnt;
321     /* these variables are used for bitmap sync */
322     /* last time we did a full bitmap_sync */
323     int64_t time_last_bitmap_sync;
324     /* bytes transferred at start_time */
325     uint64_t bytes_xfer_prev;
326     /* number of dirty pages since start_time */
327     uint64_t num_dirty_pages_period;
328     /* xbzrle misses since the beginning of the period */
329     uint64_t xbzrle_cache_miss_prev;
330 
331     /* compression statistics since the beginning of the period */
332     /* count of the times there was no free thread to compress data */
333     uint64_t compress_thread_busy_prev;
334     /* number of bytes after compression */
335     uint64_t compressed_size_prev;
336     /* number of compressed pages */
337     uint64_t compress_pages_prev;
338 
339     /* total handled target pages at the beginning of period */
340     uint64_t target_page_count_prev;
341     /* total handled target pages since start */
342     uint64_t target_page_count;
343     /* number of dirty bits in the bitmap */
344     uint64_t migration_dirty_pages;
345     /* Protects modification of the bitmap and migration dirty pages */
346     QemuMutex bitmap_mutex;
347     /* The RAMBlock used in the last src_page_requests */
348     RAMBlock *last_req_rb;
349     /* Queue of outstanding page requests from the destination */
350     QemuMutex src_page_req_mutex;
351     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
352 };
353 typedef struct RAMState RAMState;
354 
355 static RAMState *ram_state;
356 
357 static NotifierWithReturnList precopy_notifier_list;
358 
359 void precopy_infrastructure_init(void)
360 {
361     notifier_with_return_list_init(&precopy_notifier_list);
362 }
363 
364 void precopy_add_notifier(NotifierWithReturn *n)
365 {
366     notifier_with_return_list_add(&precopy_notifier_list, n);
367 }
368 
369 void precopy_remove_notifier(NotifierWithReturn *n)
370 {
371     notifier_with_return_remove(n);
372 }
373 
374 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
375 {
376     PrecopyNotifyData pnd;
377     pnd.reason = reason;
378     pnd.errp = errp;
379 
380     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
381 }
382 
383 void precopy_enable_free_page_optimization(void)
384 {
385     if (!ram_state) {
386         return;
387     }
388 
389     ram_state->fpo_enabled = true;
390 }
391 
392 uint64_t ram_bytes_remaining(void)
393 {
394     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
395                        0;
396 }
397 
398 MigrationStats ram_counters;
399 
400 /* used by the search for pages to send */
401 struct PageSearchStatus {
402     /* Current block being searched */
403     RAMBlock    *block;
404     /* Current page to search from */
405     unsigned long page;
406     /* Set once we wrap around */
407     bool         complete_round;
408 };
409 typedef struct PageSearchStatus PageSearchStatus;
410 
411 CompressionStats compression_counters;
412 
413 struct CompressParam {
414     bool done;
415     bool quit;
416     bool zero_page;
417     QEMUFile *file;
418     QemuMutex mutex;
419     QemuCond cond;
420     RAMBlock *block;
421     ram_addr_t offset;
422 
423     /* internally used fields */
424     z_stream stream;
425     uint8_t *originbuf;
426 };
427 typedef struct CompressParam CompressParam;
428 
429 struct DecompressParam {
430     bool done;
431     bool quit;
432     QemuMutex mutex;
433     QemuCond cond;
434     void *des;
435     uint8_t *compbuf;
436     int len;
437     z_stream stream;
438 };
439 typedef struct DecompressParam DecompressParam;
440 
441 static CompressParam *comp_param;
442 static QemuThread *compress_threads;
443 /* comp_done_cond is used to wake up the migration thread when
444  * one of the compression threads has finished the compression.
445  * comp_done_lock is used together with comp_done_cond.
446  */
447 static QemuMutex comp_done_lock;
448 static QemuCond comp_done_cond;
449 /* The empty QEMUFileOps will be used by the QEMUFile in CompressParam */
450 static const QEMUFileOps empty_ops = { };
451 
452 static QEMUFile *decomp_file;
453 static DecompressParam *decomp_param;
454 static QemuThread *decompress_threads;
455 static QemuMutex decomp_done_lock;
456 static QemuCond decomp_done_cond;
457 
458 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
459                                  ram_addr_t offset, uint8_t *source_buf);
460 
461 static void *do_data_compress(void *opaque)
462 {
463     CompressParam *param = opaque;
464     RAMBlock *block;
465     ram_addr_t offset;
466     bool zero_page;
467 
468     qemu_mutex_lock(&param->mutex);
469     while (!param->quit) {
470         if (param->block) {
471             block = param->block;
472             offset = param->offset;
473             param->block = NULL;
474             qemu_mutex_unlock(&param->mutex);
475 
476             zero_page = do_compress_ram_page(param->file, &param->stream,
477                                              block, offset, param->originbuf);
478 
479             qemu_mutex_lock(&comp_done_lock);
480             param->done = true;
481             param->zero_page = zero_page;
482             qemu_cond_signal(&comp_done_cond);
483             qemu_mutex_unlock(&comp_done_lock);
484 
485             qemu_mutex_lock(&param->mutex);
486         } else {
487             qemu_cond_wait(&param->cond, &param->mutex);
488         }
489     }
490     qemu_mutex_unlock(&param->mutex);
491 
492     return NULL;
493 }
494 
495 static void compress_threads_save_cleanup(void)
496 {
497     int i, thread_count;
498 
499     if (!migrate_use_compression() || !comp_param) {
500         return;
501     }
502 
503     thread_count = migrate_compress_threads();
504     for (i = 0; i < thread_count; i++) {
505         /*
506          * we use it as an indicator of whether the thread is
507          * properly init'd or not
508          */
509         if (!comp_param[i].file) {
510             break;
511         }
512 
513         qemu_mutex_lock(&comp_param[i].mutex);
514         comp_param[i].quit = true;
515         qemu_cond_signal(&comp_param[i].cond);
516         qemu_mutex_unlock(&comp_param[i].mutex);
517 
518         qemu_thread_join(compress_threads + i);
519         qemu_mutex_destroy(&comp_param[i].mutex);
520         qemu_cond_destroy(&comp_param[i].cond);
521         deflateEnd(&comp_param[i].stream);
522         g_free(comp_param[i].originbuf);
523         qemu_fclose(comp_param[i].file);
524         comp_param[i].file = NULL;
525     }
526     qemu_mutex_destroy(&comp_done_lock);
527     qemu_cond_destroy(&comp_done_cond);
528     g_free(compress_threads);
529     g_free(comp_param);
530     compress_threads = NULL;
531     comp_param = NULL;
532 }
533 
534 static int compress_threads_save_setup(void)
535 {
536     int i, thread_count;
537 
538     if (!migrate_use_compression()) {
539         return 0;
540     }
541     thread_count = migrate_compress_threads();
542     compress_threads = g_new0(QemuThread, thread_count);
543     comp_param = g_new0(CompressParam, thread_count);
544     qemu_cond_init(&comp_done_cond);
545     qemu_mutex_init(&comp_done_lock);
546     for (i = 0; i < thread_count; i++) {
547         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
548         if (!comp_param[i].originbuf) {
549             goto exit;
550         }
551 
552         if (deflateInit(&comp_param[i].stream,
553                         migrate_compress_level()) != Z_OK) {
554             g_free(comp_param[i].originbuf);
555             goto exit;
556         }
557 
558         /* comp_param[i].file is just used as a dummy buffer to save data,
559          * so set its ops to empty.
560          */
561         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
562         comp_param[i].done = true;
563         comp_param[i].quit = false;
564         qemu_mutex_init(&comp_param[i].mutex);
565         qemu_cond_init(&comp_param[i].cond);
566         qemu_thread_create(compress_threads + i, "compress",
567                            do_data_compress, comp_param + i,
568                            QEMU_THREAD_JOINABLE);
569     }
570     return 0;
571 
572 exit:
573     compress_threads_save_cleanup();
574     return -1;
575 }
576 
577 /**
578  * save_page_header: write page header to wire
579  *
580  * If the block differs from the last one sent, it also writes the block identification
581  *
582  * Returns the number of bytes written
583  *
584  * @f: QEMUFile where to send the data
585  * @block: block that contains the page we want to send
586  * @offset: offset inside the block for the page
587  *          in the lower bits, it contains flags
588  */
589 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
590                                ram_addr_t offset)
591 {
592     size_t size, len;
593 
594     if (block == rs->last_sent_block) {
595         offset |= RAM_SAVE_FLAG_CONTINUE;
596     }
597     qemu_put_be64(f, offset);
598     size = 8;
599 
600     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
601         len = strlen(block->idstr);
602         qemu_put_byte(f, len);
603         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
604         size += 1 + len;
605         rs->last_sent_block = block;
606     }
607     return size;
608 }
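
/*
 * The header written above is therefore (sketch):
 *
 *     be64: page offset within the block, OR-ed with RAM_SAVE_FLAG_* bits
 *     u8:   strlen(block->idstr)  \ only when RAM_SAVE_FLAG_CONTINUE
 *     ...:  block->idstr bytes    / is not set
 *
 * i.e. 8 bytes for a "continue" header, 8 + 1 + len otherwise.
 */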
609 
610 /**
611  * mig_throttle_guest_down: throttle down the guest
612  *
613  * Reduce amount of guest cpu execution to hopefully slow down memory
614  * writes. If guest dirty memory rate is reduced below the rate at
615  * which we can transfer pages to the destination then we should be
616  * able to complete migration. Some workloads dirty memory way too
617  * fast and will not effectively converge, even with auto-converge.
618  */
619 static void mig_throttle_guest_down(void)
620 {
621     MigrationState *s = migrate_get_current();
622     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
623     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
624     int pct_max = s->parameters.max_cpu_throttle;
625 
626     /* We have not started throttling yet. Let's start it. */
627     if (!cpu_throttle_active()) {
628         cpu_throttle_set(pct_initial);
629     } else {
630         /* Throttling already on, just increase the rate */
631         cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
632                          pct_max));
633     }
634 }
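
/*
 * For example, with the usual defaults (assumed here) of
 * cpu-throttle-initial=20, cpu-throttle-increment=10 and
 * max-cpu-throttle=99, successive calls throttle the vCPUs at
 * 20%, 30%, 40%, ... capped at 99%.
 */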
635 
636 /**
637  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
638  *
639  * @rs: current RAM state
640  * @current_addr: address for the zero page
641  *
642  * Update the xbzrle cache to reflect a page that's been sent as all 0.
643  * The important thing is that a stale (not-yet-0'd) page be replaced
644  * by the new data.
645  * As a bonus, if the page wasn't in the cache it gets added so that
646  * when a small write is made into the 0'd page it gets XBZRLE sent.
647  */
648 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
649 {
650     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
651         return;
652     }
653 
654     /* We don't care if this fails to allocate a new cache page
655      * as long as it updated an old one */
656     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
657                  ram_counters.dirty_sync_count);
658 }
659 
660 #define ENCODING_FLAG_XBZRLE 0x1
661 
662 /**
663  * save_xbzrle_page: compress and send current page
664  *
665  * Returns: 1 means that we wrote the page
666  *          0 means that page is identical to the one already sent
667  *          -1 means that xbzrle would be longer than normal
668  *
669  * @rs: current RAM state
670  * @current_data: pointer to the address of the page contents
671  * @current_addr: addr of the page
672  * @block: block that contains the page we want to send
673  * @offset: offset inside the block for the page
674  * @last_stage: if we are at the completion stage
675  */
676 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
677                             ram_addr_t current_addr, RAMBlock *block,
678                             ram_addr_t offset, bool last_stage)
679 {
680     int encoded_len = 0, bytes_xbzrle;
681     uint8_t *prev_cached_page;
682 
683     if (!cache_is_cached(XBZRLE.cache, current_addr,
684                          ram_counters.dirty_sync_count)) {
685         xbzrle_counters.cache_miss++;
686         if (!last_stage) {
687             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
688                              ram_counters.dirty_sync_count) == -1) {
689                 return -1;
690             } else {
691                 /* update *current_data when the page has been
692                    inserted into cache */
693                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
694             }
695         }
696         return -1;
697     }
698 
699     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
700 
701     /* save current buffer into memory */
702     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
703 
704     /* XBZRLE encoding (if there is no overflow) */
705     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
706                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
707                                        TARGET_PAGE_SIZE);
708 
709     /*
710      * Update the cache contents, so that it corresponds to the data
711      * sent, in all cases except where we skip the page.
712      */
713     if (!last_stage && encoded_len != 0) {
714         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
715         /*
716          * In the case where we couldn't compress, ensure that the caller
717          * sends the data from the cache, since the guest might have
718          * changed the RAM since we copied it.
719          */
720         *current_data = prev_cached_page;
721     }
722 
723     if (encoded_len == 0) {
724         trace_save_xbzrle_page_skipping();
725         return 0;
726     } else if (encoded_len == -1) {
727         trace_save_xbzrle_page_overflow();
728         xbzrle_counters.overflow++;
729         return -1;
730     }
731 
732     /* Send XBZRLE based compressed page */
733     bytes_xbzrle = save_page_header(rs, rs->f, block,
734                                     offset | RAM_SAVE_FLAG_XBZRLE);
735     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
736     qemu_put_be16(rs->f, encoded_len);
737     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
738     bytes_xbzrle += encoded_len + 1 + 2;
739     xbzrle_counters.pages++;
740     xbzrle_counters.bytes += bytes_xbzrle;
741     ram_counters.transferred += bytes_xbzrle;
742 
743     return 1;
744 }
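
/*
 * The XBZRLE record sent above is:
 *
 *     page header with RAM_SAVE_FLAG_XBZRLE
 *     u8:   ENCODING_FLAG_XBZRLE
 *     be16: encoded_len
 *     encoded_len bytes of XBZRLE data
 *
 * which is exactly what the "+ encoded_len + 1 + 2" accounting reflects.
 */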
745 
746 /**
747  * migration_bitmap_find_dirty: find the next dirty page from start
748  *
749  * Returns the page offset within memory region of the start of a dirty page
750  *
751  * @rs: current RAM state
752  * @rb: RAMBlock where to search for dirty pages
753  * @start: page where we start the search
754  */
755 static inline
756 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
757                                           unsigned long start)
758 {
759     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
760     unsigned long *bitmap = rb->bmap;
761     unsigned long next;
762 
763     if (ramblock_is_ignored(rb)) {
764         return size;
765     }
766 
767     /*
768      * When the free page optimization is enabled, we need to check the bitmap
769      * to send the non-free pages rather than all the pages in the bulk stage.
770      */
771     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
772         next = start + 1;
773     } else {
774         next = find_next_bit(bitmap, size, start);
775     }
776 
777     return next;
778 }
779 
780 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
781                                                 RAMBlock *rb,
782                                                 unsigned long page)
783 {
784     bool ret;
785 
786     qemu_mutex_lock(&rs->bitmap_mutex);
787 
788     /*
789      * Clear dirty bitmap if needed.  This _must_ be called before we
790      * send any of the page in the chunk because we need to make sure
791      * we can capture further page content changes when we sync dirty
792      * log the next time.  So as long as we are going to send any of
793      * the page in the chunk we clear the remote dirty bitmap for all.
794      * Clearing it earlier won't be a problem, but too late will.
795      */
796     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
797         uint8_t shift = rb->clear_bmap_shift;
798         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
799         hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
800 
801         /*
802          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
803          * can make things easier sometimes since then the start address
804          * of the small chunk will always be aligned to 64 pages, so the
805          * bitmap will always be aligned to unsigned long.  We should
806          * even be able to remove this restriction but I'm simply
807          * keeping it.
808          */
809         assert(shift >= 6);
810         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
811         memory_region_clear_dirty_bitmap(rb->mr, start, size);
812     }
813 
814     ret = test_and_clear_bit(page, rb->bmap);
815 
816     if (ret) {
817         rs->migration_dirty_pages--;
818     }
819     qemu_mutex_unlock(&rs->bitmap_mutex);
820 
821     return ret;
822 }
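
/*
 * Worked example for the chunk arithmetic above: with 4KiB target pages
 * and a clear_bmap_shift of 18 (assumed here as the common default),
 * each clear_bmap bit covers 1 << (12 + 18) bytes = 1GiB, and "start"
 * is rounded down to that 1GiB boundary before the call to
 * memory_region_clear_dirty_bitmap().
 */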
823 
824 /* Called with RCU critical section */
825 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
826 {
827     rs->migration_dirty_pages +=
828         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
829                                               &rs->num_dirty_pages_period);
830 }
831 
832 /**
833  * ram_pagesize_summary: calculate all the pagesizes of a VM
834  *
835  * Returns a summary bitmap of the page sizes of all RAMBlocks
836  *
837  * For VMs with just normal pages this is equivalent to the host page
838  * size. If it's got some huge pages then it's the OR of all the
839  * different page sizes.
840  */
841 uint64_t ram_pagesize_summary(void)
842 {
843     RAMBlock *block;
844     uint64_t summary = 0;
845 
846     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
847         summary |= block->page_size;
848     }
849 
850     return summary;
851 }
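
/*
 * For example, a VM backed by 4KiB pages plus 2MiB hugepages returns
 * 0x1000 | 0x200000 = 0x201000.
 */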
852 
853 uint64_t ram_get_total_transferred_pages(void)
854 {
855     return  ram_counters.normal + ram_counters.duplicate +
856                 compression_counters.pages + xbzrle_counters.pages;
857 }
858 
859 static void migration_update_rates(RAMState *rs, int64_t end_time)
860 {
861     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
862     double compressed_size;
863 
864     /* calculate period counters */
865     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
866                 / (end_time - rs->time_last_bitmap_sync);
867 
868     if (!page_count) {
869         return;
870     }
871 
872     if (migrate_use_xbzrle()) {
873         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
874             rs->xbzrle_cache_miss_prev) / page_count;
875         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
876     }
877 
878     if (migrate_use_compression()) {
879         compression_counters.busy_rate = (double)(compression_counters.busy -
880             rs->compress_thread_busy_prev) / page_count;
881         rs->compress_thread_busy_prev = compression_counters.busy;
882 
883         compressed_size = compression_counters.compressed_size -
884                           rs->compressed_size_prev;
885         if (compressed_size) {
886             double uncompressed_size = (compression_counters.pages -
887                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
888 
889             /* Compression-Ratio = Uncompressed-size / Compressed-size */
890             compression_counters.compression_rate =
891                                         uncompressed_size / compressed_size;
892 
893             rs->compress_pages_prev = compression_counters.pages;
894             rs->compressed_size_prev = compression_counters.compressed_size;
895         }
896     }
897 }
898 
899 static void migration_trigger_throttle(RAMState *rs)
900 {
901     MigrationState *s = migrate_get_current();
902     uint64_t threshold = s->parameters.throttle_trigger_threshold;
903 
904     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
905     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
906     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
907 
908     /* During block migration the auto-converge logic incorrectly detects
909      * that ram migration makes no progress. Avoid this by disabling the
910      * throttling logic during the bulk phase of block migration. */
911     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
912         /* The following detection logic can be refined later. For now:
913            Check to see if the ratio between dirtied bytes and the approx.
914            amount of bytes that just got transferred since the last time
915            we were in this routine reaches the threshold. If that happens
916            twice, start or increase throttling. */
917 
918         if ((bytes_dirty_period > bytes_dirty_threshold) &&
919             (++rs->dirty_rate_high_cnt >= 2)) {
920             trace_migration_throttle();
921             rs->dirty_rate_high_cnt = 0;
922             mig_throttle_guest_down();
923         }
924     }
925 }
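
/*
 * Worked example: with throttle-trigger-threshold at 50 (assumed here as
 * the default), if 100MiB were sent since the last sync then
 * bytes_dirty_threshold is 50MiB; dirtying more than that in the same
 * period on two consecutive syncs calls mig_throttle_guest_down().
 */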
926 
927 static void migration_bitmap_sync(RAMState *rs)
928 {
929     RAMBlock *block;
930     int64_t end_time;
931 
932     ram_counters.dirty_sync_count++;
933 
934     if (!rs->time_last_bitmap_sync) {
935         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
936     }
937 
938     trace_migration_bitmap_sync_start();
939     memory_global_dirty_log_sync();
940 
941     qemu_mutex_lock(&rs->bitmap_mutex);
942     WITH_RCU_READ_LOCK_GUARD() {
943         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
944             ramblock_sync_dirty_bitmap(rs, block);
945         }
946         ram_counters.remaining = ram_bytes_remaining();
947     }
948     qemu_mutex_unlock(&rs->bitmap_mutex);
949 
950     memory_global_after_dirty_log_sync();
951     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
952 
953     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
954 
955     /* more than 1 second = 1000 milliseconds */
956     if (end_time > rs->time_last_bitmap_sync + 1000) {
957         migration_trigger_throttle(rs);
958 
959         migration_update_rates(rs, end_time);
960 
961         rs->target_page_count_prev = rs->target_page_count;
962 
963         /* reset period counters */
964         rs->time_last_bitmap_sync = end_time;
965         rs->num_dirty_pages_period = 0;
966         rs->bytes_xfer_prev = ram_counters.transferred;
967     }
968     if (migrate_use_events()) {
969         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
970     }
971 }
972 
973 static void migration_bitmap_sync_precopy(RAMState *rs)
974 {
975     Error *local_err = NULL;
976 
977     /*
978      * The current notifier usage is just an optimization for migration, so we
979      * don't stop the normal migration process in the error case.
980      */
981     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
982         error_report_err(local_err);
983         local_err = NULL;
984     }
985 
986     migration_bitmap_sync(rs);
987 
988     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
989         error_report_err(local_err);
990     }
991 }
992 
993 /**
994  * save_zero_page_to_file: send the zero page to the file
995  *
996  * Returns the size of the data written to the file, or 0 if the page is
997  * not a zero page
998  *
999  * @rs: current RAM state
1000  * @file: the file where the data is saved
1001  * @block: block that contains the page we want to send
1002  * @offset: offset inside the block for the page
1003  */
1004 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1005                                   RAMBlock *block, ram_addr_t offset)
1006 {
1007     uint8_t *p = block->host + offset;
1008     int len = 0;
1009 
1010     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1011         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1012         qemu_put_byte(file, 0);
1013         len += 1;
1014     }
1015     return len;
1016 }
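
/*
 * On the wire a zero page is thus just the page header with
 * RAM_SAVE_FLAG_ZERO set, followed by a single byte (the fill byte,
 * always 0 here).
 */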
1017 
1018 /**
1019  * save_zero_page: send the zero page to the stream
1020  *
1021  * Returns the number of pages written.
1022  *
1023  * @rs: current RAM state
1024  * @block: block that contains the page we want to send
1025  * @offset: offset inside the block for the page
1026  */
1027 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1028 {
1029     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1030 
1031     if (len) {
1032         ram_counters.duplicate++;
1033         ram_counters.transferred += len;
1034         return 1;
1035     }
1036     return -1;
1037 }
1038 
1039 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1040 {
1041     if (!migrate_release_ram() || !migration_in_postcopy()) {
1042         return;
1043     }
1044 
1045     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1046 }
1047 
1048 /*
1049  * @pages: the number of pages written by the control path,
1050  *        < 0 - error
1051  *        > 0 - number of pages written
1052  *
1053  * Return true if the page has been saved, otherwise false is returned.
1054  */
1055 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1056                               int *pages)
1057 {
1058     uint64_t bytes_xmit = 0;
1059     int ret;
1060 
1061     *pages = -1;
1062     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1063                                 &bytes_xmit);
1064     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1065         return false;
1066     }
1067 
1068     if (bytes_xmit) {
1069         ram_counters.transferred += bytes_xmit;
1070         *pages = 1;
1071     }
1072 
1073     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1074         return true;
1075     }
1076 
1077     if (bytes_xmit > 0) {
1078         ram_counters.normal++;
1079     } else if (bytes_xmit == 0) {
1080         ram_counters.duplicate++;
1081     }
1082 
1083     return true;
1084 }
1085 
1086 /*
1087  * directly send the page to the stream
1088  *
1089  * Returns the number of pages written.
1090  *
1091  * @rs: current RAM state
1092  * @block: block that contains the page we want to send
1093  * @offset: offset inside the block for the page
1094  * @buf: the page to be sent
1095  * @async: send the page asynchronously
1096  */
1097 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1098                             uint8_t *buf, bool async)
1099 {
1100     ram_counters.transferred += save_page_header(rs, rs->f, block,
1101                                                  offset | RAM_SAVE_FLAG_PAGE);
1102     if (async) {
1103         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1104                               migrate_release_ram() &&
1105                               migration_in_postcopy());
1106     } else {
1107         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1108     }
1109     ram_counters.transferred += TARGET_PAGE_SIZE;
1110     ram_counters.normal++;
1111     return 1;
1112 }
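
/*
 * On the wire this is the page header with RAM_SAVE_FLAG_PAGE set,
 * followed by TARGET_PAGE_SIZE bytes of raw page content.
 */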
1113 
1114 /**
1115  * ram_save_page: send the given page to the stream
1116  *
1117  * Returns the number of pages written.
1118  *          < 0 - error
1119  *          >=0 - Number of pages written - this might legally be 0
1120  *                if xbzrle noticed the page was the same.
1121  *
1122  * @rs: current RAM state
1123  * @block: block that contains the page we want to send
1124  * @offset: offset inside the block for the page
1125  * @last_stage: if we are at the completion stage
1126  */
1127 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1128 {
1129     int pages = -1;
1130     uint8_t *p;
1131     bool send_async = true;
1132     RAMBlock *block = pss->block;
1133     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1134     ram_addr_t current_addr = block->offset + offset;
1135 
1136     p = block->host + offset;
1137     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1138 
1139     XBZRLE_cache_lock();
1140     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1141         migrate_use_xbzrle()) {
1142         pages = save_xbzrle_page(rs, &p, current_addr, block,
1143                                  offset, last_stage);
1144         if (!last_stage) {
1145             /* Can't send this cached data async, since the cache page
1146              * might get updated before it gets to the wire
1147              */
1148             send_async = false;
1149         }
1150     }
1151 
1152     /* XBZRLE overflow or normal page */
1153     if (pages == -1) {
1154         pages = save_normal_page(rs, block, offset, p, send_async);
1155     }
1156 
1157     XBZRLE_cache_unlock();
1158 
1159     return pages;
1160 }
1161 
1162 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1163                                  ram_addr_t offset)
1164 {
1165     if (multifd_queue_page(rs->f, block, offset) < 0) {
1166         return -1;
1167     }
1168     ram_counters.normal++;
1169 
1170     return 1;
1171 }
1172 
1173 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1174                                  ram_addr_t offset, uint8_t *source_buf)
1175 {
1176     RAMState *rs = ram_state;
1177     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1178     bool zero_page = false;
1179     int ret;
1180 
1181     if (save_zero_page_to_file(rs, f, block, offset)) {
1182         zero_page = true;
1183         goto exit;
1184     }
1185 
1186     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1187 
1188     /*
1189      * copy it to an internal buffer to avoid it being modified by the VM
1190      * so that we can catch the error during compression and
1191      * decompression
1192      */
1193     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1194     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1195     if (ret < 0) {
1196         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1197         error_report("compressed data failed!");
1198         return false;
1199     }
1200 
1201 exit:
1202     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1203     return zero_page;
1204 }
1205 
1206 static void
1207 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1208 {
1209     ram_counters.transferred += bytes_xmit;
1210 
1211     if (param->zero_page) {
1212         ram_counters.duplicate++;
1213         return;
1214     }
1215 
1216     /* 8 is the size of a page header with RAM_SAVE_FLAG_CONTINUE. */
1217     compression_counters.compressed_size += bytes_xmit - 8;
1218     compression_counters.pages++;
1219 }
1220 
1221 static bool save_page_use_compression(RAMState *rs);
1222 
1223 static void flush_compressed_data(RAMState *rs)
1224 {
1225     int idx, len, thread_count;
1226 
1227     if (!save_page_use_compression(rs)) {
1228         return;
1229     }
1230     thread_count = migrate_compress_threads();
1231 
1232     qemu_mutex_lock(&comp_done_lock);
1233     for (idx = 0; idx < thread_count; idx++) {
1234         while (!comp_param[idx].done) {
1235             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1236         }
1237     }
1238     qemu_mutex_unlock(&comp_done_lock);
1239 
1240     for (idx = 0; idx < thread_count; idx++) {
1241         qemu_mutex_lock(&comp_param[idx].mutex);
1242         if (!comp_param[idx].quit) {
1243             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1244             /*
1245              * it's safe to fetch zero_page without holding comp_done_lock
1246              * as there is no further request submitted to the thread,
1247              * i.e., the thread should be waiting for a request at this point.
1248              */
1249             update_compress_thread_counts(&comp_param[idx], len);
1250         }
1251         qemu_mutex_unlock(&comp_param[idx].mutex);
1252     }
1253 }
1254 
1255 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1256                                        ram_addr_t offset)
1257 {
1258     param->block = block;
1259     param->offset = offset;
1260 }
1261 
1262 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1263                                            ram_addr_t offset)
1264 {
1265     int idx, thread_count, bytes_xmit = -1, pages = -1;
1266     bool wait = migrate_compress_wait_thread();
1267 
1268     thread_count = migrate_compress_threads();
1269     qemu_mutex_lock(&comp_done_lock);
1270 retry:
1271     for (idx = 0; idx < thread_count; idx++) {
1272         if (comp_param[idx].done) {
1273             comp_param[idx].done = false;
1274             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1275             qemu_mutex_lock(&comp_param[idx].mutex);
1276             set_compress_params(&comp_param[idx], block, offset);
1277             qemu_cond_signal(&comp_param[idx].cond);
1278             qemu_mutex_unlock(&comp_param[idx].mutex);
1279             pages = 1;
1280             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1281             break;
1282         }
1283     }
1284 
1285     /*
1286              * wait for a free thread if the user specifies 'compress-wait-thread',
1287              * otherwise we will post the page out in the main thread as a normal page.
1288      */
1289     if (pages < 0 && wait) {
1290         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1291         goto retry;
1292     }
1293     qemu_mutex_unlock(&comp_done_lock);
1294 
1295     return pages;
1296 }
1297 
1298 /**
1299  * find_dirty_block: find the next dirty page and update any state
1300  * associated with the search process.
1301  *
1302  * Returns true if a page is found
1303  *
1304  * @rs: current RAM state
1305  * @pss: data about the state of the current dirty page scan
1306  * @again: set to false if the search has scanned the whole of RAM
1307  */
1308 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1309 {
1310     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1311     if (pss->complete_round && pss->block == rs->last_seen_block &&
1312         pss->page >= rs->last_page) {
1313         /*
1314          * We've been once around the RAM and haven't found anything.
1315          * Give up.
1316          */
1317         *again = false;
1318         return false;
1319     }
1320     if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1321         >= pss->block->used_length) {
1322         /* Didn't find anything in this RAM Block */
1323         pss->page = 0;
1324         pss->block = QLIST_NEXT_RCU(pss->block, next);
1325         if (!pss->block) {
1326             /*
1327              * If memory migration starts over, we will meet a dirtied page
1328              * which may still exist in the compression threads' ring, so we
1329              * should flush the compressed data to make sure the new page
1330              * is not overwritten by the old one in the destination.
1331              *
1332              * Also, if xbzrle is on, stop using the data compression at this
1333              * point. In theory, xbzrle can do better than compression.
1334              */
1335             flush_compressed_data(rs);
1336 
1337             /* Hit the end of the list */
1338             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1339             /* Flag that we've looped */
1340             pss->complete_round = true;
1341             rs->ram_bulk_stage = false;
1342         }
1343         /* Didn't find anything this time, but try again on the new block */
1344         *again = true;
1345         return false;
1346     } else {
1347         /* Can go around again, but... */
1348         *again = true;
1349         /* We've found something so probably don't need to */
1350         return true;
1351     }
1352 }
1353 
1354 /**
1355  * unqueue_page: gets a page of the queue
1356  *
1357  * Helper for 'get_queued_page' - gets a page off the queue
1358  *
1359  * Returns the block of the page (or NULL if none available)
1360  *
1361  * @rs: current RAM state
1362  * @offset: used to return the offset within the RAMBlock
1363  */
1364 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1365 {
1366     RAMBlock *block = NULL;
1367 
1368     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1369         return NULL;
1370     }
1371 
1372     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1373     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1374         struct RAMSrcPageRequest *entry =
1375                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1376         block = entry->rb;
1377         *offset = entry->offset;
1378 
1379         if (entry->len > TARGET_PAGE_SIZE) {
1380             entry->len -= TARGET_PAGE_SIZE;
1381             entry->offset += TARGET_PAGE_SIZE;
1382         } else {
1383             memory_region_unref(block->mr);
1384             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1385             g_free(entry);
1386             migration_consume_urgent_request();
1387         }
1388     }
1389 
1390     return block;
1391 }
1392 
1393 /**
1394  * get_queued_page: unqueue a page from the postcopy requests
1395  *
1396  * Skips pages that are already sent (!dirty)
1397  *
1398  * Returns true if a queued page is found
1399  *
1400  * @rs: current RAM state
1401  * @pss: data about the state of the current dirty page scan
1402  */
1403 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1404 {
1405     RAMBlock  *block;
1406     ram_addr_t offset;
1407     bool dirty;
1408 
1409     do {
1410         block = unqueue_page(rs, &offset);
1411         /*
1412          * We're sending this page, and since it's postcopy nothing else
1413          * will dirty it, and we must make sure it doesn't get sent again
1414          * even if this queue request was received after the background
1415          * search already sent it.
1416          */
1417         if (block) {
1418             unsigned long page;
1419 
1420             page = offset >> TARGET_PAGE_BITS;
1421             dirty = test_bit(page, block->bmap);
1422             if (!dirty) {
1423                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1424                                                 page);
1425             } else {
1426                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1427             }
1428         }
1429 
1430     } while (block && !dirty);
1431 
1432     if (block) {
1433         /*
1434          * As soon as we start servicing pages out of order, then we have
1435          * to kill the bulk stage, since the bulk stage assumes
1436          * in (migration_bitmap_find_and_reset_dirty) that every page is
1437          * dirty, which is no longer true.
1438          */
1439         rs->ram_bulk_stage = false;
1440 
1441         /*
1442          * We want the background search to continue from the queued page
1443          * since the guest is likely to want other pages near to the page
1444          * it just requested.
1445          */
1446         pss->block = block;
1447         pss->page = offset >> TARGET_PAGE_BITS;
1448 
1449         /*
1450          * This unqueued page would break the "one round" check, even if
1451          * it is really rare.
1452          */
1453         pss->complete_round = false;
1454     }
1455 
1456     return !!block;
1457 }
1458 
1459 /**
1460  * migration_page_queue_free: drop any remaining pages in the ram
1461  * request queue
1462  *
1463  * It should be empty at the end anyway, but in error cases there may
1464  * be some left.  In case there are any pages left, we drop them.
1465  *
1466  */
1467 static void migration_page_queue_free(RAMState *rs)
1468 {
1469     struct RAMSrcPageRequest *mspr, *next_mspr;
1470     /* This queue generally should be empty - but in the case of a failed
1471      * migration it might have some leftovers in it.
1472      */
1473     RCU_READ_LOCK_GUARD();
1474     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1475         memory_region_unref(mspr->rb->mr);
1476         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1477         g_free(mspr);
1478     }
1479 }
1480 
1481 /**
1482  * ram_save_queue_pages: queue the page for transmission
1483  *
1484  * A request from postcopy destination for example.
1485  *
1486  * Returns zero on success or negative on error
1487  *
1488  * @rbname: Name of the RAMBlock of the request. NULL means the
1489  *          same as the last one.
1490  * @start: starting address from the start of the RAMBlock
1491  * @len: length (in bytes) to send
1492  */
1493 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1494 {
1495     RAMBlock *ramblock;
1496     RAMState *rs = ram_state;
1497 
1498     ram_counters.postcopy_requests++;
1499     RCU_READ_LOCK_GUARD();
1500 
1501     if (!rbname) {
1502         /* Reuse last RAMBlock */
1503         ramblock = rs->last_req_rb;
1504 
1505         if (!ramblock) {
1506             /*
1507              * Shouldn't happen, we can't reuse the last RAMBlock if
1508              * it's the 1st request.
1509              */
1510             error_report("ram_save_queue_pages no previous block");
1511             return -1;
1512         }
1513     } else {
1514         ramblock = qemu_ram_block_by_name(rbname);
1515 
1516         if (!ramblock) {
1517             /* We shouldn't be asked for a non-existent RAMBlock */
1518             error_report("ram_save_queue_pages no block '%s'", rbname);
1519             return -1;
1520         }
1521         rs->last_req_rb = ramblock;
1522     }
1523     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1524     if (start+len > ramblock->used_length) {
1525         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1526                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1527                      __func__, start, len, ramblock->used_length);
1528         return -1;
1529     }
1530 
1531     struct RAMSrcPageRequest *new_entry =
1532         g_malloc0(sizeof(struct RAMSrcPageRequest));
1533     new_entry->rb = ramblock;
1534     new_entry->offset = start;
1535     new_entry->len = len;
1536 
1537     memory_region_ref(ramblock->mr);
1538     qemu_mutex_lock(&rs->src_page_req_mutex);
1539     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1540     migration_make_urgent_request();
1541     qemu_mutex_unlock(&rs->src_page_req_mutex);
1542 
1543     return 0;
1544 }
1545 
1546 static bool save_page_use_compression(RAMState *rs)
1547 {
1548     if (!migrate_use_compression()) {
1549         return false;
1550     }
1551 
1552     /*
1553      * If xbzrle is on, stop using the data compression after the first
1554      * round of migration even if compression is enabled. In theory,
1555      * xbzrle can do better than compression.
1556      */
1557     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1558         return true;
1559     }
1560 
1561     return false;
1562 }
1563 
1564 /*
1565  * try to compress the page before posting it out, return true if the page
1566  * has been properly handled by compression, otherwise it needs other
1567  * paths to handle it
1568  */
1569 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1570 {
1571     if (!save_page_use_compression(rs)) {
1572         return false;
1573     }
1574 
1575     /*
1576      * When starting the process of a new block, the first page of
1577      * the block should be sent out before other pages in the same
1578      * block, and all the pages in the last block should have been sent
1579      * out.  Keeping this order is important, because the 'cont' flag
1580      * is used to avoid resending the block name.
1581      *
1582      * We post the first page as a normal page because compression will
1583      * take much CPU resource.
1584      */
1585     if (block != rs->last_sent_block) {
1586         flush_compressed_data(rs);
1587         return false;
1588     }
1589 
1590     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1591         return true;
1592     }
1593 
1594     compression_counters.busy++;
1595     return false;
1596 }
1597 
1598 /**
1599  * ram_save_target_page: save one target page
1600  *
1601  * Returns the number of pages written
1602  *
1603  * @rs: current RAM state
1604  * @pss: data about the page we want to send
1605  * @last_stage: if we are at the completion stage
1606  */
1607 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1608                                 bool last_stage)
1609 {
1610     RAMBlock *block = pss->block;
1611     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1612     int res;
1613 
1614     if (control_save_page(rs, block, offset, &res)) {
1615         return res;
1616     }
1617 
1618     if (save_compress_page(rs, block, offset)) {
1619         return 1;
1620     }
1621 
1622     res = save_zero_page(rs, block, offset);
1623     if (res > 0) {
1624         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1625          * page would be stale
1626          */
1627         if (!save_page_use_compression(rs)) {
1628             XBZRLE_cache_lock();
1629             xbzrle_cache_zero_page(rs, block->offset + offset);
1630             XBZRLE_cache_unlock();
1631         }
1632         ram_release_pages(block->idstr, offset, res);
1633         return res;
1634     }
1635 
1636     /*
1637      * Do not use multifd for:
1638      * 1. Compression, as the first page in a new block should be posted out
1639      *    before sending the compressed page.
1640      * 2. Postcopy, as one whole host page should be placed atomically.
1641      */
1642     if (!save_page_use_compression(rs) && migrate_use_multifd()
1643         && !migration_in_postcopy()) {
1644         return ram_save_multifd_page(rs, block, offset);
1645     }
1646 
1647     return ram_save_page(rs, pss, last_stage);
1648 }
1649 
1650 /**
1651  * ram_save_host_page: save a whole host page
1652  *
1653  * Starting at *offset send pages up to the end of the current host
1654  * page. It's valid for the initial offset to point into the middle of
1655  * a host page in which case the remainder of the hostpage is sent.
1656  * Only dirty target pages are sent. Note that the host page size may
1657  * be a huge page for this block.
1658  * The saving stops at the boundary of the used_length of the block
1659  * if the RAMBlock isn't a multiple of the host page size.
1660  *
1661  * Returns the number of pages written or negative on error
1662  *
1663  * @rs: current RAM state
1665  * @pss: data about the page we want to send
1666  * @last_stage: if we are at the completion stage
1667  */
1668 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1669                               bool last_stage)
1670 {
1671     int tmppages, pages = 0;
1672     size_t pagesize_bits =
1673         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1674 
1675     if (ramblock_is_ignored(pss->block)) {
1676         error_report("block %s should not be migrated !", pss->block->idstr);
1677         return 0;
1678     }
1679 
1680     do {
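    /*
     * Walk the target pages that make up this host page and send the dirty
     * ones.  As an illustration, with 2MB hugepages and 4KB target pages,
     * pagesize_bits is 512, so up to 512 target pages are examined before
     * the loop stops at the host-page boundary.
     */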
1681         /* Check the pages is dirty and if it is send it */
1682         /* Check if the page is dirty and, if so, send it */
1683             pss->page++;
1684             continue;
1685         }
1686 
1687         tmppages = ram_save_target_page(rs, pss, last_stage);
1688         if (tmppages < 0) {
1689             return tmppages;
1690         }
1691 
1692         pages += tmppages;
1693         pss->page++;
1694         /* Allow rate limiting to happen in the middle of huge pages */
1695         migration_rate_limit();
1696     } while ((pss->page & (pagesize_bits - 1)) &&
1697              offset_in_ramblock(pss->block,
1698                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
1699 
1700     /* The offset we leave with is the last one we looked at */
1701     pss->page--;
1702     return pages;
1703 }
1704 
1705 /**
1706  * ram_find_and_save_block: finds a dirty page and sends it to f
1707  *
1708  * Called within an RCU critical section.
1709  *
1710  * Returns the number of pages written where zero means no dirty pages,
1711  * or negative on error
1712  *
1713  * @rs: current RAM state
1714  * @last_stage: if we are at the completion stage
1715  *
1716  * On systems where host-page-size > target-page-size it will send all the
1717  * pages in a host page that are dirty.
1718  */
1719 
1720 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1721 {
1722     PageSearchStatus pss;
1723     int pages = 0;
1724     bool again, found;
1725 
1726     /* No dirty page as there is zero RAM */
1727     if (!ram_bytes_total()) {
1728         return pages;
1729     }
1730 
1731     pss.block = rs->last_seen_block;
1732     pss.page = rs->last_page;
1733     pss.complete_round = false;
1734 
1735     if (!pss.block) {
1736         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1737     }
1738 
1739     do {
1740         again = true;
1741         found = get_queued_page(rs, &pss);
1742 
1743         if (!found) {
1744             /* priority queue empty, so just search for something dirty */
1745             found = find_dirty_block(rs, &pss, &again);
1746         }
1747 
1748         if (found) {
1749             pages = ram_save_host_page(rs, &pss, last_stage);
1750         }
1751     } while (!pages && again);
1752 
1753     rs->last_seen_block = pss.block;
1754     rs->last_page = pss.page;
1755 
1756     return pages;
1757 }
1758 
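/*
 * Update accounting for pages transferred outside the normal save path
 * (the RDMA transport, for instance, is expected to report its progress
 * this way): bump the duplicate/normal counters and, for non-zero pages,
 * advance the QEMUFile position by @size bytes.
 */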
1759 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1760 {
1761     uint64_t pages = size / TARGET_PAGE_SIZE;
1762 
1763     if (zero) {
1764         ram_counters.duplicate += pages;
1765     } else {
1766         ram_counters.normal += pages;
1767         ram_counters.transferred += size;
1768         qemu_update_position(f, size);
1769     }
1770 }
1771 
1772 static uint64_t ram_bytes_total_common(bool count_ignored)
1773 {
1774     RAMBlock *block;
1775     uint64_t total = 0;
1776 
1777     RCU_READ_LOCK_GUARD();
1778 
1779     if (count_ignored) {
1780         RAMBLOCK_FOREACH_MIGRATABLE(block) {
1781             total += block->used_length;
1782         }
1783     } else {
1784         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1785             total += block->used_length;
1786         }
1787     }
1788     return total;
1789 }
1790 
1791 uint64_t ram_bytes_total(void)
1792 {
1793     return ram_bytes_total_common(false);
1794 }
1795 
1796 static void xbzrle_load_setup(void)
1797 {
1798     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1799 }
1800 
1801 static void xbzrle_load_cleanup(void)
1802 {
1803     g_free(XBZRLE.decoded_buf);
1804     XBZRLE.decoded_buf = NULL;
1805 }
1806 
1807 static void ram_state_cleanup(RAMState **rsp)
1808 {
1809     if (*rsp) {
1810         migration_page_queue_free(*rsp);
1811         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1812         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1813         g_free(*rsp);
1814         *rsp = NULL;
1815     }
1816 }
1817 
1818 static void xbzrle_cleanup(void)
1819 {
1820     XBZRLE_cache_lock();
1821     if (XBZRLE.cache) {
1822         cache_fini(XBZRLE.cache);
1823         g_free(XBZRLE.encoded_buf);
1824         g_free(XBZRLE.current_buf);
1825         g_free(XBZRLE.zero_target_page);
1826         XBZRLE.cache = NULL;
1827         XBZRLE.encoded_buf = NULL;
1828         XBZRLE.current_buf = NULL;
1829         XBZRLE.zero_target_page = NULL;
1830     }
1831     XBZRLE_cache_unlock();
1832 }
1833 
1834 static void ram_save_cleanup(void *opaque)
1835 {
1836     RAMState **rsp = opaque;
1837     RAMBlock *block;
1838 
1839     /* The caller holds the iothread lock or is in a bottom half, so there
1840      * is no write race against the migration bitmap.
1841      */
1842     memory_global_dirty_log_stop();
1843 
1844     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1845         g_free(block->clear_bmap);
1846         block->clear_bmap = NULL;
1847         g_free(block->bmap);
1848         block->bmap = NULL;
1849     }
1850 
1851     xbzrle_cleanup();
1852     compress_threads_save_cleanup();
1853     ram_state_cleanup(rsp);
1854 }
1855 
1856 static void ram_state_reset(RAMState *rs)
1857 {
1858     rs->last_seen_block = NULL;
1859     rs->last_sent_block = NULL;
1860     rs->last_page = 0;
1861     rs->last_version = ram_list.version;
1862     rs->ram_bulk_stage = true;
1863     rs->fpo_enabled = false;
1864 }
1865 
1866 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1867 
1868 /*
1869  * 'expected' is the value you expect the bitmap mostly to be full
1870  * of; it won't bother printing lines that are all this value.
1871  * If 'todump' is null the migration bitmap is dumped.
1872  */
1873 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1874                            unsigned long pages)
1875 {
1876     int64_t cur;
1877     int64_t linelen = 128;
1878     char linebuf[129];
1879 
1880     for (cur = 0; cur < pages; cur += linelen) {
1881         int64_t curb;
1882         bool found = false;
1883         /*
1884          * Last line; catch the case where the line length
1885          * is longer than remaining ram
1886          */
1887         if (cur + linelen > pages) {
1888             linelen = pages - cur;
1889         }
1890         for (curb = 0; curb < linelen; curb++) {
1891             bool thisbit = test_bit(cur + curb, todump);
1892             linebuf[curb] = thisbit ? '1' : '.';
1893             found = found || (thisbit != expected);
1894         }
1895         if (found) {
1896             linebuf[curb] = '\0';
1897             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1898         }
1899     }
1900 }
1901 
1902 /* **** functions for postcopy ***** */
1903 
1904 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1905 {
1906     struct RAMBlock *block;
1907 
1908     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1909         unsigned long *bitmap = block->bmap;
1910         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1911         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1912 
1913         while (run_start < range) {
1914             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1915             ram_discard_range(block->idstr,
1916                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
1917                               ((ram_addr_t)(run_end - run_start))
1918                                 << TARGET_PAGE_BITS);
1919             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1920         }
1921     }
1922 }
1923 
1924 /**
1925  * postcopy_send_discard_bm_ram: discard a RAMBlock
1926  *
1927  * Returns zero on success
1928  *
1929  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1930  *
1931  * @ms: current migration state
1932  * @block: RAMBlock to discard
1933  */
1934 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
1935 {
1936     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1937     unsigned long current;
1938     unsigned long *bitmap = block->bmap;
1939 
1940     for (current = 0; current < end; ) {
1941         unsigned long one = find_next_bit(bitmap, end, current);
1942         unsigned long zero, discard_length;
1943 
1944         if (one >= end) {
1945             break;
1946         }
1947 
1948         zero = find_next_zero_bit(bitmap, end, one + 1);
1949 
1950         if (zero >= end) {
1951             discard_length = end - one;
1952         } else {
1953             discard_length = zero - one;
1954         }
1955         postcopy_discard_send_range(ms, one, discard_length);
1956         current = one + discard_length;
1957     }
1958 
1959     return 0;
1960 }
1961 
1962 /**
1963  * postcopy_each_ram_send_discard: discard all RAMBlocks
1964  *
1965  * Returns 0 for success or negative for error
1966  *
1967  * Utility for the outgoing postcopy code.
1968  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1969  *   passing it bitmap indexes and name.
1970  * (qemu_ram_foreach_block ends up passing unscaled lengths
1971  *  which would mean postcopy code would have to deal with target page)
1972  *
1973  * @ms: current migration state
1974  */
1975 static int postcopy_each_ram_send_discard(MigrationState *ms)
1976 {
1977     struct RAMBlock *block;
1978     int ret;
1979 
1980     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1981         postcopy_discard_send_init(ms, block->idstr);
1982 
1983         /*
1984          * Postcopy sends chunks of bitmap over the wire, but it
1985          * just needs indexes at this point, which avoids it having
1986          * target-page specific code.
1987          */
1988         ret = postcopy_send_discard_bm_ram(ms, block);
1989         postcopy_discard_send_finish(ms);
1990         if (ret) {
1991             return ret;
1992         }
1993     }
1994 
1995     return 0;
1996 }
1997 
1998 /**
1999  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2000  *
2001  * Helper for postcopy_chunk_hostpages; it's called twice to
2002  * Helper for postcopy_chunk_hostpages; it canonicalizes the block's
2003  * dirty bitmap at host-page granularity.
2005  * Postcopy requires that all target pages in a hostpage are dirty or
2006  * clean, not a mix.  This function canonicalizes the bitmaps.
2007  *
2008  * @ms: current migration state
2009  * @block: block that contains the page we want to canonicalize
2010  */
2011 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2012 {
2013     RAMState *rs = ram_state;
2014     unsigned long *bitmap = block->bmap;
2015     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2016     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2017     unsigned long run_start;
2018 
2019     if (block->page_size == TARGET_PAGE_SIZE) {
2020         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2021         return;
2022     }
2023 
2024     /* Find a dirty page */
2025     run_start = find_next_bit(bitmap, pages, 0);
2026 
2027     while (run_start < pages) {
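    /*
     * Illustrative example (assuming a 2MB hugepage RAMBlock with 4KB
     * target pages, i.e. host_ratio == 512): whenever a run of dirty
     * target pages starts or ends part-way through a host page, the loop
     * below re-marks every target page of that host page as dirty so the
     * whole host page gets sent.
     */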
2028 
2029         /*
2030          * If the start of this run of pages is in the middle of a host
2031          * page, then we need to fix up this host page.
2032          */
2033         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2034             /* Find the end of this run */
2035             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2036             /*
2037              * If the end isn't at the start of a host page, then the
2038              * run doesn't finish at the end of a host page
2039              * and we need to discard.
2040              */
2041         }
2042 
2043         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2044             unsigned long page;
2045             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2046                                                              host_ratio);
2047             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2048 
2049             /* Clean up the bitmap */
2050             for (page = fixup_start_addr;
2051                  page < fixup_start_addr + host_ratio; page++) {
2052                 /*
2053                  * Remark them as dirty, updating the count for any pages
2054                  * that weren't previously dirty.
2055                  */
2056                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2057             }
2058         }
2059 
2060         /* Find the next dirty page for the next iteration */
2061         run_start = find_next_bit(bitmap, pages, run_start);
2062     }
2063 }
2064 
2065 /**
2066  * postcopy_chunk_hostpages: discard any partially sent host page
2067  *
2068  * Utility for the outgoing postcopy code.
2069  *
2070  * Discard any partially sent host-page size chunks, mark any partially
2071  * dirty host-page size chunks as all dirty.  In this case the host-page
2072  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2073  *
2074  * Returns zero on success
2075  *
2076  * @ms: current migration state
2077  * @block: block we want to work with
2078  */
2079 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2080 {
2081     postcopy_discard_send_init(ms, block->idstr);
2082 
2083     /*
2084      * Ensure that all partially dirty host pages are made fully dirty.
2085      */
2086     postcopy_chunk_hostpages_pass(ms, block);
2087 
2088     postcopy_discard_send_finish(ms);
2089     return 0;
2090 }
2091 
2092 /**
2093  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2094  *
2095  * Returns zero on success
2096  *
2097  * Transmit the set of pages to be discarded after precopy to the target;
2098  * these are pages that:
2099  *     a) Have been previously transmitted but are now dirty again
2100  *     b) Have never been transmitted; this ensures that any pages on the
2101  *        destination that have been mapped by background tasks get
2102  *        discarded (transparent huge pages are the specific concern)
2103  * Hopefully this is pretty sparse.
2104  *
2105  * @ms: current migration state
2106  */
2107 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2108 {
2109     RAMState *rs = ram_state;
2110     RAMBlock *block;
2111     int ret;
2112 
2113     RCU_READ_LOCK_GUARD();
2114 
2115     /* This should be our last sync, the src is now paused */
2116     migration_bitmap_sync(rs);
2117 
2118     /* Easiest way to make sure we don't resume in the middle of a host-page */
2119     rs->last_seen_block = NULL;
2120     rs->last_sent_block = NULL;
2121     rs->last_page = 0;
2122 
2123     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2124         /* Deal with TPS != HPS and huge pages */
2125         ret = postcopy_chunk_hostpages(ms, block);
2126         if (ret) {
2127             return ret;
2128         }
2129 
2130 #ifdef DEBUG_POSTCOPY
2131         ram_debug_dump_bitmap(block->bmap, true,
2132                               block->used_length >> TARGET_PAGE_BITS);
2133 #endif
2134     }
2135     trace_ram_postcopy_send_discard_bitmap();
2136 
2137     return postcopy_each_ram_send_discard(ms);
2138 }
2139 
2140 /**
2141  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2142  *
2143  * Returns zero on success
2144  *
2145  * @rbname: name of the RAMBlock of the request. NULL means the
2146  *          same as the last one.
2147  * @start: RAMBlock starting page
2148  * @length: RAMBlock size
2149  */
2150 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2151 {
2152     trace_ram_discard_range(rbname, start, length);
2153 
2154     RCU_READ_LOCK_GUARD();
2155     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2156 
2157     if (!rb) {
2158         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2159         return -1;
2160     }
2161 
2162     /*
2163      * On source VM, we don't need to update the received bitmap since
2164      * we don't even have one.
2165      */
2166     if (rb->receivedmap) {
2167         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2168                      length >> qemu_target_page_bits());
2169     }
2170 
2171     return ram_block_discard_range(rb, start, length);
2172 }
2173 
2174 /*
2175  * For every allocation, we try not to crash the VM if the
2176  * allocation fails.
2177  */
2178 static int xbzrle_init(void)
2179 {
2180     Error *local_err = NULL;
2181 
2182     if (!migrate_use_xbzrle()) {
2183         return 0;
2184     }
2185 
2186     XBZRLE_cache_lock();
2187 
2188     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2189     if (!XBZRLE.zero_target_page) {
2190         error_report("%s: Error allocating zero page", __func__);
2191         goto err_out;
2192     }
2193 
2194     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2195                               TARGET_PAGE_SIZE, &local_err);
2196     if (!XBZRLE.cache) {
2197         error_report_err(local_err);
2198         goto free_zero_page;
2199     }
2200 
2201     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2202     if (!XBZRLE.encoded_buf) {
2203         error_report("%s: Error allocating encoded_buf", __func__);
2204         goto free_cache;
2205     }
2206 
2207     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2208     if (!XBZRLE.current_buf) {
2209         error_report("%s: Error allocating current_buf", __func__);
2210         goto free_encoded_buf;
2211     }
2212 
2213     /* We are all good */
2214     XBZRLE_cache_unlock();
2215     return 0;
2216 
2217 free_encoded_buf:
2218     g_free(XBZRLE.encoded_buf);
2219     XBZRLE.encoded_buf = NULL;
2220 free_cache:
2221     cache_fini(XBZRLE.cache);
2222     XBZRLE.cache = NULL;
2223 free_zero_page:
2224     g_free(XBZRLE.zero_target_page);
2225     XBZRLE.zero_target_page = NULL;
2226 err_out:
2227     XBZRLE_cache_unlock();
2228     return -ENOMEM;
2229 }
2230 
2231 static int ram_state_init(RAMState **rsp)
2232 {
2233     *rsp = g_try_new0(RAMState, 1);
2234 
2235     if (!*rsp) {
2236         error_report("%s: Init ramstate fail", __func__);
2237         return -1;
2238     }
2239 
2240     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2241     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2242     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2243 
2244     /*
2245      * Count the total number of pages used by ram blocks not including any
2246      * gaps due to alignment or unplugs.
2247      * This must match the initial values of the dirty bitmaps.
2248      */
2249     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2250     ram_state_reset(*rsp);
2251 
2252     return 0;
2253 }
2254 
2255 static void ram_list_init_bitmaps(void)
2256 {
2257     MigrationState *ms = migrate_get_current();
2258     RAMBlock *block;
2259     unsigned long pages;
2260     uint8_t shift;
2261 
2262     /* Skip setting bitmap if there is no RAM */
2263     if (ram_bytes_total()) {
2264         shift = ms->clear_bitmap_shift;
2265         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2266             error_report("clear_bitmap_shift (%u) too big, using "
2267                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2268             shift = CLEAR_BITMAP_SHIFT_MAX;
2269         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2270             error_report("clear_bitmap_shift (%u) too small, using "
2271                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2272             shift = CLEAR_BITMAP_SHIFT_MIN;
2273         }
2274 
2275         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2276             pages = block->max_length >> TARGET_PAGE_BITS;
2277             /*
2278              * The initial dirty bitmap for migration must be set with all
2279              * ones to make sure we'll migrate every guest RAM page to the
2280              * destination.
2281              * Here we set RAMBlock.bmap all to 1 because when we restart a
2282              * migration after a failed one, ram_list.
2283              * dirty_memory[DIRTY_MEMORY_MIGRATION] may not include the whole
2284              * guest memory.
2285              */
2286             block->bmap = bitmap_new(pages);
2287             bitmap_set(block->bmap, 0, pages);
2288             block->clear_bmap_shift = shift;
2289             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2290         }
2291     }
2292 }
2293 
2294 static void ram_init_bitmaps(RAMState *rs)
2295 {
2296     /* For memory_global_dirty_log_start below.  */
2297     qemu_mutex_lock_iothread();
2298     qemu_mutex_lock_ramlist();
2299 
2300     WITH_RCU_READ_LOCK_GUARD() {
2301         ram_list_init_bitmaps();
2302         memory_global_dirty_log_start();
2303         migration_bitmap_sync_precopy(rs);
2304     }
2305     qemu_mutex_unlock_ramlist();
2306     qemu_mutex_unlock_iothread();
2307 }
2308 
2309 static int ram_init_all(RAMState **rsp)
2310 {
2311     if (ram_state_init(rsp)) {
2312         return -1;
2313     }
2314 
2315     if (xbzrle_init()) {
2316         ram_state_cleanup(rsp);
2317         return -1;
2318     }
2319 
2320     ram_init_bitmaps(*rsp);
2321 
2322     return 0;
2323 }
2324 
2325 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2326 {
2327     RAMBlock *block;
2328     uint64_t pages = 0;
2329 
2330     /*
2331      * Postcopy does not use xbzrle/compression, so no need for that.
2332      * Also, since the source is already halted, we don't need to care
2333      * about dirty page logging either.
2334      */
2335 
2336     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2337         pages += bitmap_count_one(block->bmap,
2338                                   block->used_length >> TARGET_PAGE_BITS);
2339     }
2340 
2341     /* This may not be aligned with current bitmaps. Recalculate. */
2342     rs->migration_dirty_pages = pages;
2343 
2344     rs->last_seen_block = NULL;
2345     rs->last_sent_block = NULL;
2346     rs->last_page = 0;
2347     rs->last_version = ram_list.version;
2348     /*
2349      * Disable the bulk stage, otherwise we'd resend all of the RAM no
2350      * matter what we have already sent.
2351      */
2352     rs->ram_bulk_stage = false;
2353 
2354     /* Update RAMState cache of output QEMUFile */
2355     rs->f = out;
2356 
2357     trace_ram_state_resume_prepare(pages);
2358 }
2359 
2360 /*
2361  * This function clears bits of the free pages reported by the caller from the
2362  * migration dirty bitmap. @addr is the host address corresponding to the
2363  * start of the contiguous guest free pages, and @len is the total bytes of
2364  * those pages.
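 * (Callers are expected to be free-page reporting paths, e.g. the
 * virtio-balloon free page hinting code.)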
2365  */
2366 void qemu_guest_free_page_hint(void *addr, size_t len)
2367 {
2368     RAMBlock *block;
2369     ram_addr_t offset;
2370     size_t used_len, start, npages;
2371     MigrationState *s = migrate_get_current();
2372 
2373     /* This function is currently expected to be used during live migration */
2374     if (!migration_is_setup_or_active(s->state)) {
2375         return;
2376     }
2377 
2378     for (; len > 0; len -= used_len, addr += used_len) {
2379         block = qemu_ram_block_from_host(addr, false, &offset);
2380         if (unlikely(!block || offset >= block->used_length)) {
2381             /*
2382              * The implementation might not support RAMBlock resize during
2383              * live migration, but it could happen in theory with future
2384              * updates. So we add a check here to capture that case.
2385              */
2386             error_report_once("%s unexpected error", __func__);
2387             return;
2388         }
2389 
2390         if (len <= block->used_length - offset) {
2391             used_len = len;
2392         } else {
2393             used_len = block->used_length - offset;
2394         }
2395 
2396         start = offset >> TARGET_PAGE_BITS;
2397         npages = used_len >> TARGET_PAGE_BITS;
2398 
2399         qemu_mutex_lock(&ram_state->bitmap_mutex);
2400         ram_state->migration_dirty_pages -=
2401                       bitmap_count_one_with_offset(block->bmap, start, npages);
2402         bitmap_clear(block->bmap, start, npages);
2403         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2404     }
2405 }
2406 
2407 /*
2408  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2409  * a long-running RCU critical section.  When RCU reclaims in the code
2410  * start to become numerous, it will be necessary to reduce the
2411  * granularity of these critical sections.
2412  */
2413 
2414 /**
2415  * ram_save_setup: Setup RAM for migration
2416  *
2417  * Returns zero to indicate success and negative for error
2418  *
2419  * @f: QEMUFile where to send the data
2420  * @opaque: RAMState pointer
2421  */
2422 static int ram_save_setup(QEMUFile *f, void *opaque)
2423 {
2424     RAMState **rsp = opaque;
2425     RAMBlock *block;
2426 
2427     if (compress_threads_save_setup()) {
2428         return -1;
2429     }
2430 
2431     /* migration has already set up the bitmap, reuse it. */
2432     if (!migration_in_colo_state()) {
2433         if (ram_init_all(rsp) != 0) {
2434             compress_threads_save_cleanup();
2435             return -1;
2436         }
2437     }
2438     (*rsp)->f = f;
2439 
2440     WITH_RCU_READ_LOCK_GUARD() {
2441         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2442 
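        /*
         * For each migratable block we emit a one-byte idstr length, the
         * idstr itself and the used_length, optionally followed by the
         * block's page size (when postcopy is enabled and it differs from
         * the host page size) and by the block's address when
         * ignore-shared is in use.
         */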
2443         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2444             qemu_put_byte(f, strlen(block->idstr));
2445             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2446             qemu_put_be64(f, block->used_length);
2447             if (migrate_postcopy_ram() && block->page_size !=
2448                                           qemu_host_page_size) {
2449                 qemu_put_be64(f, block->page_size);
2450             }
2451             if (migrate_ignore_shared()) {
2452                 qemu_put_be64(f, block->mr->addr);
2453             }
2454         }
2455     }
2456 
2457     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2458     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2459 
2460     multifd_send_sync_main(f);
2461     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2462     qemu_fflush(f);
2463 
2464     return 0;
2465 }
2466 
2467 /**
2468  * ram_save_iterate: iterative stage for migration
2469  *
2470  * Returns zero to indicate success and negative for error
2471  *
2472  * @f: QEMUFile where to send the data
2473  * @opaque: RAMState pointer
2474  */
2475 static int ram_save_iterate(QEMUFile *f, void *opaque)
2476 {
2477     RAMState **temp = opaque;
2478     RAMState *rs = *temp;
2479     int ret = 0;
2480     int i;
2481     int64_t t0;
2482     int done = 0;
2483 
2484     if (blk_mig_bulk_active()) {
2485         /* Avoid transferring RAM during the bulk phase of block migration, as
2486          * the bulk phase will usually take a long time and transferring
2487          * RAM updates during that time is pointless. */
2488         goto out;
2489     }
2490 
2491     WITH_RCU_READ_LOCK_GUARD() {
2492         if (ram_list.version != rs->last_version) {
2493             ram_state_reset(rs);
2494         }
2495 
2496         /* Read version before ram_list.blocks */
2497         smp_rmb();
2498 
2499         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2500 
2501         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2502         i = 0;
2503         while ((ret = qemu_file_rate_limit(f)) == 0 ||
2504                 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2505             int pages;
2506 
2507             if (qemu_file_get_error(f)) {
2508                 break;
2509             }
2510 
2511             pages = ram_find_and_save_block(rs, false);
2512             /* no more pages to send */
2513             if (pages == 0) {
2514                 done = 1;
2515                 break;
2516             }
2517 
2518             if (pages < 0) {
2519                 qemu_file_set_error(f, pages);
2520                 break;
2521             }
2522 
2523             rs->target_page_count += pages;
2524 
2525             /*
2526              * During postcopy, it is necessary to make sure one whole host
2527              * page is sent in one chunk.
2528              */
2529             if (migrate_postcopy_ram()) {
2530                 flush_compressed_data(rs);
2531             }
2532 
2533             /*
2534              * We want to check on the first iteration, just in case it was
2535              * the first time and we had to sync the dirty bitmap.
2536              * qemu_clock_get_ns() is a bit expensive, so we only check every
2537              * few iterations.
2538              */
2539             if ((i & 63) == 0) {
2540                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2541                               1000000;
2542                 if (t1 > MAX_WAIT) {
2543                     trace_ram_save_iterate_big_wait(t1, i);
2544                     break;
2545                 }
2546             }
2547             i++;
2548         }
2549     }
2550 
2551     /*
2552      * Must occur before EOS (or any QEMUFile operation)
2553      * because of RDMA protocol.
2554      */
2555     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2556 
2557 out:
2558     if (ret >= 0
2559         && migration_is_setup_or_active(migrate_get_current()->state)) {
2560         multifd_send_sync_main(rs->f);
2561         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2562         qemu_fflush(f);
2563         ram_counters.transferred += 8;
2564 
2565         ret = qemu_file_get_error(f);
2566     }
2567     if (ret < 0) {
2568         return ret;
2569     }
2570 
2571     return done;
2572 }
2573 
2574 /**
2575  * ram_save_complete: function called to send the remaining amount of ram
2576  *
2577  * Returns zero to indicate success or negative on error
2578  *
2579  * Called with iothread lock
2580  *
2581  * @f: QEMUFile where to send the data
2582  * @opaque: RAMState pointer
2583  */
2584 static int ram_save_complete(QEMUFile *f, void *opaque)
2585 {
2586     RAMState **temp = opaque;
2587     RAMState *rs = *temp;
2588     int ret = 0;
2589 
2590     WITH_RCU_READ_LOCK_GUARD() {
2591         if (!migration_in_postcopy()) {
2592             migration_bitmap_sync_precopy(rs);
2593         }
2594 
2595         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2596 
2597         /* try transferring iterative blocks of memory */
2598 
2599         /* flush all remaining blocks regardless of rate limiting */
2600         while (true) {
2601             int pages;
2602 
2603             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2604             /* no more blocks to send */
2605             if (pages == 0) {
2606                 break;
2607             }
2608             if (pages < 0) {
2609                 ret = pages;
2610                 break;
2611             }
2612         }
2613 
2614         flush_compressed_data(rs);
2615         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2616     }
2617 
2618     if (ret >= 0) {
2619         multifd_send_sync_main(rs->f);
2620         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2621         qemu_fflush(f);
2622     }
2623 
2624     return ret;
2625 }
2626 
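/*
 * ram_save_pending: estimate how much dirty RAM is still to be sent.
 *
 * If the estimate has dropped below @max_size and we are not in postcopy,
 * the dirty bitmap is re-synced (under the iothread lock) to refresh it.
 * The result is reported as postcopy-compatible when postcopy RAM is
 * enabled, otherwise as precopy-only.
 */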
2627 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2628                              uint64_t *res_precopy_only,
2629                              uint64_t *res_compatible,
2630                              uint64_t *res_postcopy_only)
2631 {
2632     RAMState **temp = opaque;
2633     RAMState *rs = *temp;
2634     uint64_t remaining_size;
2635 
2636     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2637 
2638     if (!migration_in_postcopy() &&
2639         remaining_size < max_size) {
2640         qemu_mutex_lock_iothread();
2641         WITH_RCU_READ_LOCK_GUARD() {
2642             migration_bitmap_sync_precopy(rs);
2643         }
2644         qemu_mutex_unlock_iothread();
2645         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2646     }
2647 
2648     if (migrate_postcopy_ram()) {
2649         /* We can do postcopy, and all the data is postcopiable */
2650         *res_compatible += remaining_size;
2651     } else {
2652         *res_precopy_only += remaining_size;
2653     }
2654 }
2655 
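/*
 * load_xbzrle: read one XBZRLE-encoded page from the stream.
 *
 * The wire format is a one-byte encoding flag (which must be
 * ENCODING_FLAG_XBZRLE), a 16-bit big-endian length, and the delta
 * payload, which is decoded against the current contents of @host.
 */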
2656 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2657 {
2658     unsigned int xh_len;
2659     int xh_flags;
2660     uint8_t *loaded_data;
2661 
2662     /* extract RLE header */
2663     xh_flags = qemu_get_byte(f);
2664     xh_len = qemu_get_be16(f);
2665 
2666     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2667         error_report("Failed to load XBZRLE page - wrong compression!");
2668         return -1;
2669     }
2670 
2671     if (xh_len > TARGET_PAGE_SIZE) {
2672         error_report("Failed to load XBZRLE page - len overflow!");
2673         return -1;
2674     }
2675     loaded_data = XBZRLE.decoded_buf;
2676     /* load data and decode */
2677     /* it can change loaded_data to point to an internal buffer */
2678     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2679 
2680     /* decode RLE */
2681     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2682                              TARGET_PAGE_SIZE) == -1) {
2683         error_report("Failed to load XBZRLE page - decode error!");
2684         return -1;
2685     }
2686 
2687     return 0;
2688 }
2689 
2690 /**
2691  * ram_block_from_stream: read a RAMBlock id from the migration stream
2692  *
2693  * Must be called from within a rcu critical section.
2694  *
2695  * Returns a pointer from within the RCU-protected ram_list.
2696  *
2697  * @f: QEMUFile where to read the data from
2698  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2699  */
2700 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2701 {
2702     static RAMBlock *block = NULL;
2703     char id[256];
2704     uint8_t len;
2705 
2706     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2707         if (!block) {
2708             error_report("Ack, bad migration stream!");
2709             return NULL;
2710         }
2711         return block;
2712     }
2713 
2714     len = qemu_get_byte(f);
2715     qemu_get_buffer(f, (uint8_t *)id, len);
2716     id[len] = 0;
2717 
2718     block = qemu_ram_block_by_name(id);
2719     if (!block) {
2720         error_report("Can't find block %s", id);
2721         return NULL;
2722     }
2723 
2724     if (ramblock_is_ignored(block)) {
2725         error_report("block %s should not be migrated !", id);
2726         return NULL;
2727     }
2728 
2729     return block;
2730 }
2731 
2732 static inline void *host_from_ram_block_offset(RAMBlock *block,
2733                                                ram_addr_t offset)
2734 {
2735     if (!offset_in_ramblock(block, offset)) {
2736         return NULL;
2737     }
2738 
2739     return block->host + offset;
2740 }
2741 
2742 static inline void *colo_cache_from_block_offset(RAMBlock *block,
2743                              ram_addr_t offset, bool record_bitmap)
2744 {
2745     if (!offset_in_ramblock(block, offset)) {
2746         return NULL;
2747     }
2748     if (!block->colo_cache) {
2749         error_report("%s: colo_cache is NULL in block :%s",
2750                      __func__, block->idstr);
2751         return NULL;
2752     }
2753 
2754     /*
2755      * During a COLO checkpoint, we need a bitmap of these migrated pages.
2756      * It helps us decide which pages in the RAM cache should be flushed
2757      * into the VM's RAM later.
2758     */
2759     if (record_bitmap &&
2760         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
2761         ram_state->migration_dirty_pages++;
2762     }
2763     return block->colo_cache + offset;
2764 }
2765 
2766 /**
2767  * ram_handle_compressed: handle the zero page case
2768  *
2769  * If a page (or a whole RDMA chunk) has been
2770  * determined to be zero, then zap it.
2771  *
2772  * @host: host address for the zero page
2773  * @ch: what the page is filled from.  We only support zero
2774  * @size: size of the zero page
2775  */
2776 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2777 {
2778     if (ch != 0 || !is_zero_range(host, size)) {
2779         memset(host, ch, size);
2780     }
2781 }
2782 
2783 /* return the size after decompression, or negative value on error */
2784 static int
2785 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2786                      const uint8_t *source, size_t source_len)
2787 {
2788     int err;
2789 
2790     err = inflateReset(stream);
2791     if (err != Z_OK) {
2792         return -1;
2793     }
2794 
2795     stream->avail_in = source_len;
2796     stream->next_in = (uint8_t *)source;
2797     stream->avail_out = dest_len;
2798     stream->next_out = dest;
2799 
2800     err = inflate(stream, Z_NO_FLUSH);
2801     if (err != Z_STREAM_END) {
2802         return -1;
2803     }
2804 
2805     return stream->total_out;
2806 }
2807 
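/*
 * do_data_decompress: decompression worker thread.
 *
 * Waits until the main thread hands it a compressed buffer (param->des is
 * the destination host page), inflates it into place, then signals
 * decomp_done_cond so another request can be queued.
 */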
2808 static void *do_data_decompress(void *opaque)
2809 {
2810     DecompressParam *param = opaque;
2811     unsigned long pagesize;
2812     uint8_t *des;
2813     int len, ret;
2814 
2815     qemu_mutex_lock(&param->mutex);
2816     while (!param->quit) {
2817         if (param->des) {
2818             des = param->des;
2819             len = param->len;
2820             param->des = 0;
2821             qemu_mutex_unlock(&param->mutex);
2822 
2823             pagesize = TARGET_PAGE_SIZE;
2824 
2825             ret = qemu_uncompress_data(&param->stream, des, pagesize,
2826                                        param->compbuf, len);
2827             if (ret < 0 && migrate_get_current()->decompress_error_check) {
2828                 error_report("decompress data failed");
2829                 qemu_file_set_error(decomp_file, ret);
2830             }
2831 
2832             qemu_mutex_lock(&decomp_done_lock);
2833             param->done = true;
2834             qemu_cond_signal(&decomp_done_cond);
2835             qemu_mutex_unlock(&decomp_done_lock);
2836 
2837             qemu_mutex_lock(&param->mutex);
2838         } else {
2839             qemu_cond_wait(&param->cond, &param->mutex);
2840         }
2841     }
2842     qemu_mutex_unlock(&param->mutex);
2843 
2844     return NULL;
2845 }
2846 
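/*
 * Wait until every decompression worker has finished its current request,
 * then return any error recorded on the decompression QEMUFile.
 */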
2847 static int wait_for_decompress_done(void)
2848 {
2849     int idx, thread_count;
2850 
2851     if (!migrate_use_compression()) {
2852         return 0;
2853     }
2854 
2855     thread_count = migrate_decompress_threads();
2856     qemu_mutex_lock(&decomp_done_lock);
2857     for (idx = 0; idx < thread_count; idx++) {
2858         while (!decomp_param[idx].done) {
2859             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2860         }
2861     }
2862     qemu_mutex_unlock(&decomp_done_lock);
2863     return qemu_file_get_error(decomp_file);
2864 }
2865 
2866 static void compress_threads_load_cleanup(void)
2867 {
2868     int i, thread_count;
2869 
2870     if (!migrate_use_compression()) {
2871         return;
2872     }
2873     thread_count = migrate_decompress_threads();
2874     for (i = 0; i < thread_count; i++) {
2875         /*
2876          * We use it as an indicator of whether the thread was
2877          * properly initialized or not.
2878          */
2879         if (!decomp_param[i].compbuf) {
2880             break;
2881         }
2882 
2883         qemu_mutex_lock(&decomp_param[i].mutex);
2884         decomp_param[i].quit = true;
2885         qemu_cond_signal(&decomp_param[i].cond);
2886         qemu_mutex_unlock(&decomp_param[i].mutex);
2887     }
2888     for (i = 0; i < thread_count; i++) {
2889         if (!decomp_param[i].compbuf) {
2890             break;
2891         }
2892 
2893         qemu_thread_join(decompress_threads + i);
2894         qemu_mutex_destroy(&decomp_param[i].mutex);
2895         qemu_cond_destroy(&decomp_param[i].cond);
2896         inflateEnd(&decomp_param[i].stream);
2897         g_free(decomp_param[i].compbuf);
2898         decomp_param[i].compbuf = NULL;
2899     }
2900     g_free(decompress_threads);
2901     g_free(decomp_param);
2902     decompress_threads = NULL;
2903     decomp_param = NULL;
2904     decomp_file = NULL;
2905 }
2906 
2907 static int compress_threads_load_setup(QEMUFile *f)
2908 {
2909     int i, thread_count;
2910 
2911     if (!migrate_use_compression()) {
2912         return 0;
2913     }
2914 
2915     thread_count = migrate_decompress_threads();
2916     decompress_threads = g_new0(QemuThread, thread_count);
2917     decomp_param = g_new0(DecompressParam, thread_count);
2918     qemu_mutex_init(&decomp_done_lock);
2919     qemu_cond_init(&decomp_done_cond);
2920     decomp_file = f;
2921     for (i = 0; i < thread_count; i++) {
2922         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2923             goto exit;
2924         }
2925 
2926         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2927         qemu_mutex_init(&decomp_param[i].mutex);
2928         qemu_cond_init(&decomp_param[i].cond);
2929         decomp_param[i].done = true;
2930         decomp_param[i].quit = false;
2931         qemu_thread_create(decompress_threads + i, "decompress",
2932                            do_data_decompress, decomp_param + i,
2933                            QEMU_THREAD_JOINABLE);
2934     }
2935     return 0;
2936 exit:
2937     compress_threads_load_cleanup();
2938     return -1;
2939 }
2940 
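/*
 * Hand one compressed page to an idle decompression thread, blocking on
 * decomp_done_cond until a worker becomes free if all of them are busy.
 */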
2941 static void decompress_data_with_multi_threads(QEMUFile *f,
2942                                                void *host, int len)
2943 {
2944     int idx, thread_count;
2945 
2946     thread_count = migrate_decompress_threads();
2947     qemu_mutex_lock(&decomp_done_lock);
2948     while (true) {
2949         for (idx = 0; idx < thread_count; idx++) {
2950             if (decomp_param[idx].done) {
2951                 decomp_param[idx].done = false;
2952                 qemu_mutex_lock(&decomp_param[idx].mutex);
2953                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2954                 decomp_param[idx].des = host;
2955                 decomp_param[idx].len = len;
2956                 qemu_cond_signal(&decomp_param[idx].cond);
2957                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2958                 break;
2959             }
2960         }
2961         if (idx < thread_count) {
2962             break;
2963         } else {
2964             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2965         }
2966     }
2967     qemu_mutex_unlock(&decomp_done_lock);
2968 }
2969 
2970 /*
2971  * colo cache: this is for the secondary VM, we cache the whole
2972  * memory of the secondary VM.  It is necessary to hold the global lock
2973  * to call this helper.
2974  */
2975 int colo_init_ram_cache(void)
2976 {
2977     RAMBlock *block;
2978 
2979     WITH_RCU_READ_LOCK_GUARD() {
2980         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2981             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
2982                                                     NULL,
2983                                                     false);
2984             if (!block->colo_cache) {
2985                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
2986                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
2987                              block->used_length);
2988                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2989                     if (block->colo_cache) {
2990                         qemu_anon_ram_free(block->colo_cache, block->used_length);
2991                         block->colo_cache = NULL;
2992                     }
2993                 }
2994                 return -errno;
2995             }
2996         }
2997     }
2998 
2999     /*
3000      * Record the dirty pages that were sent by the PVM; we use this dirty
3001      * bitmap to decide which pages in the cache should be flushed into the
3002      * SVM's RAM. Here we use the same name 'ram_bitmap' as for migration.
3003     */
3004     if (ram_bytes_total()) {
3005         RAMBlock *block;
3006 
3007         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3008             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3009             block->bmap = bitmap_new(pages);
3010         }
3011     }
3012 
3013     ram_state_init(&ram_state);
3014     return 0;
3015 }
3016 
3017 /* TODO: duplicated with ram_init_bitmaps */
3018 void colo_incoming_start_dirty_log(void)
3019 {
3020     RAMBlock *block = NULL;
3021     /* For memory_global_dirty_log_start below. */
3022     qemu_mutex_lock_iothread();
3023     qemu_mutex_lock_ramlist();
3024 
3025     memory_global_dirty_log_sync();
3026     WITH_RCU_READ_LOCK_GUARD() {
3027         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3028             ramblock_sync_dirty_bitmap(ram_state, block);
3029             /* Discard this dirty bitmap record */
3030             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3031         }
3032         memory_global_dirty_log_start();
3033     }
3034     ram_state->migration_dirty_pages = 0;
3035     qemu_mutex_unlock_ramlist();
3036     qemu_mutex_unlock_iothread();
3037 }
3038 
3039 /* It is necessary to hold the global lock to call this helper */
3040 void colo_release_ram_cache(void)
3041 {
3042     RAMBlock *block;
3043 
3044     memory_global_dirty_log_stop();
3045     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3046         g_free(block->bmap);
3047         block->bmap = NULL;
3048     }
3049 
3050     WITH_RCU_READ_LOCK_GUARD() {
3051         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3052             if (block->colo_cache) {
3053                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3054                 block->colo_cache = NULL;
3055             }
3056         }
3057     }
3058     ram_state_cleanup(&ram_state);
3059 }
3060 
3061 /**
3062  * ram_load_setup: Setup RAM for migration incoming side
3063  *
3064  * Returns zero to indicate success and negative for error
3065  *
3066  * @f: QEMUFile where to receive the data
3067  * @opaque: RAMState pointer
3068  */
3069 static int ram_load_setup(QEMUFile *f, void *opaque)
3070 {
3071     if (compress_threads_load_setup(f)) {
3072         return -1;
3073     }
3074 
3075     xbzrle_load_setup();
3076     ramblock_recv_map_init();
3077 
3078     return 0;
3079 }
3080 
3081 static int ram_load_cleanup(void *opaque)
3082 {
3083     RAMBlock *rb;
3084 
3085     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3086         qemu_ram_block_writeback(rb);
3087     }
3088 
3089     xbzrle_load_cleanup();
3090     compress_threads_load_cleanup();
3091 
3092     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3093         g_free(rb->receivedmap);
3094         rb->receivedmap = NULL;
3095     }
3096 
3097     return 0;
3098 }
3099 
3100 /**
3101  * ram_postcopy_incoming_init: allocate postcopy data structures
3102  *
3103  * Returns 0 for success and negative if there was one error
3104  *
3105  * @mis: current migration incoming state
3106  *
3107  * Allocate data structures etc needed by incoming migration with
3108  * postcopy-ram. postcopy-ram's similarly named
3109  * postcopy_ram_incoming_init does the work.
3110  */
3111 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3112 {
3113     return postcopy_ram_incoming_init(mis);
3114 }
3115 
3116 /**
3117  * ram_load_postcopy: load a page in postcopy case
3118  *
3119  * Returns 0 for success or -errno in case of error
3120  *
3121  * Called in postcopy mode by ram_load().
3122  * rcu_read_lock is taken prior to this being called.
3123  *
3124  * @f: QEMUFile to receive the data from
3125  */
3126 static int ram_load_postcopy(QEMUFile *f)
3127 {
3128     int flags = 0, ret = 0;
3129     bool place_needed = false;
3130     bool matches_target_page_size = false;
3131     MigrationIncomingState *mis = migration_incoming_get_current();
3132     /* Temporary page that is later 'placed' */
3133     void *postcopy_host_page = mis->postcopy_tmp_page;
3134     void *this_host = NULL;
3135     bool all_zero = false;
3136     int target_pages = 0;
3137 
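    /*
     * Target pages are accumulated into postcopy_host_page until a whole
     * host page has been received; only then is it placed (atomically)
     * into guest memory, as postcopy requires.
     */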
3138     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3139         ram_addr_t addr;
3140         void *host = NULL;
3141         void *page_buffer = NULL;
3142         void *place_source = NULL;
3143         RAMBlock *block = NULL;
3144         uint8_t ch;
3145         int len;
3146 
3147         addr = qemu_get_be64(f);
3148 
3149         /*
3150          * If there is a QEMUFile error, we should stop here; "addr"
3151          * may be invalid.
3152          */
3153         ret = qemu_file_get_error(f);
3154         if (ret) {
3155             break;
3156         }
3157 
3158         flags = addr & ~TARGET_PAGE_MASK;
3159         addr &= TARGET_PAGE_MASK;
3160 
3161         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3162         place_needed = false;
3163         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3164                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3165             block = ram_block_from_stream(f, flags);
3166 
3167             host = host_from_ram_block_offset(block, addr);
3168             if (!host) {
3169                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3170                 ret = -EINVAL;
3171                 break;
3172             }
3173             target_pages++;
3174             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3175             /*
3176              * Postcopy requires that we place whole host pages atomically;
3177              * these may be huge pages for RAMBlocks that are backed by
3178              * hugetlbfs.
3179              * To make it atomic, the data is read into a temporary page
3180              * that's moved into place later.
3181              * The migration protocol uses possibly smaller target pages;
3182              * however, the source ensures it always sends all the components
3183              * of a host page in one chunk.
3184              */
3185             page_buffer = postcopy_host_page +
3186                           ((uintptr_t)host & (block->page_size - 1));
3187             /* If all target pages are zero then we can optimise the placement */
3188             if (target_pages == 1) {
3189                 all_zero = true;
3190                 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3191                                                     block->page_size);
3192             } else {
3193                 /* not the 1st TP within the HP */
3194                 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3195                     (uintptr_t)this_host) {
3196                     error_report("Non-same host page %p/%p",
3197                                   host, this_host);
3198                     ret = -EINVAL;
3199                     break;
3200                 }
3201             }
3202 
3203             /*
3204              * If it's the last part of a host page then we place the host
3205              * page
3206              */
3207             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3208                 place_needed = true;
3209                 target_pages = 0;
3210             }
3211             place_source = postcopy_host_page;
3212         }
3213 
3214         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3215         case RAM_SAVE_FLAG_ZERO:
3216             ch = qemu_get_byte(f);
3217             /*
3218              * We can skip setting page_buffer when
3219              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3220              */
3221             if (ch || !matches_target_page_size) {
3222                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3223             }
3224             if (ch) {
3225                 all_zero = false;
3226             }
3227             break;
3228 
3229         case RAM_SAVE_FLAG_PAGE:
3230             all_zero = false;
3231             if (!matches_target_page_size) {
3232                 /* For huge pages, we always use temporary buffer */
3233                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3234             } else {
3235                 /*
3236                  * For small pages that match the target page size, we
3237                  * avoid the qemu_file copy.  Instead we directly use
3238                  * the buffer of QEMUFile to place the page.  Note: we
3239                  * cannot do any QEMUFile operation before using that
3240                  * buffer, to make sure the buffer is still valid when
3241                  * placing the page.
3242                  */
3243                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3244                                          TARGET_PAGE_SIZE);
3245             }
3246             break;
3247         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3248             all_zero = false;
3249             len = qemu_get_be32(f);
3250             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3251                 error_report("Invalid compressed data length: %d", len);
3252                 ret = -EINVAL;
3253                 break;
3254             }
3255             decompress_data_with_multi_threads(f, page_buffer, len);
3256             break;
3257 
3258         case RAM_SAVE_FLAG_EOS:
3259             /* normal exit */
3260             multifd_recv_sync_main();
3261             break;
3262         default:
3263             error_report("Unknown combination of migration flags: %#x"
3264                          " (postcopy mode)", flags);
3265             ret = -EINVAL;
3266             break;
3267         }
3268 
3269         /* Got the whole host page, wait for decompress before placing. */
3270         if (place_needed) {
3271             ret |= wait_for_decompress_done();
3272         }
3273 
3274         /* Detect any possible file errors */
3275         if (!ret && qemu_file_get_error(f)) {
3276             ret = qemu_file_get_error(f);
3277         }
3278 
3279         if (!ret && place_needed) {
3280             /* This gets called at the last target page in the host page */
3281             void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3282                                                        block->page_size);
3283 
3284             if (all_zero) {
3285                 ret = postcopy_place_page_zero(mis, place_dest,
3286                                                block);
3287             } else {
3288                 ret = postcopy_place_page(mis, place_dest,
3289                                           place_source, block);
3290             }
3291         }
3292     }
3293 
3294     return ret;
3295 }
3296 
3297 static bool postcopy_is_advised(void)
3298 {
3299     PostcopyState ps = postcopy_state_get();
3300     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3301 }
3302 
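     /*
      * True while the incoming side is actually in the postcopy phase
      * (LISTENING or later), i.e. while pages must be placed atomically.
      */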
3303 static bool postcopy_is_running(void)
3304 {
3305     PostcopyState ps = postcopy_state_get();
3306     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3307 }
3308 
3309 /*
3310  * Flush the contents of the RAM cache into the SVM's memory.
3311  * Only flush pages that have been dirtied by the PVM, the SVM, or both.
3312  */
3313 static void colo_flush_ram_cache(void)
3314 {
3315     RAMBlock *block = NULL;
3316     void *dst_host;
3317     void *src_host;
3318     unsigned long offset = 0;
3319 
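         /*
          * Pull the latest dirty log and fold it into each block's dirty
          * bitmap before walking the blocks below.
          */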
3320     memory_global_dirty_log_sync();
3321     WITH_RCU_READ_LOCK_GUARD() {
3322         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3323             ramblock_sync_dirty_bitmap(ram_state, block);
3324         }
3325     }
3326 
3327     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3328     WITH_RCU_READ_LOCK_GUARD() {
3329         block = QLIST_FIRST_RCU(&ram_list.blocks);
3330 
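             /*
              * Walk every block's dirty bitmap; for each dirty page, clear
              * the bit and copy the cached (colo_cache) content over the
              * SVM's page.
              */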
3331         while (block) {
3332             offset = migration_bitmap_find_dirty(ram_state, block, offset);
3333 
3334             if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3335                 >= block->used_length) {
3336                 offset = 0;
3337                 block = QLIST_NEXT_RCU(block, next);
3338             } else {
3339                 migration_bitmap_clear_dirty(ram_state, block, offset);
3340                 dst_host = block->host
3341                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3342                 src_host = block->colo_cache
3343                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3344                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3345             }
3346         }
3347     }
3348     trace_colo_flush_ram_cache_end();
3349 }
3350 
3351 /**
3352  * ram_load_precopy: load pages in precopy case
3353  *
3354  * Returns 0 for success or -errno in case of error
3355  *
3356  * Called in precopy mode by ram_load().
3357  * rcu_read_lock is taken prior to this being called.
3358  *
3359  * @f: QEMUFile to read the data from
3360  */
3361 static int ram_load_precopy(QEMUFile *f)
3362 {
3363     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3364     /* ADVISE comes earlier; it shows the source has postcopy enabled */
3365     bool postcopy_advised = postcopy_is_advised();
3366     if (!migrate_use_compression()) {
3367         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3368     }
3369 
3370     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3371         ram_addr_t addr, total_ram_bytes;
3372         void *host = NULL, *host_bak = NULL;
3373         uint8_t ch;
3374 
3375         /*
3376          * Yield periodically to let the main loop run, but an iteration
3377          * of the main loop is expensive, so only do so every so many pages
3378          */
3379         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3380             aio_co_schedule(qemu_get_current_aio_context(),
3381                             qemu_coroutine_self());
3382             qemu_coroutine_yield();
3383         }
3384         i++;
3385 
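             /*
              * Each chunk starts with a be64 that packs the page address in
              * the page-aligned bits and the RAM_SAVE_FLAG_* values in the
              * low, sub-page bits.
              */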
3386         addr = qemu_get_be64(f);
3387         flags = addr & ~TARGET_PAGE_MASK;
3388         addr &= TARGET_PAGE_MASK;
3389 
3390         if (flags & invalid_flags) {
3391             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3392                 error_report("Received an unexpected compressed page");
3393             }
3394 
3395             ret = -EINVAL;
3396             break;
3397         }
3398 
3399         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3400                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3401             RAMBlock *block = ram_block_from_stream(f, flags);
3402 
3403             host = host_from_ram_block_offset(block, addr);
3404             /*
3405              * Once we have entered the COLO stage, pages should not be
3406              * loaded into the SVM's memory directly; they go into the
3407              * colo_cache first.
3408              * NOTE: we need to keep a copy of the SVM's RAM in colo_cache.
3409              * Previously, all of that memory was copied during the COLO
3410              * preparation stage, while the VM was stopped, which took a
3411              * long time.  Instead we back up every page as it arrives
3412              * during migration while COLO is enabled; this slows the
3413              * migration slightly, but clearly reduces the downtime.
3414              */
3415             if (migration_incoming_colo_enabled()) {
3416                 if (migration_incoming_in_colo_state()) {
3417                     /* In COLO stage, put all pages into cache temporarily */
3418                     host = colo_cache_from_block_offset(block, addr, true);
3419                 } else {
3420                     /*
3421                      * In the migration stage, before the COLO stage starts,
3422                      * put all pages into both the cache and the SVM's memory.
3423                      */
3424                     host_bak = colo_cache_from_block_offset(block, addr, false);
3425                 }
3426             }
3427             if (!host) {
3428                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3429                 ret = -EINVAL;
3430                 break;
3431             }
3432             if (!migration_incoming_in_colo_state()) {
3433                 ramblock_recv_bitmap_set(block, host);
3434             }
3435 
3436             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3437         }
3438 
3439         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3440         case RAM_SAVE_FLAG_MEM_SIZE:
3441             /* Synchronize RAM block list */
3442             total_ram_bytes = addr;
3443             while (!ret && total_ram_bytes) {
3444                 RAMBlock *block;
3445                 char id[256];
3446                 ram_addr_t length;
3447 
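                     /*
                      * Per-block record: a one-byte id length, the id string,
                      * and a be64 used length; depending on capabilities this
                      * may be followed by a be64 page size and/or a be64 GPA.
                      */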
3448                 len = qemu_get_byte(f);
3449                 qemu_get_buffer(f, (uint8_t *)id, len);
3450                 id[len] = 0;
3451                 length = qemu_get_be64(f);
3452 
3453                 block = qemu_ram_block_by_name(id);
3454                 if (block && !qemu_ram_is_migratable(block)) {
3455                     error_report("block %s should not be migrated!", id);
3456                     ret = -EINVAL;
3457                 } else if (block) {
3458                     if (length != block->used_length) {
3459                         Error *local_err = NULL;
3460 
3461                         ret = qemu_ram_resize(block, length,
3462                                               &local_err);
3463                         if (local_err) {
3464                             error_report_err(local_err);
3465                         }
3466                     }
3467                     /* For postcopy we need to check hugepage sizes match */
3468                     if (postcopy_advised &&
3469                         block->page_size != qemu_host_page_size) {
3470                         uint64_t remote_page_size = qemu_get_be64(f);
3471                         if (remote_page_size != block->page_size) {
3472                             error_report("Mismatched RAM page size %s "
3473                                          "(local) %zd != %" PRId64,
3474                                          id, block->page_size,
3475                                          remote_page_size);
3476                             ret = -EINVAL;
3477                         }
3478                     }
3479                     if (migrate_ignore_shared()) {
3480                         hwaddr addr = qemu_get_be64(f);
3481                         if (ramblock_is_ignored(block) &&
3482                             block->mr->addr != addr) {
3483                             error_report("Mismatched GPAs for block %s "
3484                                          "%" PRId64 " != %" PRId64,
3485                                          id, (uint64_t)addr,
3486                                          (uint64_t)block->mr->addr);
3487                             ret = -EINVAL;
3488                         }
3489                     }
3490                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3491                                           block->idstr);
3492                 } else {
3493                     error_report("Unknown ramblock \"%s\", cannot "
3494                                  "accept migration", id);
3495                     ret = -EINVAL;
3496                 }
3497 
3498                 total_ram_bytes -= length;
3499             }
3500             break;
3501 
3502         case RAM_SAVE_FLAG_ZERO:
3503             ch = qemu_get_byte(f);
3504             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3505             break;
3506 
3507         case RAM_SAVE_FLAG_PAGE:
3508             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3509             break;
3510 
3511         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3512             len = qemu_get_be32(f);
3513             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3514                 error_report("Invalid compressed data length: %d", len);
3515                 ret = -EINVAL;
3516                 break;
3517             }
3518             decompress_data_with_multi_threads(f, host, len);
3519             break;
3520 
3521         case RAM_SAVE_FLAG_XBZRLE:
3522             if (load_xbzrle(f, addr, host) < 0) {
3523                 error_report("Failed to decompress XBZRLE page at "
3524                              RAM_ADDR_FMT, addr);
3525                 ret = -EINVAL;
3526                 break;
3527             }
3528             break;
3529         case RAM_SAVE_FLAG_EOS:
3530             /* normal exit */
3531             multifd_recv_sync_main();
3532             break;
3533         default:
3534             if (flags & RAM_SAVE_FLAG_HOOK) {
3535                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3536             } else {
3537                 error_report("Unknown combination of migration flags: %#x",
3538                              flags);
3539                 ret = -EINVAL;
3540             }
3541         }
3542         if (!ret) {
3543             ret = qemu_file_get_error(f);
3544         }
3545         if (!ret && host_bak) {
3546             memcpy(host_bak, host, TARGET_PAGE_SIZE);
3547         }
3548     }
3549 
3550     ret |= wait_for_decompress_done();
3551     return ret;
3552 }
3553 
3554 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3555 {
3556     int ret = 0;
3557     static uint64_t seq_iter;
3558     /*
3559      * If the system is running in postcopy mode, page inserts into host
3560      * memory must be atomic.
3561      */
3562     bool postcopy_running = postcopy_is_running();
3563 
3564     seq_iter++;
3565 
3566     if (version_id != 4) {
3567         return -EINVAL;
3568     }
3569 
3570     /*
3571      * This RCU critical section can be very long running.
3572      * If RCU reclamation in this code starts to become frequent,
3573      * it will be necessary to reduce the granularity of this
3574      * critical section.
3575      */
3576     WITH_RCU_READ_LOCK_GUARD() {
3577         if (postcopy_running) {
3578             ret = ram_load_postcopy(f);
3579         } else {
3580             ret = ram_load_precopy(f);
3581         }
3582     }
3583     trace_ram_load_complete(ret, seq_iter);
3584 
3585     if (!ret && migration_incoming_in_colo_state()) {
3586         colo_flush_ram_cache();
3587     }
3588     return ret;
3589 }
3590 
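     /*
      * Postcopy is refused if any migratable block is backed by persistent
      * memory; otherwise it follows the postcopy-ram capability.
      */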
3591 static bool ram_has_postcopy(void *opaque)
3592 {
3593     RAMBlock *rb;
3594     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3595         if (ramblock_is_pmem(rb)) {
3596             info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
3597                         "is not supported now!", rb->idstr, rb->host);
3598             return false;
3599         }
3600     }
3601 
3602     return migrate_postcopy_ram();
3603 }
3604 
3605 /* Sync all the dirty bitmaps with the destination VM. */
3606 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3607 {
3608     RAMBlock *block;
3609     QEMUFile *file = s->to_dst_file;
3610     int ramblock_count = 0;
3611 
3612     trace_ram_dirty_bitmap_sync_start();
3613 
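         /*
          * Ask the destination for every block's receive bitmap; each reply
          * is acknowledged by ram_dirty_bitmap_reload_notify() posting rp_sem.
          */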
3614     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3615         qemu_savevm_send_recv_bitmap(file, block->idstr);
3616         trace_ram_dirty_bitmap_request(block->idstr);
3617         ramblock_count++;
3618     }
3619 
3620     trace_ram_dirty_bitmap_sync_wait();
3621 
3622     /* Wait until all the ramblocks' dirty bitmaps have been synced */
3623     while (ramblock_count--) {
3624         qemu_sem_wait(&s->rp_state.rp_sem);
3625     }
3626 
3627     trace_ram_dirty_bitmap_sync_complete();
3628 
3629     return 0;
3630 }
3631 
3632 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3633 {
3634     qemu_sem_post(&s->rp_state.rp_sem);
3635 }
3636 
3637 /*
3638  * Read the received bitmap and invert it to form the initial dirty bitmap.
3639  * This is only used when a postcopy migration has been paused and is
3640  * being resumed from a middle point.
3641  */
3642 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3643 {
3644     int ret = -EINVAL;
3645     QEMUFile *file = s->rp_state.from_dst_file;
3646     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3647     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3648     uint64_t size, end_mark;
3649 
3650     trace_ram_dirty_bitmap_reload_begin(block->idstr);
3651 
3652     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3653         error_report("%s: incorrect state %s", __func__,
3654                      MigrationStatus_str(s->state));
3655         return -EINVAL;
3656     }
3657 
3658     /*
3659      * Note: see comments in ramblock_recv_bitmap_send() on why we
3660      * need the endianness conversion, and the padding.
3661      */
3662     local_size = ROUND_UP(local_size, 8);
3663 
3664     /* Add padding */
3665     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3666 
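         /*
          * Wire format (see ramblock_recv_bitmap_send()): a be64 size, the
          * little-endian bitmap padded to a multiple of 8 bytes, then a be64
          * ending mark (RAMBLOCK_RECV_BITMAP_ENDING).
          */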
3667     size = qemu_get_be64(file);
3668 
3669     /* The size of the bitmap should match our ramblock's */
3670     if (size != local_size) {
3671         error_report("%s: ramblock '%s' bitmap size mismatch "
3672                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3673                      block->idstr, size, local_size);
3674         ret = -EINVAL;
3675         goto out;
3676     }
3677 
3678     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3679     end_mark = qemu_get_be64(file);
3680 
3681     ret = qemu_file_get_error(file);
3682     if (ret || size != local_size) {
3683         error_report("%s: read bitmap failed for ramblock '%s': %d"
3684                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3685                      __func__, block->idstr, ret, local_size, size);
3686         ret = -EIO;
3687         goto out;
3688     }
3689 
3690     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3691         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
3692                      __func__, block->idstr, end_mark);
3693         ret = -EINVAL;
3694         goto out;
3695     }
3696 
3697     /*
3698      * Endianness conversion.  We are in postcopy (though paused);
3699      * the dirty bitmap won't change, so we can modify it directly.
3700      */
3701     bitmap_from_le(block->bmap, le_bitmap, nbits);
3702 
3703     /*
3704      * What we received is the "received bitmap".  Invert it to form
3705      * the initial dirty bitmap for this ramblock.
3706      */
3707     bitmap_complement(block->bmap, block->bmap, nbits);
3708 
3709     trace_ram_dirty_bitmap_reload_complete(block->idstr);
3710 
3711     /*
3712      * We have successfully synced the bitmap for the current ramblock.
3713      * If this is the last one to sync, notify the main send thread.
3714      */
3715     ram_dirty_bitmap_reload_notify(s);
3716 
3717     ret = 0;
3718 out:
3719     g_free(le_bitmap);
3720     return ret;
3721 }
3722 
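     /*
      * Postcopy recovery: re-sync the receive bitmaps from the destination,
      * then rebuild the send-side RAM state before the stream is resumed.
      */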
3723 static int ram_resume_prepare(MigrationState *s, void *opaque)
3724 {
3725     RAMState *rs = *(RAMState **)opaque;
3726     int ret;
3727 
3728     ret = ram_dirty_bitmap_sync_all(s, rs);
3729     if (ret) {
3730         return ret;
3731     }
3732 
3733     ram_state_resume_prepare(rs, s->to_dst_file);
3734 
3735     return 0;
3736 }
3737 
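     /* RAM is migrated as a live section; registered by ram_mig_init() below. */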
3738 static SaveVMHandlers savevm_ram_handlers = {
3739     .save_setup = ram_save_setup,
3740     .save_live_iterate = ram_save_iterate,
3741     .save_live_complete_postcopy = ram_save_complete,
3742     .save_live_complete_precopy = ram_save_complete,
3743     .has_postcopy = ram_has_postcopy,
3744     .save_live_pending = ram_save_pending,
3745     .load_state = ram_load,
3746     .save_cleanup = ram_save_cleanup,
3747     .load_setup = ram_load_setup,
3748     .load_cleanup = ram_load_cleanup,
3749     .resume_prepare = ram_resume_prepare,
3750 };
3751 
3752 void ram_mig_init(void)
3753 {
3754     qemu_mutex_init(&XBZRLE.lock);
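         /* Version 4 here must match the version_id check in ram_load(). */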
3755     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
3756 }
3757