xref: /qemu/migration/ram.c (revision 8ddc171b)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/madvise.h"
34 #include "qemu/main-loop.h"
35 #include "io/channel-null.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/error.h"
46 #include "qapi/qapi-types-migration.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
49 #include "trace.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
54 #include "block.h"
55 #include "sysemu/cpu-throttle.h"
56 #include "savevm.h"
57 #include "qemu/iov.h"
58 #include "multifd.h"
59 #include "sysemu/runstate.h"
60 #include "options.h"
61 
62 #include "hw/boards.h" /* for machine_dump_guest_core() */
63 
64 #if defined(__linux__)
65 #include "qemu/userfaultfd.h"
66 #endif /* defined(__linux__) */
67 
68 /***********************************************************/
69 /* ram save/restore */
70 
71 /*
72  * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
73  * matched pages that were filled with any repeated byte.  We switched
74  * it to match only zero-filled pages, and renamed it to avoid
75  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
76  */
77 /*
78  * RAM_SAVE_FLAG_FULL was obsoleted in 2009; it can be reused now
79  */
80 #define RAM_SAVE_FLAG_FULL     0x01
81 #define RAM_SAVE_FLAG_ZERO     0x02
82 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
83 #define RAM_SAVE_FLAG_PAGE     0x08
84 #define RAM_SAVE_FLAG_EOS      0x10
85 #define RAM_SAVE_FLAG_CONTINUE 0x20
86 #define RAM_SAVE_FLAG_XBZRLE   0x40
87 /* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
88 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
89 #define RAM_SAVE_FLAG_MULTIFD_FLUSH    0x200
90 /* We can't use any flag that is bigger than 0x200 */
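/*
 * On the wire the flags are OR'ed into the low bits of the page-aligned
 * offset written by save_page_header(), which is presumably why they must
 * stay below the smallest supported target page size.
 */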
91 
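/*
 * xbzrle_encode_buffer_func defaults to the portable C implementation;
 * when built with CONFIG_AVX512BW_OPT the constructor below repoints it
 * at the AVX512BW variant if the host CPU and OS support the required
 * vector state.
 */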
92 int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
93      uint8_t *, int) = xbzrle_encode_buffer;
94 #if defined(CONFIG_AVX512BW_OPT)
95 #include "qemu/cpuid.h"
96 static void __attribute__((constructor)) init_cpu_flag(void)
97 {
98     unsigned max = __get_cpuid_max(0, NULL);
99     int a, b, c, d;
100     if (max >= 1) {
101         __cpuid(1, a, b, c, d);
102          /* We must check that AVX is not just available, but usable.  */
103         if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
104             int bv;
105             __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
106             __cpuid_count(7, 0, a, b, c, d);
107            /* 0xe6:
108             *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
109             *                    and ZMM16-ZMM31 state are enabled by OS)
110             *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
111             */
112             if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
113                 xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
114             }
115         }
116     }
117 }
118 #endif
119 
120 XBZRLECacheStats xbzrle_counters;
121 
122 /* used by the search for pages to send */
123 struct PageSearchStatus {
124     /* The migration channel used for a specific host page */
125     QEMUFile    *pss_channel;
126     /* Last block from where we have sent data */
127     RAMBlock *last_sent_block;
128     /* Current block being searched */
129     RAMBlock    *block;
130     /* Current page to search from */
131     unsigned long page;
132     /* Set once we wrap around */
133     bool         complete_round;
134     /* Whether we're sending a host page */
135     bool          host_page_sending;
136     /* The start/end of current host page.  Invalid if host_page_sending==false */
137     unsigned long host_page_start;
138     unsigned long host_page_end;
139 };
140 typedef struct PageSearchStatus PageSearchStatus;
141 
142 /* This struct contains the XBZRLE cache, a static zero page and the
143    buffers used by the compression */
144 static struct {
145     /* buffer used for XBZRLE encoding */
146     uint8_t *encoded_buf;
147     /* buffer for storing page content */
148     uint8_t *current_buf;
149     /* Cache for XBZRLE, Protected by lock. */
150     PageCache *cache;
151     QemuMutex lock;
152     /* it will store a page full of zeros */
153     uint8_t *zero_target_page;
154     /* buffer used for XBZRLE decoding */
155     uint8_t *decoded_buf;
156 } XBZRLE;
157 
158 static void XBZRLE_cache_lock(void)
159 {
160     if (migrate_xbzrle()) {
161         qemu_mutex_lock(&XBZRLE.lock);
162     }
163 }
164 
165 static void XBZRLE_cache_unlock(void)
166 {
167     if (migrate_xbzrle()) {
168         qemu_mutex_unlock(&XBZRLE.lock);
169     }
170 }
171 
172 /**
173  * xbzrle_cache_resize: resize the xbzrle cache
174  *
175  * This function is called from migrate_params_apply in the main
176  * thread, possibly while a migration is in progress.  A running
177  * migration may be using the cache and might finish during this call,
178  * hence changes to the cache are protected by XBZRLE.lock.
179  *
180  * Returns 0 for success or -1 for error
181  *
182  * @new_size: new cache size
183  * @errp: set *errp if the check failed, with reason
184  */
185 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
186 {
187     PageCache *new_cache;
188     int64_t ret = 0;
189 
190     /* Check for truncation */
191     if (new_size != (size_t)new_size) {
192         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
193                    "exceeding address space");
194         return -1;
195     }
196 
197     if (new_size == migrate_xbzrle_cache_size()) {
198         /* nothing to do */
199         return 0;
200     }
201 
202     XBZRLE_cache_lock();
203 
204     if (XBZRLE.cache != NULL) {
205         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
206         if (!new_cache) {
207             ret = -1;
208             goto out;
209         }
210 
211         cache_fini(XBZRLE.cache);
212         XBZRLE.cache = new_cache;
213     }
214 out:
215     XBZRLE_cache_unlock();
216     return ret;
217 }
218 
219 static bool postcopy_preempt_active(void)
220 {
221     return migrate_postcopy_preempt() && migration_in_postcopy();
222 }
223 
224 bool ramblock_is_ignored(RAMBlock *block)
225 {
226     return !qemu_ram_is_migratable(block) ||
227            (migrate_ignore_shared() && qemu_ram_is_shared(block));
228 }
229 
230 #undef RAMBLOCK_FOREACH
231 
232 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
233 {
234     RAMBlock *block;
235     int ret = 0;
236 
237     RCU_READ_LOCK_GUARD();
238 
239     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
240         ret = func(block, opaque);
241         if (ret) {
242             break;
243         }
244     }
245     return ret;
246 }
247 
248 static void ramblock_recv_map_init(void)
249 {
250     RAMBlock *rb;
251 
252     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
253         assert(!rb->receivedmap);
254         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
255     }
256 }
257 
258 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
259 {
260     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
261                     rb->receivedmap);
262 }
263 
264 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
265 {
266     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
267 }
268 
269 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
270 {
271     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
272 }
273 
274 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
275                                     size_t nr)
276 {
277     bitmap_set_atomic(rb->receivedmap,
278                       ramblock_recv_bitmap_offset(host_addr, rb),
279                       nr);
280 }
281 
282 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
283 
284 /*
285  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
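 * The bitmap size written on the wire is rounded up to a multiple of 8
 * bytes, and the bitmap is followed by an 8 byte
 * RAMBLOCK_RECV_BITMAP_ENDING marker as a sanity check.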
286  *
287  * Returns the number of bytes sent (>0) on success, or <0 on error.
288  */
289 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
290                                   const char *block_name)
291 {
292     RAMBlock *block = qemu_ram_block_by_name(block_name);
293     unsigned long *le_bitmap, nbits;
294     uint64_t size;
295 
296     if (!block) {
297         error_report("%s: invalid block name: %s", __func__, block_name);
298         return -1;
299     }
300 
301     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
302 
303     /*
 304      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 305      * machines we may need 4 more bytes for padding (see the comment
 306      * below), so extend it a bit beforehand.
307      */
308     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
309 
310     /*
 311      * Always use little endian when sending the bitmap, so that it is
 312      * interpreted correctly even when the source and destination VMs
 313      * differ in endianness.  (Note: big endian won't work.)
314      */
315     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
316 
317     /* Size of the bitmap, in bytes */
318     size = DIV_ROUND_UP(nbits, 8);
319 
320     /*
321      * size is always aligned to 8 bytes for 64bit machines, but it
322      * may not be true for 32bit machines. We need this padding to
323      * make sure the migration can survive even between 32bit and
324      * 64bit machines.
325      */
326     size = ROUND_UP(size, 8);
327 
328     qemu_put_be64(file, size);
329     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
330     /*
 331      * Mark the end, so that the destination can detect if the middle
 332      * part got corrupted for some "mysterious" reason.
333      */
334     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
335     qemu_fflush(file);
336 
337     g_free(le_bitmap);
338 
339     if (qemu_file_get_error(file)) {
340         return qemu_file_get_error(file);
341     }
342 
343     return size + sizeof(size);
344 }
345 
346 /*
347  * An outstanding page request, on the source, having been received
348  * and queued
349  */
350 struct RAMSrcPageRequest {
351     RAMBlock *rb;
352     hwaddr    offset;
353     hwaddr    len;
354 
355     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
356 };
357 
358 /* State of RAM for migration */
359 struct RAMState {
360     /*
 361      * PageSearchStatus structures for the channels used when sending pages.
362      * Protected by the bitmap_mutex.
363      */
364     PageSearchStatus pss[RAM_CHANNEL_MAX];
365     /* UFFD file descriptor, used in 'write-tracking' migration */
366     int uffdio_fd;
367     /* total ram size in bytes */
368     uint64_t ram_bytes_total;
369     /* Last block that we have visited searching for dirty pages */
370     RAMBlock *last_seen_block;
371     /* Last dirty target page we have sent */
372     ram_addr_t last_page;
373     /* last ram version we have seen */
374     uint32_t last_version;
375     /* How many times we have dirty too many pages */
376     int dirty_rate_high_cnt;
377     /* these variables are used for bitmap sync */
378     /* last time we did a full bitmap_sync */
379     int64_t time_last_bitmap_sync;
380     /* bytes transferred at start_time */
381     uint64_t bytes_xfer_prev;
382     /* number of dirty pages since start_time */
383     uint64_t num_dirty_pages_period;
384     /* xbzrle misses since the beginning of the period */
385     uint64_t xbzrle_cache_miss_prev;
 386     /* Number of xbzrle pages since the beginning of the period */
 387     uint64_t xbzrle_pages_prev;
 388     /* Number of xbzrle encoded bytes since the beginning of the period */
389     uint64_t xbzrle_bytes_prev;
390     /* Start using XBZRLE (e.g., after the first round). */
391     bool xbzrle_enabled;
392     /* Are we on the last stage of migration */
393     bool last_stage;
394     /* compression statistics since the beginning of the period */
 395     /* number of times there was no free thread to compress data */
 396     uint64_t compress_thread_busy_prev;
 397     /* number of bytes after compression */
 398     uint64_t compressed_size_prev;
 399     /* number of compressed pages */
400     uint64_t compress_pages_prev;
401 
402     /* total handled target pages at the beginning of period */
403     uint64_t target_page_count_prev;
404     /* total handled target pages since start */
405     uint64_t target_page_count;
406     /* number of dirty bits in the bitmap */
407     uint64_t migration_dirty_pages;
408     /*
409      * Protects:
410      * - dirty/clear bitmap
411      * - migration_dirty_pages
412      * - pss structures
413      */
414     QemuMutex bitmap_mutex;
415     /* The RAMBlock used in the last src_page_requests */
416     RAMBlock *last_req_rb;
417     /* Queue of outstanding page requests from the destination */
418     QemuMutex src_page_req_mutex;
419     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
420 };
421 typedef struct RAMState RAMState;
422 
423 static RAMState *ram_state;
424 
425 static NotifierWithReturnList precopy_notifier_list;
426 
427 /* Whether postcopy has queued requests */
428 static bool postcopy_has_request(RAMState *rs)
429 {
430     return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
431 }
432 
433 void precopy_infrastructure_init(void)
434 {
435     notifier_with_return_list_init(&precopy_notifier_list);
436 }
437 
438 void precopy_add_notifier(NotifierWithReturn *n)
439 {
440     notifier_with_return_list_add(&precopy_notifier_list, n);
441 }
442 
443 void precopy_remove_notifier(NotifierWithReturn *n)
444 {
445     notifier_with_return_remove(n);
446 }
447 
448 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
449 {
450     PrecopyNotifyData pnd;
451     pnd.reason = reason;
452     pnd.errp = errp;
453 
454     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
455 }
456 
457 uint64_t ram_bytes_remaining(void)
458 {
459     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
460                        0;
461 }
462 
463 RAMStats ram_counters;
464 
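/*
 * Account migrated bytes both to the total and to a phase-specific
 * counter: precopy bytes while the guest is still running, postcopy
 * bytes after the postcopy switchover, and downtime bytes while the
 * guest is stopped at the end of precopy.
 */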
465 void ram_transferred_add(uint64_t bytes)
466 {
467     if (runstate_is_running()) {
468         stat64_add(&ram_counters.precopy_bytes, bytes);
469     } else if (migration_in_postcopy()) {
470         stat64_add(&ram_counters.postcopy_bytes, bytes);
471     } else {
472         stat64_add(&ram_counters.downtime_bytes, bytes);
473     }
474     stat64_add(&ram_counters.transferred, bytes);
475 }
476 
477 struct MigrationOps {
478     int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
479 };
480 typedef struct MigrationOps MigrationOps;
481 
482 MigrationOps *migration_ops;
483 
484 CompressionStats compression_counters;
485 
486 struct CompressParam {
487     bool done;
488     bool quit;
489     bool zero_page;
490     QEMUFile *file;
491     QemuMutex mutex;
492     QemuCond cond;
493     RAMBlock *block;
494     ram_addr_t offset;
495 
496     /* internally used fields */
497     z_stream stream;
498     uint8_t *originbuf;
499 };
500 typedef struct CompressParam CompressParam;
501 
502 struct DecompressParam {
503     bool done;
504     bool quit;
505     QemuMutex mutex;
506     QemuCond cond;
507     void *des;
508     uint8_t *compbuf;
509     int len;
510     z_stream stream;
511 };
512 typedef struct DecompressParam DecompressParam;
513 
514 static CompressParam *comp_param;
515 static QemuThread *compress_threads;
516 /* comp_done_cond is used to wake up the migration thread when
517  * one of the compression threads has finished the compression.
 518  * comp_done_lock is used together with comp_done_cond.
519  */
520 static QemuMutex comp_done_lock;
521 static QemuCond comp_done_cond;
522 
523 static QEMUFile *decomp_file;
524 static DecompressParam *decomp_param;
525 static QemuThread *decompress_threads;
526 static QemuMutex decomp_done_lock;
527 static QemuCond decomp_done_cond;
528 
529 static int ram_save_host_page_urgent(PageSearchStatus *pss);
530 
531 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
532                                  ram_addr_t offset, uint8_t *source_buf);
533 
534 /* NOTE: page is the PFN not real ram_addr_t. */
535 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
536 {
537     pss->block = rb;
538     pss->page = page;
539     pss->complete_round = false;
540 }
541 
542 /*
543  * Check whether two PSSs are actively sending the same page.  Return true
544  * if it is, false otherwise.
545  */
546 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
547 {
548     return pss1->host_page_sending && pss2->host_page_sending &&
549         (pss1->host_page_start == pss2->host_page_start);
550 }
551 
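/*
 * Worker loop for one compression thread.  The main thread hands over a
 * request by setting param->block/offset under param->mutex and signalling
 * param->cond (see compress_page_with_multi_thread()); the worker then
 * compresses the page into param->file, marks itself done under
 * comp_done_lock and wakes any waiter on comp_done_cond.
 */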
552 static void *do_data_compress(void *opaque)
553 {
554     CompressParam *param = opaque;
555     RAMBlock *block;
556     ram_addr_t offset;
557     bool zero_page;
558 
559     qemu_mutex_lock(&param->mutex);
560     while (!param->quit) {
561         if (param->block) {
562             block = param->block;
563             offset = param->offset;
564             param->block = NULL;
565             qemu_mutex_unlock(&param->mutex);
566 
567             zero_page = do_compress_ram_page(param->file, &param->stream,
568                                              block, offset, param->originbuf);
569 
570             qemu_mutex_lock(&comp_done_lock);
571             param->done = true;
572             param->zero_page = zero_page;
573             qemu_cond_signal(&comp_done_cond);
574             qemu_mutex_unlock(&comp_done_lock);
575 
576             qemu_mutex_lock(&param->mutex);
577         } else {
578             qemu_cond_wait(&param->cond, &param->mutex);
579         }
580     }
581     qemu_mutex_unlock(&param->mutex);
582 
583     return NULL;
584 }
585 
586 static void compress_threads_save_cleanup(void)
587 {
588     int i, thread_count;
589 
590     if (!migrate_compress() || !comp_param) {
591         return;
592     }
593 
594     thread_count = migrate_compress_threads();
595     for (i = 0; i < thread_count; i++) {
596         /*
 597          * we use it as an indicator of whether the thread was
 598          * properly initialized or not
599          */
600         if (!comp_param[i].file) {
601             break;
602         }
603 
604         qemu_mutex_lock(&comp_param[i].mutex);
605         comp_param[i].quit = true;
606         qemu_cond_signal(&comp_param[i].cond);
607         qemu_mutex_unlock(&comp_param[i].mutex);
608 
609         qemu_thread_join(compress_threads + i);
610         qemu_mutex_destroy(&comp_param[i].mutex);
611         qemu_cond_destroy(&comp_param[i].cond);
612         deflateEnd(&comp_param[i].stream);
613         g_free(comp_param[i].originbuf);
614         qemu_fclose(comp_param[i].file);
615         comp_param[i].file = NULL;
616     }
617     qemu_mutex_destroy(&comp_done_lock);
618     qemu_cond_destroy(&comp_done_cond);
619     g_free(compress_threads);
620     g_free(comp_param);
621     compress_threads = NULL;
622     comp_param = NULL;
623 }
624 
625 static int compress_threads_save_setup(void)
626 {
627     int i, thread_count;
628 
629     if (!migrate_compress()) {
630         return 0;
631     }
632     thread_count = migrate_compress_threads();
633     compress_threads = g_new0(QemuThread, thread_count);
634     comp_param = g_new0(CompressParam, thread_count);
635     qemu_cond_init(&comp_done_cond);
636     qemu_mutex_init(&comp_done_lock);
637     for (i = 0; i < thread_count; i++) {
638         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
639         if (!comp_param[i].originbuf) {
640             goto exit;
641         }
642 
643         if (deflateInit(&comp_param[i].stream,
644                         migrate_compress_level()) != Z_OK) {
645             g_free(comp_param[i].originbuf);
646             goto exit;
647         }
648 
 649         /* comp_param[i].file is just used as a dummy buffer to save
 650          * data; back it with a null channel that performs no real I/O.
 651          */
652         comp_param[i].file = qemu_file_new_output(
653             QIO_CHANNEL(qio_channel_null_new()));
654         comp_param[i].done = true;
655         comp_param[i].quit = false;
656         qemu_mutex_init(&comp_param[i].mutex);
657         qemu_cond_init(&comp_param[i].cond);
658         qemu_thread_create(compress_threads + i, "compress",
659                            do_data_compress, comp_param + i,
660                            QEMU_THREAD_JOINABLE);
661     }
662     return 0;
663 
664 exit:
665     compress_threads_save_cleanup();
666     return -1;
667 }
668 
669 /**
670  * save_page_header: write page header to wire
671  *
 672  * If the block differs from the last one sent, it also writes its idstr
673  *
674  * Returns the number of bytes written
675  *
676  * @pss: current PSS channel status
677  * @block: block that contains the page we want to send
678  * @offset: offset inside the block for the page
679  *          in the lower bits, it contains flags
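 *
 * The resulting wire format is: an 8 byte big endian word holding
 * offset|flags, optionally followed by a 1 byte idstr length and the
 * idstr itself when the block differs from the last one sent.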
680  */
681 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
682                                RAMBlock *block, ram_addr_t offset)
683 {
684     size_t size, len;
685     bool same_block = (block == pss->last_sent_block);
686 
687     if (same_block) {
688         offset |= RAM_SAVE_FLAG_CONTINUE;
689     }
690     qemu_put_be64(f, offset);
691     size = 8;
692 
693     if (!same_block) {
694         len = strlen(block->idstr);
695         qemu_put_byte(f, len);
696         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
697         size += 1 + len;
698         pss->last_sent_block = block;
699     }
700     return size;
701 }
702 
703 /**
704  * mig_throttle_guest_down: throttle down the guest
705  *
706  * Reduce amount of guest cpu execution to hopefully slow down memory
707  * writes. If guest dirty memory rate is reduced below the rate at
708  * which we can transfer pages to the destination then we should be
709  * able to complete migration. Some workloads dirty memory way too
710  * fast and will not effectively converge, even with auto-converge.
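 *
 * Worked example with pct_tailslow set: if the throttle is currently at
 * 40%, cpu_now is 60.  If the guest dirtied twice the dirty-bytes
 * threshold during the period, bytes_dirty_threshold / bytes_dirty_period
 * is 0.5, so cpu_ideal is 30 and the throttle is raised by
 * MIN(60 - 30, pct_increment).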
711  */
712 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
713                                     uint64_t bytes_dirty_threshold)
714 {
715     uint64_t pct_initial = migrate_cpu_throttle_initial();
716     uint64_t pct_increment = migrate_cpu_throttle_increment();
717     bool pct_tailslow = migrate_cpu_throttle_tailslow();
718     int pct_max = migrate_max_cpu_throttle();
719 
720     uint64_t throttle_now = cpu_throttle_get_percentage();
721     uint64_t cpu_now, cpu_ideal, throttle_inc;
722 
723     /* We have not started throttling yet. Let's start it. */
724     if (!cpu_throttle_active()) {
725         cpu_throttle_set(pct_initial);
726     } else {
727         /* Throttling already on, just increase the rate */
728         if (!pct_tailslow) {
729             throttle_inc = pct_increment;
730         } else {
 731             /* Compute the ideal CPU percentage for the guest, i.e. one
 732              * that would make the dirty rate match the dirty threshold. */
733             cpu_now = 100 - throttle_now;
734             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
735                         bytes_dirty_period);
736             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
737         }
738         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
739     }
740 }
741 
742 void mig_throttle_counter_reset(void)
743 {
744     RAMState *rs = ram_state;
745 
746     rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
747     rs->num_dirty_pages_period = 0;
748     rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
749 }
750 
751 /**
752  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
753  *
754  * @rs: current RAM state
755  * @current_addr: address for the zero page
756  *
757  * Update the xbzrle cache to reflect a page that's been sent as all 0.
758  * The important thing is that a stale (not-yet-0'd) page be replaced
759  * by the new data.
760  * As a bonus, if the page wasn't in the cache it gets added so that
761  * when a small write is made into the 0'd page it gets XBZRLE sent.
762  */
763 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
764 {
765     /* We don't care if this fails to allocate a new cache page
766      * as long as it updated an old one */
767     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
768                  stat64_get(&ram_counters.dirty_sync_count));
769 }
770 
771 #define ENCODING_FLAG_XBZRLE 0x1
772 
773 /**
774  * save_xbzrle_page: compress and send current page
775  *
776  * Returns: 1 means that we wrote the page
777  *          0 means that page is identical to the one already sent
778  *          -1 means that xbzrle would be longer than normal
779  *
780  * @rs: current RAM state
781  * @pss: current PSS channel
782  * @current_data: pointer to the address of the page contents
783  * @current_addr: addr of the page
784  * @block: block that contains the page we want to send
785  * @offset: offset inside the block for the page
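 *
 * On the wire an XBZRLE page is the usual page header with
 * RAM_SAVE_FLAG_XBZRLE set, followed by one ENCODING_FLAG_XBZRLE byte, a
 * big endian 16 bit encoded length, and then the encoded bytes.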
786  */
787 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
788                             uint8_t **current_data, ram_addr_t current_addr,
789                             RAMBlock *block, ram_addr_t offset)
790 {
791     int encoded_len = 0, bytes_xbzrle;
792     uint8_t *prev_cached_page;
793     QEMUFile *file = pss->pss_channel;
794     uint64_t generation = stat64_get(&ram_counters.dirty_sync_count);
795 
796     if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
797         xbzrle_counters.cache_miss++;
798         if (!rs->last_stage) {
799             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
800                              generation) == -1) {
801                 return -1;
802             } else {
803                 /* update *current_data when the page has been
804                    inserted into cache */
805                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
806             }
807         }
808         return -1;
809     }
810 
811     /*
812      * Reaching here means the page has hit the xbzrle cache, no matter what
813      * encoding result it is (normal encoding, overflow or skipping the page),
814      * count the page as encoded. This is used to calculate the encoding rate.
815      *
816      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
817      * 2nd page turns out to be skipped (i.e. no new bytes written to the
818      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
819      * skipped page included. In this way, the encoding rate can tell if the
820      * guest page is good for xbzrle encoding.
821      */
822     xbzrle_counters.pages++;
823     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
824 
825     /* save current buffer into memory */
826     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
827 
828     /* XBZRLE encoding (if there is no overflow) */
829     encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
830                                             TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
831                                             TARGET_PAGE_SIZE);
832 
833     /*
834      * Update the cache contents, so that it corresponds to the data
835      * sent, in all cases except where we skip the page.
836      */
837     if (!rs->last_stage && encoded_len != 0) {
838         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
839         /*
840          * In the case where we couldn't compress, ensure that the caller
841          * sends the data from the cache, since the guest might have
842          * changed the RAM since we copied it.
843          */
844         *current_data = prev_cached_page;
845     }
846 
847     if (encoded_len == 0) {
848         trace_save_xbzrle_page_skipping();
849         return 0;
850     } else if (encoded_len == -1) {
851         trace_save_xbzrle_page_overflow();
852         xbzrle_counters.overflow++;
853         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
854         return -1;
855     }
856 
857     /* Send XBZRLE based compressed page */
858     bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
859                                     offset | RAM_SAVE_FLAG_XBZRLE);
860     qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
861     qemu_put_be16(file, encoded_len);
862     qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
863     bytes_xbzrle += encoded_len + 1 + 2;
864     /*
865      * Like compressed_size (please see update_compress_thread_counts),
866      * the xbzrle encoded bytes don't count the 8 byte header with
867      * RAM_SAVE_FLAG_CONTINUE.
868      */
869     xbzrle_counters.bytes += bytes_xbzrle - 8;
870     ram_transferred_add(bytes_xbzrle);
871 
872     return 1;
873 }
874 
875 /**
876  * pss_find_next_dirty: find the next dirty page of current ramblock
877  *
878  * This function updates pss->page to point to the next dirty page index
879  * within the ramblock to migrate, or the end of ramblock when nothing
880  * found.  Note that when pss->host_page_sending==true it means we're
881  * during sending a host page, so we won't look for dirty page that is
882  * outside the host page boundary.
883  *
884  * @pss: the current page search status
885  */
886 static void pss_find_next_dirty(PageSearchStatus *pss)
887 {
888     RAMBlock *rb = pss->block;
889     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
890     unsigned long *bitmap = rb->bmap;
891 
892     if (ramblock_is_ignored(rb)) {
893         /* Points directly to the end, so we know no dirty page */
894         pss->page = size;
895         return;
896     }
897 
898     /*
 899      * If we are currently sending a host page, only look for dirty
 900      * pages within the host page being sent.
901      */
902     if (pss->host_page_sending) {
903         assert(pss->host_page_end);
904         size = MIN(size, pss->host_page_end);
905     }
906 
907     pss->page = find_next_bit(bitmap, size, pss->page);
908 }
909 
910 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
911                                                        unsigned long page)
912 {
913     uint8_t shift;
914     hwaddr size, start;
915 
916     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
917         return;
918     }
919 
920     shift = rb->clear_bmap_shift;
921     /*
 922      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this.  It also
 923      * makes things easier since the start address of each small
 924      * chunk is then always aligned to 64 pages, so the bitmap is
 925      * always aligned to unsigned long.  We could probably even
 926      * remove this restriction, but it is kept here for
 927      * simplicity.
928      */
929     assert(shift >= 6);
930 
931     size = 1ULL << (TARGET_PAGE_BITS + shift);
932     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
933     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
934     memory_region_clear_dirty_bitmap(rb->mr, start, size);
935 }
936 
937 static void
938 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
939                                                  unsigned long start,
940                                                  unsigned long npages)
941 {
942     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
943     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
944     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
945 
946     /*
947      * Clear pages from start to start + npages - 1, so the end boundary is
948      * exclusive.
949      */
950     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
951         migration_clear_memory_region_dirty_bitmap(rb, i);
952     }
953 }
954 
955 /*
 956  * colo_bitmap_find_dirty: find contiguous dirty pages from start
 957  *
 958  * Returns the page offset within the memory region of the start of the
 959  * contiguous dirty pages
960  *
961  * @rs: current RAM state
962  * @rb: RAMBlock where to search for dirty pages
963  * @start: page where we start the search
964  * @num: the number of contiguous dirty pages
965  */
966 static inline
967 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
968                                      unsigned long start, unsigned long *num)
969 {
970     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
971     unsigned long *bitmap = rb->bmap;
972     unsigned long first, next;
973 
974     *num = 0;
975 
976     if (ramblock_is_ignored(rb)) {
977         return size;
978     }
979 
980     first = find_next_bit(bitmap, size, start);
981     if (first >= size) {
982         return first;
983     }
984     next = find_next_zero_bit(bitmap, size, first + 1);
985     assert(next >= first);
986     *num = next - first;
987     return first;
988 }
989 
990 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
991                                                 RAMBlock *rb,
992                                                 unsigned long page)
993 {
994     bool ret;
995 
996     /*
 997      * Clear the dirty bitmap if needed.  This _must_ be called before we
 998      * send any page in the chunk, because we need to make sure we can
 999      * capture further page content changes when we sync the dirty log
 1000      * the next time.  So as long as we are going to send any page in
 1001      * the chunk, we clear the remote dirty bitmap for the whole chunk.
 1002      * Clearing it earlier won't be a problem, but too late will.
1003      */
1004     migration_clear_memory_region_dirty_bitmap(rb, page);
1005 
1006     ret = test_and_clear_bit(page, rb->bmap);
1007     if (ret) {
1008         rs->migration_dirty_pages--;
1009     }
1010 
1011     return ret;
1012 }
1013 
1014 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
1015                                        void *opaque)
1016 {
1017     const hwaddr offset = section->offset_within_region;
1018     const hwaddr size = int128_get64(section->size);
1019     const unsigned long start = offset >> TARGET_PAGE_BITS;
1020     const unsigned long npages = size >> TARGET_PAGE_BITS;
1021     RAMBlock *rb = section->mr->ram_block;
1022     uint64_t *cleared_bits = opaque;
1023 
1024     /*
1025      * We don't grab ram_state->bitmap_mutex because we expect to run
1026      * only when starting migration or during postcopy recovery where
1027      * we don't have concurrent access.
1028      */
1029     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
1030         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
1031     }
1032     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
1033     bitmap_clear(rb->bmap, start, npages);
1034 }
1035 
1036 /*
1037  * Exclude all dirty pages from migration that fall into a discarded range as
1038  * managed by a RamDiscardManager responsible for the mapped memory region of
1039  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1040  *
1041  * Discarded pages ("logically unplugged") have undefined content and must
1042  * not get migrated, because even reading these pages for migration might
1043  * result in undesired behavior.
1044  *
1045  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1046  *
1047  * Note: The result is only stable while migrating (precopy/postcopy).
1048  */
1049 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1050 {
1051     uint64_t cleared_bits = 0;
1052 
1053     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1054         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1055         MemoryRegionSection section = {
1056             .mr = rb->mr,
1057             .offset_within_region = 0,
1058             .size = int128_make64(qemu_ram_get_used_length(rb)),
1059         };
1060 
1061         ram_discard_manager_replay_discarded(rdm, &section,
1062                                              dirty_bitmap_clear_section,
1063                                              &cleared_bits);
1064     }
1065     return cleared_bits;
1066 }
1067 
1068 /*
1069  * Check if a host-page aligned page falls into a discarded range as managed by
1070  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1071  *
1072  * Note: The result is only stable while migrating (precopy/postcopy).
1073  */
1074 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1075 {
1076     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1077         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1078         MemoryRegionSection section = {
1079             .mr = rb->mr,
1080             .offset_within_region = start,
1081             .size = int128_make64(qemu_ram_pagesize(rb)),
1082         };
1083 
1084         return !ram_discard_manager_is_populated(rdm, &section);
1085     }
1086     return false;
1087 }
1088 
1089 /* Called with RCU critical section */
1090 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1091 {
1092     uint64_t new_dirty_pages =
1093         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1094 
1095     rs->migration_dirty_pages += new_dirty_pages;
1096     rs->num_dirty_pages_period += new_dirty_pages;
1097 }
1098 
1099 /**
1100  * ram_pagesize_summary: calculate all the pagesizes of a VM
1101  *
1102  * Returns a summary bitmap of the page sizes of all RAMBlocks
1103  *
1104  * For VMs with just normal pages this is equivalent to the host page
1105  * size. If it has some huge pages then it's the OR of all the
1106  * different page sizes.
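 *
 * For example, a VM backed by 4 KiB normal pages plus 2 MiB hugepages
 * would report 0x1000 | 0x200000 = 0x201000.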
1107  */
1108 uint64_t ram_pagesize_summary(void)
1109 {
1110     RAMBlock *block;
1111     uint64_t summary = 0;
1112 
1113     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1114         summary |= block->page_size;
1115     }
1116 
1117     return summary;
1118 }
1119 
1120 uint64_t ram_get_total_transferred_pages(void)
1121 {
1122     return stat64_get(&ram_counters.normal_pages) +
1123         stat64_get(&ram_counters.zero_pages) +
1124         compression_counters.pages + xbzrle_counters.pages;
1125 }
1126 
1127 static void migration_update_rates(RAMState *rs, int64_t end_time)
1128 {
1129     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1130     double compressed_size;
1131 
1132     /* calculate period counters */
1133     stat64_set(&ram_counters.dirty_pages_rate,
1134                rs->num_dirty_pages_period * 1000 /
1135                (end_time - rs->time_last_bitmap_sync));
1136 
1137     if (!page_count) {
1138         return;
1139     }
1140 
1141     if (migrate_xbzrle()) {
1142         double encoded_size, unencoded_size;
1143 
1144         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1145             rs->xbzrle_cache_miss_prev) / page_count;
1146         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1147         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1148                          TARGET_PAGE_SIZE;
1149         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1150         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1151             xbzrle_counters.encoding_rate = 0;
1152         } else {
1153             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1154         }
1155         rs->xbzrle_pages_prev = xbzrle_counters.pages;
1156         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1157     }
1158 
1159     if (migrate_compress()) {
1160         compression_counters.busy_rate = (double)(compression_counters.busy -
1161             rs->compress_thread_busy_prev) / page_count;
1162         rs->compress_thread_busy_prev = compression_counters.busy;
1163 
1164         compressed_size = compression_counters.compressed_size -
1165                           rs->compressed_size_prev;
1166         if (compressed_size) {
1167             double uncompressed_size = (compression_counters.pages -
1168                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1169 
1170             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1171             compression_counters.compression_rate =
1172                                         uncompressed_size / compressed_size;
1173 
1174             rs->compress_pages_prev = compression_counters.pages;
1175             rs->compressed_size_prev = compression_counters.compressed_size;
1176         }
1177     }
1178 }
1179 
1180 static void migration_trigger_throttle(RAMState *rs)
1181 {
1182     uint64_t threshold = migrate_throttle_trigger_threshold();
1183     uint64_t bytes_xfer_period =
1184         stat64_get(&ram_counters.transferred) - rs->bytes_xfer_prev;
1185     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1186     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1187 
1188     /* During block migration the auto-converge logic incorrectly detects
1189      * that ram migration makes no progress. Avoid this by disabling the
1190      * throttling logic during the bulk phase of block migration. */
1191     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1192         /* The following detection logic can be refined later. For now:
1193            Check to see if the ratio between dirtied bytes and the approx.
1194            amount of bytes that just got transferred since the last time
1195            we were in this routine reaches the threshold. If that happens
1196            twice, start or increase throttling. */
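        /*
         * For example, with a trigger threshold of 50, throttling starts
         * (or is increased) once the guest dirties more than half as many
         * bytes as were transferred during the period, for two consecutive
         * periods.
         */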
1197 
1198         if ((bytes_dirty_period > bytes_dirty_threshold) &&
1199             (++rs->dirty_rate_high_cnt >= 2)) {
1200             trace_migration_throttle();
1201             rs->dirty_rate_high_cnt = 0;
1202             mig_throttle_guest_down(bytes_dirty_period,
1203                                     bytes_dirty_threshold);
1204         }
1205     }
1206 }
1207 
1208 static void migration_bitmap_sync(RAMState *rs)
1209 {
1210     RAMBlock *block;
1211     int64_t end_time;
1212 
1213     stat64_add(&ram_counters.dirty_sync_count, 1);
1214 
1215     if (!rs->time_last_bitmap_sync) {
1216         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1217     }
1218 
1219     trace_migration_bitmap_sync_start();
1220     memory_global_dirty_log_sync();
1221 
1222     qemu_mutex_lock(&rs->bitmap_mutex);
1223     WITH_RCU_READ_LOCK_GUARD() {
1224         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1225             ramblock_sync_dirty_bitmap(rs, block);
1226         }
1227         stat64_set(&ram_counters.dirty_bytes_last_sync, ram_bytes_remaining());
1228     }
1229     qemu_mutex_unlock(&rs->bitmap_mutex);
1230 
1231     memory_global_after_dirty_log_sync();
1232     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1233 
1234     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1235 
1236     /* more than 1 second = 1000 milliseconds */
1237     if (end_time > rs->time_last_bitmap_sync + 1000) {
1238         migration_trigger_throttle(rs);
1239 
1240         migration_update_rates(rs, end_time);
1241 
1242         rs->target_page_count_prev = rs->target_page_count;
1243 
1244         /* reset period counters */
1245         rs->time_last_bitmap_sync = end_time;
1246         rs->num_dirty_pages_period = 0;
1247         rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
1248     }
1249     if (migrate_events()) {
1250         uint64_t generation = stat64_get(&ram_counters.dirty_sync_count);
1251         qapi_event_send_migration_pass(generation);
1252     }
1253 }
1254 
1255 static void migration_bitmap_sync_precopy(RAMState *rs)
1256 {
1257     Error *local_err = NULL;
1258 
1259     /*
1260      * The current notifier usage is just an optimization for migration, so we
1261      * don't stop the normal migration process in the error case.
1262      */
1263     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1264         error_report_err(local_err);
1265         local_err = NULL;
1266     }
1267 
1268     migration_bitmap_sync(rs);
1269 
1270     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1271         error_report_err(local_err);
1272     }
1273 }
1274 
1275 void ram_release_page(const char *rbname, uint64_t offset)
1276 {
1277     if (!migrate_release_ram() || !migration_in_postcopy()) {
1278         return;
1279     }
1280 
1281     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1282 }
1283 
1284 /**
1285  * save_zero_page_to_file: send the zero page to the file
1286  *
1287  * Returns the size of the data written to the file, or 0 if the page is
1288  * not a zero page
1289  *
1290  * @pss: current PSS channel
1291  * @block: block that contains the page we want to send
1292  * @offset: offset inside the block for the page
1293  */
1294 static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
1295                                   RAMBlock *block, ram_addr_t offset)
1296 {
1297     uint8_t *p = block->host + offset;
1298     int len = 0;
1299 
1300     if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1301         len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1302         qemu_put_byte(file, 0);
1303         len += 1;
1304         ram_release_page(block->idstr, offset);
1305     }
1306     return len;
1307 }
1308 
1309 /**
1310  * save_zero_page: send the zero page to the stream
1311  *
1312  * Returns the number of pages written (1), or -1 if the page is not zero.
1313  *
1314  * @pss: current PSS channel
1315  * @block: block that contains the page we want to send
1316  * @offset: offset inside the block for the page
1317  */
1318 static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
1319                           ram_addr_t offset)
1320 {
1321     int len = save_zero_page_to_file(pss, f, block, offset);
1322 
1323     if (len) {
1324         stat64_add(&ram_counters.zero_pages, 1);
1325         ram_transferred_add(len);
1326         return 1;
1327     }
1328     return -1;
1329 }
1330 
1331 /*
1332  * @pages: the number of pages written by the control path,
1333  *        < 0 - error
1334  *        > 0 - number of pages written
1335  *
1336  * Return true if the page has been saved, otherwise false is returned.
1337  */
1338 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1339                               ram_addr_t offset, int *pages)
1340 {
1341     uint64_t bytes_xmit = 0;
1342     int ret;
1343 
1344     *pages = -1;
1345     ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1346                                 TARGET_PAGE_SIZE, &bytes_xmit);
1347     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1348         return false;
1349     }
1350 
1351     if (bytes_xmit) {
1352         ram_transferred_add(bytes_xmit);
1353         *pages = 1;
1354     }
1355 
1356     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1357         return true;
1358     }
1359 
1360     if (bytes_xmit > 0) {
1361         stat64_add(&ram_counters.normal_pages, 1);
1362     } else if (bytes_xmit == 0) {
1363         stat64_add(&ram_counters.zero_pages, 1);
1364     }
1365 
1366     return true;
1367 }
1368 
1369 /*
1370  * directly send the page to the stream
1371  *
1372  * Returns the number of pages written.
1373  *
1374  * @pss: current PSS channel
1375  * @block: block that contains the page we want to send
1376  * @offset: offset inside the block for the page
1377  * @buf: the page to be sent
1378  * @async: send the page asynchronously
1379  */
1380 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1381                             ram_addr_t offset, uint8_t *buf, bool async)
1382 {
1383     QEMUFile *file = pss->pss_channel;
1384 
1385     ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
1386                                          offset | RAM_SAVE_FLAG_PAGE));
1387     if (async) {
1388         qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1389                               migrate_release_ram() &&
1390                               migration_in_postcopy());
1391     } else {
1392         qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1393     }
1394     ram_transferred_add(TARGET_PAGE_SIZE);
1395     stat64_add(&ram_counters.normal_pages, 1);
1396     return 1;
1397 }
1398 
1399 /**
1400  * ram_save_page: send the given page to the stream
1401  *
1402  * Returns the number of pages written.
1403  *          < 0 - error
1404  *          >=0 - Number of pages written - this might legally be 0
1405  *                if xbzrle noticed the page was the same.
1406  *
1407  * @rs: current RAM state
1408  * @pss: data about the page we want to send: the block that contains it
1409  *       and the offset of the page inside that block
1410  */
1411 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1412 {
1413     int pages = -1;
1414     uint8_t *p;
1415     bool send_async = true;
1416     RAMBlock *block = pss->block;
1417     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1418     ram_addr_t current_addr = block->offset + offset;
1419 
1420     p = block->host + offset;
1421     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1422 
1423     XBZRLE_cache_lock();
1424     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1425         pages = save_xbzrle_page(rs, pss, &p, current_addr,
1426                                  block, offset);
1427         if (!rs->last_stage) {
1428             /* Can't send this cached data async, since the cache page
1429              * might get updated before it gets to the wire
1430              */
1431             send_async = false;
1432         }
1433     }
1434 
1435     /* XBZRLE overflow or normal page */
1436     if (pages == -1) {
1437         pages = save_normal_page(pss, block, offset, p, send_async);
1438     }
1439 
1440     XBZRLE_cache_unlock();
1441 
1442     return pages;
1443 }
1444 
1445 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
1446                                  ram_addr_t offset)
1447 {
1448     if (multifd_queue_page(file, block, offset) < 0) {
1449         return -1;
1450     }
1451     stat64_add(&ram_counters.normal_pages, 1);
1452 
1453     return 1;
1454 }
1455 
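/*
 * Compress one target page into @f.  Returns true if the page turned out
 * to be a zero page and was sent as such instead, false otherwise.
 */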
1456 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1457                                  ram_addr_t offset, uint8_t *source_buf)
1458 {
1459     RAMState *rs = ram_state;
1460     PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
1461     uint8_t *p = block->host + offset;
1462     int ret;
1463 
1464     if (save_zero_page_to_file(pss, f, block, offset)) {
1465         return true;
1466     }
1467 
1468     save_page_header(pss, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1469 
1470     /*
1471      * Copy it to an internal buffer to avoid it being modified by the
1472      * VM, so that we can catch any error during compression and
1473      * decompression.
1474      */
1475     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1476     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1477     if (ret < 0) {
1478         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1479         error_report("compressed data failed!");
1480     }
1481     return false;
1482 }
1483 
1484 static void
1485 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1486 {
1487     ram_transferred_add(bytes_xmit);
1488 
1489     if (param->zero_page) {
1490         stat64_add(&ram_counters.zero_pages, 1);
1491         return;
1492     }
1493 
1494     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1495     compression_counters.compressed_size += bytes_xmit - 8;
1496     compression_counters.pages++;
1497 }
1498 
1499 static bool save_page_use_compression(RAMState *rs);
1500 
1501 static void flush_compressed_data(RAMState *rs)
1502 {
1503     MigrationState *ms = migrate_get_current();
1504     int idx, len, thread_count;
1505 
1506     if (!save_page_use_compression(rs)) {
1507         return;
1508     }
1509     thread_count = migrate_compress_threads();
1510 
1511     qemu_mutex_lock(&comp_done_lock);
1512     for (idx = 0; idx < thread_count; idx++) {
1513         while (!comp_param[idx].done) {
1514             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1515         }
1516     }
1517     qemu_mutex_unlock(&comp_done_lock);
1518 
1519     for (idx = 0; idx < thread_count; idx++) {
1520         qemu_mutex_lock(&comp_param[idx].mutex);
1521         if (!comp_param[idx].quit) {
1522             len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
1523             /*
1524              * it's safe to fetch zero_page without holding comp_done_lock
1525              * as there is no further request submitted to the thread,
1526              * i.e., the thread should be waiting for a request at this point.
1527              */
1528             update_compress_thread_counts(&comp_param[idx], len);
1529         }
1530         qemu_mutex_unlock(&comp_param[idx].mutex);
1531     }
1532 }
1533 
1534 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1535                                        ram_addr_t offset)
1536 {
1537     param->block = block;
1538     param->offset = offset;
1539 }
1540 
1541 static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
1542 {
1543     int idx, thread_count, bytes_xmit = -1, pages = -1;
1544     bool wait = migrate_compress_wait_thread();
1545     MigrationState *ms = migrate_get_current();
1546 
1547     thread_count = migrate_compress_threads();
1548     qemu_mutex_lock(&comp_done_lock);
1549 retry:
1550     for (idx = 0; idx < thread_count; idx++) {
1551         if (comp_param[idx].done) {
1552             comp_param[idx].done = false;
1553             bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
1554                                             comp_param[idx].file);
1555             qemu_mutex_lock(&comp_param[idx].mutex);
1556             set_compress_params(&comp_param[idx], block, offset);
1557             qemu_cond_signal(&comp_param[idx].cond);
1558             qemu_mutex_unlock(&comp_param[idx].mutex);
1559             pages = 1;
1560             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1561             break;
1562         }
1563     }
1564 
1565     /*
1566      * wait for the free thread if the user specifies 'compress-wait-thread',
1567      * otherwise we will post the page out in the main thread as a normal page.
1568      */
1569     if (pages < 0 && wait) {
1570         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1571         goto retry;
1572     }
1573     qemu_mutex_unlock(&comp_done_lock);
1574 
1575     return pages;
1576 }
1577 
1578 #define PAGE_ALL_CLEAN 0
1579 #define PAGE_TRY_AGAIN 1
1580 #define PAGE_DIRTY_FOUND 2
1581 /**
1582  * find_dirty_block: find the next dirty page and update any state
1583  * associated with the search process.
1584  *
1585  * Returns:
1586  *         <0: An error happened
1587  *         PAGE_ALL_CLEAN: no dirty page found, give up
1588  *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
1589  *         PAGE_DIRTY_FOUND: dirty page found
1590  *
1591  * @rs: current RAM state
1592  * @pss: data about the state of the current dirty page scan
1594  */
1595 static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
1596 {
1597     /* Update pss->page for the next dirty bit in ramblock */
1598     pss_find_next_dirty(pss);
1599 
1600     if (pss->complete_round && pss->block == rs->last_seen_block &&
1601         pss->page >= rs->last_page) {
1602         /*
1603          * We've been once around the RAM and haven't found anything.
1604          * Give up.
1605          */
1606         return PAGE_ALL_CLEAN;
1607     }
1608     if (!offset_in_ramblock(pss->block,
1609                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1610         /* Didn't find anything in this RAM Block */
1611         pss->page = 0;
1612         pss->block = QLIST_NEXT_RCU(pss->block, next);
1613         if (!pss->block) {
1614             if (!migrate_multifd_flush_after_each_section()) {
1615                 QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
1616                 int ret = multifd_send_sync_main(f);
1617                 if (ret < 0) {
1618                     return ret;
1619                 }
1620                 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
1621                 qemu_fflush(f);
1622             }
1623             /*
1624              * If memory migration starts over, we will meet a dirtied page
1625              * which may still exist in the compression threads' ring, so we
1626              * should flush the compressed data to make sure the new page
1627              * is not overwritten by the old one in the destination.
1628              *
1629              * Also, if xbzrle is on, stop using the data compression at this
1630              * point. In theory, xbzrle can do better than compression.
1631              */
1632             flush_compressed_data(rs);
1633 
1634             /* Hit the end of the list */
1635             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1636             /* Flag that we've looped */
1637             pss->complete_round = true;
1638             /* After the first round, enable XBZRLE. */
1639             if (migrate_xbzrle()) {
1640                 rs->xbzrle_enabled = true;
1641             }
1642         }
1643         /* Didn't find anything this time, but try again on the new block */
1644         return PAGE_TRY_AGAIN;
1645     } else {
1646         /* We've found something */
1647         return PAGE_DIRTY_FOUND;
1648     }
1649 }
1650 
1651 /**
1652  * unqueue_page: gets a page of the queue
1653  *
1654  * Helper for 'get_queued_page' - gets a page off the queue
1655  *
1656  * Returns the block of the page (or NULL if none available)
1657  *
1658  * @rs: current RAM state
1659  * @offset: used to return the offset within the RAMBlock
1660  */
1661 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1662 {
1663     struct RAMSrcPageRequest *entry;
1664     RAMBlock *block = NULL;
1665 
1666     if (!postcopy_has_request(rs)) {
1667         return NULL;
1668     }
1669 
1670     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1671 
1672     /*
1673      * This should _never_ change even after we take the lock, because no one
1674      * should be taking anything off the request list other than us.
1675      */
1676     assert(postcopy_has_request(rs));
1677 
1678     entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1679     block = entry->rb;
1680     *offset = entry->offset;
1681 
1682     if (entry->len > TARGET_PAGE_SIZE) {
1683         entry->len -= TARGET_PAGE_SIZE;
1684         entry->offset += TARGET_PAGE_SIZE;
1685     } else {
1686         memory_region_unref(block->mr);
1687         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1688         g_free(entry);
1689         migration_consume_urgent_request();
1690     }
1691 
1692     return block;
1693 }
1694 
1695 #if defined(__linux__)
1696 /**
1697  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1698  *   is found, return RAM block pointer and page offset
1699  *
1700  * Returns pointer to the RAMBlock containing faulting page,
1701  *   NULL if no write faults are pending
1702  *
1703  * @rs: current RAM state
1704  * @offset: page offset from the beginning of the block
1705  */
1706 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1707 {
1708     struct uffd_msg uffd_msg;
1709     void *page_address;
1710     RAMBlock *block;
1711     int res;
1712 
1713     if (!migrate_background_snapshot()) {
1714         return NULL;
1715     }
1716 
1717     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1718     if (res <= 0) {
1719         return NULL;
1720     }
1721 
1722     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1723     block = qemu_ram_block_from_host(page_address, false, offset);
1724     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1725     return block;
1726 }
1727 
1728 /**
1729  * ram_save_release_protection: release UFFD write protection after
1730  *   a range of pages has been saved
1731  *
1732  * @rs: current RAM state
1733  * @pss: page-search-status structure
1734  * @start_page: index of the first page in the range relative to pss->block
1735  *
1736  * Returns 0 on success, negative value in case of an error
1737  */
1738 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1739         unsigned long start_page)
1740 {
1741     int res = 0;
1742 
1743     /* Check if page is from UFFD-managed region. */
1744     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1745         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1746         uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1747 
1748         /* Flush async buffers before un-protect. */
1749         qemu_fflush(pss->pss_channel);
1750         /* Un-protect memory range. */
1751         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1752                 false, false);
1753     }
1754 
1755     return res;
1756 }
1757 
1758 /* ram_write_tracking_available: check if the kernel supports the required UFFD features
1759  *
1760  * Returns true if supported, false otherwise
1761  */
1762 bool ram_write_tracking_available(void)
1763 {
1764     uint64_t uffd_features;
1765     int res;
1766 
1767     res = uffd_query_features(&uffd_features);
1768     return (res == 0 &&
1769             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1770 }
1771 
1772 /* ram_write_tracking_compatible: check if guest configuration is
1773  *   compatible with 'write-tracking'
1774  *
1775  * Returns true if compatible, false otherwise
1776  */
1777 bool ram_write_tracking_compatible(void)
1778 {
1779     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1780     int uffd_fd;
1781     RAMBlock *block;
1782     bool ret = false;
1783 
1784     /* Open UFFD file descriptor */
1785     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1786     if (uffd_fd < 0) {
1787         return false;
1788     }
1789 
1790     RCU_READ_LOCK_GUARD();
1791 
1792     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1793         uint64_t uffd_ioctls;
1794 
1795         /* Nothing to do with read-only and MMIO-writable regions */
1796         if (block->mr->readonly || block->mr->rom_device) {
1797             continue;
1798         }
1799         /* Try to register block memory via UFFD-IO to track writes */
1800         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1801                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1802             goto out;
1803         }
1804         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1805             goto out;
1806         }
1807     }
1808     ret = true;
1809 
1810 out:
1811     uffd_close_fd(uffd_fd);
1812     return ret;
1813 }
1814 
1815 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1816                                        ram_addr_t size)
1817 {
1818     const ram_addr_t end = offset + size;
1819 
1820     /*
1821      * We read one byte of each page; this will preallocate page tables if
1822      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1823      * where no page was populated yet. This might require adaption when
1824      * supporting other mappings, like shmem.
1825      */
1826     for (; offset < end; offset += block->page_size) {
1827         char tmp = *((char *)block->host + offset);
1828 
1829         /* Don't optimize the read out */
1830         asm volatile("" : "+r" (tmp));
1831     }
1832 }
1833 
1834 static inline int populate_read_section(MemoryRegionSection *section,
1835                                         void *opaque)
1836 {
1837     const hwaddr size = int128_get64(section->size);
1838     hwaddr offset = section->offset_within_region;
1839     RAMBlock *block = section->mr->ram_block;
1840 
1841     populate_read_range(block, offset, size);
1842     return 0;
1843 }
1844 
1845 /*
1846  * ram_block_populate_read: preallocate page tables and populate pages in the
1847  *   RAM block by reading a byte of each page.
1848  *
1849  * Since it's solely used for userfault_fd WP feature, here we just
1850  *   hardcode page size to qemu_real_host_page_size.
1851  *
1852  * @block: RAM block to populate
1853  */
1854 static void ram_block_populate_read(RAMBlock *rb)
1855 {
1856     /*
1857      * Skip populating all pages that fall into a discarded range as managed by
1858      * a RamDiscardManager responsible for the mapped memory region of the
1859      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1860      * must not get populated automatically. We don't have to track
1861      * modifications via userfaultfd WP reliably, because these pages will
1862      * not be part of the migration stream either way -- see
1863      * ramblock_dirty_bitmap_exclude_discarded_pages().
1864      *
1865      * Note: The result is only stable while migrating (precopy/postcopy).
1866      */
1867     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1868         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1869         MemoryRegionSection section = {
1870             .mr = rb->mr,
1871             .offset_within_region = 0,
1872             .size = rb->mr->size,
1873         };
1874 
1875         ram_discard_manager_replay_populated(rdm, &section,
1876                                              populate_read_section, NULL);
1877     } else {
1878         populate_read_range(rb, 0, rb->used_length);
1879     }
1880 }
1881 
1882 /*
1883  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1884  */
1885 void ram_write_tracking_prepare(void)
1886 {
1887     RAMBlock *block;
1888 
1889     RCU_READ_LOCK_GUARD();
1890 
1891     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1892         /* Nothing to do with read-only and MMIO-writable regions */
1893         if (block->mr->readonly || block->mr->rom_device) {
1894             continue;
1895         }
1896 
1897         /*
1898          * Populate pages of the RAM block before enabling userfault_fd
1899          * write protection.
1900          *
1901          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1902          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1903          * pages with pte_none() entries in page table.
1904          */
1905         ram_block_populate_read(block);
1906     }
1907 }
1908 
1909 static inline int uffd_protect_section(MemoryRegionSection *section,
1910                                        void *opaque)
1911 {
1912     const hwaddr size = int128_get64(section->size);
1913     const hwaddr offset = section->offset_within_region;
1914     RAMBlock *rb = section->mr->ram_block;
1915     int uffd_fd = (uintptr_t)opaque;
1916 
1917     return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1918                                   false);
1919 }
1920 
1921 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1922 {
1923     assert(rb->flags & RAM_UF_WRITEPROTECT);
1924 
1925     /* See ram_block_populate_read() */
1926     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1927         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1928         MemoryRegionSection section = {
1929             .mr = rb->mr,
1930             .offset_within_region = 0,
1931             .size = rb->mr->size,
1932         };
1933 
1934         return ram_discard_manager_replay_populated(rdm, &section,
1935                                                     uffd_protect_section,
1936                                                     (void *)(uintptr_t)uffd_fd);
1937     }
1938     return uffd_change_protection(uffd_fd, rb->host,
1939                                   rb->used_length, true, false);
1940 }
1941 
1942 /*
1943  * ram_write_tracking_start: start UFFD-WP memory tracking
1944  *
1945  * Returns 0 for success or negative value in case of error
1946  */
1947 int ram_write_tracking_start(void)
1948 {
1949     int uffd_fd;
1950     RAMState *rs = ram_state;
1951     RAMBlock *block;
1952 
1953     /* Open UFFD file descriptor */
1954     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1955     if (uffd_fd < 0) {
1956         return uffd_fd;
1957     }
1958     rs->uffdio_fd = uffd_fd;
1959 
1960     RCU_READ_LOCK_GUARD();
1961 
1962     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1963         /* Nothing to do with read-only and MMIO-writable regions */
1964         if (block->mr->readonly || block->mr->rom_device) {
1965             continue;
1966         }
1967 
1968         /* Register block memory with UFFD to track writes */
1969         if (uffd_register_memory(rs->uffdio_fd, block->host,
1970                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1971             goto fail;
1972         }
1973         block->flags |= RAM_UF_WRITEPROTECT;
1974         memory_region_ref(block->mr);
1975 
1976         /* Apply UFFD write protection to the block memory range */
1977         if (ram_block_uffd_protect(block, uffd_fd)) {
1978             goto fail;
1979         }
1980 
1981         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1982                 block->host, block->max_length);
1983     }
1984 
1985     return 0;
1986 
1987 fail:
1988     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1989 
1990     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1991         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1992             continue;
1993         }
1994         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1995         /* Cleanup flags and remove reference */
1996         block->flags &= ~RAM_UF_WRITEPROTECT;
1997         memory_region_unref(block->mr);
1998     }
1999 
2000     uffd_close_fd(uffd_fd);
2001     rs->uffdio_fd = -1;
2002     return -1;
2003 }
2004 
2005 /**
2006  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
2007  */
2008 void ram_write_tracking_stop(void)
2009 {
2010     RAMState *rs = ram_state;
2011     RAMBlock *block;
2012 
2013     RCU_READ_LOCK_GUARD();
2014 
2015     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2016         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
2017             continue;
2018         }
2019         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
2020 
2021         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
2022                 block->host, block->max_length);
2023 
2024         /* Cleanup flags and remove reference */
2025         block->flags &= ~RAM_UF_WRITEPROTECT;
2026         memory_region_unref(block->mr);
2027     }
2028 
2029     /* Finally close UFFD file descriptor */
2030     uffd_close_fd(rs->uffdio_fd);
2031     rs->uffdio_fd = -1;
2032 }
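
/*
 * Illustrative sketch (not part of the original file): how a background
 * snapshot caller is expected to drive the UFFD-WP write-tracking API
 * defined above.  The real call sites live elsewhere in the migration
 * code; error handling is reduced to a minimum here.
 *
 *     if (!ram_write_tracking_available() ||
 *         !ram_write_tracking_compatible()) {
 *         return -ENOTSUP;            // fall back to normal migration
 *     }
 *     ram_write_tracking_prepare();   // touch pages so PTEs exist
 *     if (ram_write_tracking_start()) {
 *         return -1;                  // protection could not be applied
 *     }
 *     // ... save RAM while write faults are queued by the kernel ...
 *     ram_write_tracking_stop();      // unregister and close the uffd
 */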
2033 
2034 #else
2035 /* No target OS support, stubs just fail or ignore */
2036 
2037 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
2038 {
2039     (void) rs;
2040     (void) offset;
2041 
2042     return NULL;
2043 }
2044 
2045 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
2046         unsigned long start_page)
2047 {
2048     (void) rs;
2049     (void) pss;
2050     (void) start_page;
2051 
2052     return 0;
2053 }
2054 
2055 bool ram_write_tracking_available(void)
2056 {
2057     return false;
2058 }
2059 
2060 bool ram_write_tracking_compatible(void)
2061 {
2062     assert(0);
2063     return false;
2064 }
2065 
2066 int ram_write_tracking_start(void)
2067 {
2068     assert(0);
2069     return -1;
2070 }
2071 
2072 void ram_write_tracking_stop(void)
2073 {
2074     assert(0);
2075 }
2076 #endif /* defined(__linux__) */
2077 
2078 /**
2079  * get_queued_page: unqueue a page from the postcopy requests
2080  *
2081  * Skips pages that are already sent (!dirty)
2082  *
2083  * Returns true if a queued page is found
2084  *
2085  * @rs: current RAM state
2086  * @pss: data about the state of the current dirty page scan
2087  */
2088 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2089 {
2090     RAMBlock  *block;
2091     ram_addr_t offset;
2092     bool dirty;
2093 
2094     do {
2095         block = unqueue_page(rs, &offset);
2096         /*
2097          * We're sending this page, and since it's postcopy nothing else
2098          * will dirty it, and we must make sure it doesn't get sent again
2099          * even if this queue request was received after the background
2100          * search already sent it.
2101          */
2102         if (block) {
2103             unsigned long page;
2104 
2105             page = offset >> TARGET_PAGE_BITS;
2106             dirty = test_bit(page, block->bmap);
2107             if (!dirty) {
2108                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2109                                                 page);
2110             } else {
2111                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2112             }
2113         }
2114 
2115     } while (block && !dirty);
2116 
2117     if (!block) {
2118         /*
2119          * Poll write faults too if background snapshot is enabled; that's
2120          * when vCPUs can get blocked by write-protected pages.
2121          */
2122         block = poll_fault_page(rs, &offset);
2123     }
2124 
2125     if (block) {
2126         /*
2127          * We want the background search to continue from the queued page
2128          * since the guest is likely to want other pages near to the page
2129          * it just requested.
2130          */
2131         pss->block = block;
2132         pss->page = offset >> TARGET_PAGE_BITS;
2133 
2134         /*
2135          * This unqueued page would break the "one round" check, even if it
2136          * is really rare.
2137          */
2138         pss->complete_round = false;
2139     }
2140 
2141     return !!block;
2142 }
2143 
2144 /**
2145  * migration_page_queue_free: drop any remaining pages in the ram
2146  * request queue
2147  *
2148  * It should be empty at the end anyway, but in error cases there may
2149  * be some left.  In case any page is left, we drop it.
2150  *
2151  */
2152 static void migration_page_queue_free(RAMState *rs)
2153 {
2154     struct RAMSrcPageRequest *mspr, *next_mspr;
2155     /* This queue generally should be empty - but in the case of a failed
2156      * migration it might have some droppings in.
2157      */
2158     RCU_READ_LOCK_GUARD();
2159     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2160         memory_region_unref(mspr->rb->mr);
2161         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2162         g_free(mspr);
2163     }
2164 }
2165 
2166 /**
2167  * ram_save_queue_pages: queue the page for transmission
2168  *
2169  * A request from postcopy destination for example.
2170  *
2171  * Returns zero on success or negative on error
2172  *
2173  * @rbname: Name of the RAMBlock of the request. NULL means the
2174  *          same as the last one.
2175  * @start: starting address from the start of the RAMBlock
2176  * @len: length (in bytes) to send
2177  */
2178 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2179 {
2180     RAMBlock *ramblock;
2181     RAMState *rs = ram_state;
2182 
2183     stat64_add(&ram_counters.postcopy_requests, 1);
2184     RCU_READ_LOCK_GUARD();
2185 
2186     if (!rbname) {
2187         /* Reuse last RAMBlock */
2188         ramblock = rs->last_req_rb;
2189 
2190         if (!ramblock) {
2191             /*
2192              * Shouldn't happen, we can't reuse the last RAMBlock if
2193              * it's the 1st request.
2194              */
2195             error_report("ram_save_queue_pages no previous block");
2196             return -1;
2197         }
2198     } else {
2199         ramblock = qemu_ram_block_by_name(rbname);
2200 
2201         if (!ramblock) {
2202             /* We shouldn't be asked for a non-existent RAMBlock */
2203             error_report("ram_save_queue_pages no block '%s'", rbname);
2204             return -1;
2205         }
2206         rs->last_req_rb = ramblock;
2207     }
2208     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2209     if (!offset_in_ramblock(ramblock, start + len - 1)) {
2210         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2211                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2212                      __func__, start, len, ramblock->used_length);
2213         return -1;
2214     }
2215 
2216     /*
2217      * With postcopy preempt, we send back the page directly in the
2218      * rp-return thread.
2219      */
2220     if (postcopy_preempt_active()) {
2221         ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2222         size_t page_size = qemu_ram_pagesize(ramblock);
2223         PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2224         int ret = 0;
2225 
2226         qemu_mutex_lock(&rs->bitmap_mutex);
2227 
2228         pss_init(pss, ramblock, page_start);
2229         /*
2230          * Always use the preempt channel, and make sure it's there.  It's
2231          * safe to access without lock, because when rp-thread is running
2232          * we should be the only one who operates on the qemufile
2233          */
2234         pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
2235         assert(pss->pss_channel);
2236 
2237         /*
2238          * It must be either one or multiple of host page size.  Just
2239      * assert; if something is wrong we're mostly split-brain anyway.
2240          */
2241         assert(len % page_size == 0);
2242         while (len) {
2243             if (ram_save_host_page_urgent(pss)) {
2244                 error_report("%s: ram_save_host_page_urgent() failed: "
2245                              "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2246                              __func__, ramblock->idstr, start);
2247                 ret = -1;
2248                 break;
2249             }
2250             /*
2251              * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2252              * will automatically be moved and point to the next host page
2253              * we're going to send, so no need to update here.
2254              *
2255              * Normally QEMU never sends >1 host page in requests, so
2256              * logically we don't even need that as the loop should only
2257              * run once, but just to be consistent.
2258              */
2259             len -= page_size;
2260         }
2261         qemu_mutex_unlock(&rs->bitmap_mutex);
2262 
2263         return ret;
2264     }
2265 
2266     struct RAMSrcPageRequest *new_entry =
2267         g_new0(struct RAMSrcPageRequest, 1);
2268     new_entry->rb = ramblock;
2269     new_entry->offset = start;
2270     new_entry->len = len;
2271 
2272     memory_region_ref(ramblock->mr);
2273     qemu_mutex_lock(&rs->src_page_req_mutex);
2274     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2275     migration_make_urgent_request();
2276     qemu_mutex_unlock(&rs->src_page_req_mutex);
2277 
2278     return 0;
2279 }
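
/*
 * Illustrative example (not part of the original file): a postcopy page
 * fault on the destination typically ends up here via the return path,
 * e.g. a one-target-page request at offset 0x200000 of a block named
 * "pc.ram":
 *
 *     ram_save_queue_pages("pc.ram", 0x200000, TARGET_PAGE_SIZE);
 *
 * With postcopy preempt active the page is sent immediately on the
 * preempt channel; otherwise it is queued on src_page_requests and
 * picked up later by get_queued_page() in the migration thread.
 */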
2280 
2281 static bool save_page_use_compression(RAMState *rs)
2282 {
2283     if (!migrate_compress()) {
2284         return false;
2285     }
2286 
2287     /*
2288      * If xbzrle is enabled (e.g., after first round of migration), stop
2289      * using the data compression. In theory, xbzrle can do better than
2290      * compression.
2291      */
2292     if (rs->xbzrle_enabled) {
2293         return false;
2294     }
2295 
2296     return true;
2297 }
2298 
2299 /*
2300  * try to compress the page before posting it out, return true if the page
2301  * has been properly handled by compression, otherwise it needs other
2302  * paths to handle it
2303  */
2304 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2305                                RAMBlock *block, ram_addr_t offset)
2306 {
2307     if (!save_page_use_compression(rs)) {
2308         return false;
2309     }
2310 
2311     /*
2312      * When starting the process of a new block, the first page of
2313      * the block should be sent out before other pages in the same
2314      * block, and all the pages in the last block should have been sent
2315      * out.  Keeping this order is important, because the 'cont' flag
2316      * is used to avoid resending the block name.
2317      *
2318      * We post the first page as a normal page as compression will take
2319      * much CPU resource.
2320      */
2321     if (block != pss->last_sent_block) {
2322         flush_compressed_data(rs);
2323         return false;
2324     }
2325 
2326     if (compress_page_with_multi_thread(block, offset) > 0) {
2327         return true;
2328     }
2329 
2330     compression_counters.busy++;
2331     return false;
2332 }
2333 
2334 /**
2335  * ram_save_target_page_legacy: save one target page
2336  *
2337  * Returns the number of pages written
2338  *
2339  * @rs: current RAM state
2340  * @pss: data about the page we want to send
2341  */
2342 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
2343 {
2344     RAMBlock *block = pss->block;
2345     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2346     int res;
2347 
2348     if (control_save_page(pss, block, offset, &res)) {
2349         return res;
2350     }
2351 
2352     if (save_compress_page(rs, pss, block, offset)) {
2353         return 1;
2354     }
2355 
2356     res = save_zero_page(pss, pss->pss_channel, block, offset);
2357     if (res > 0) {
2358         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2359          * page would be stale
2360          */
2361         if (rs->xbzrle_enabled) {
2362             XBZRLE_cache_lock();
2363             xbzrle_cache_zero_page(rs, block->offset + offset);
2364             XBZRLE_cache_unlock();
2365         }
2366         return res;
2367     }
2368 
2369     /*
2370      * Do not use multifd in postcopy as one whole host page should be
2371      * placed.  Meanwhile postcopy requires atomic update of pages, so even
2372      * if host page size == guest page size, the destination guest may still
2373      * see partially copied pages while running, which is data corruption.
2374      */
2375     if (migrate_multifd() && !migration_in_postcopy()) {
2376         return ram_save_multifd_page(pss->pss_channel, block, offset);
2377     }
2378 
2379     return ram_save_page(rs, pss);
2380 }
2381 
2382 /* Should be called before sending a host page */
2383 static void pss_host_page_prepare(PageSearchStatus *pss)
2384 {
2385     /* How many guest pages are there in one host page? */
2386     size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2387 
2388     pss->host_page_sending = true;
2389     if (guest_pfns <= 1) {
2390         /*
2391          * This covers both when guest psize == host psize, or when guest
2392          * has larger psize than the host (guest_pfns==0).
2393          *
2394          * For the latter, we always send one whole guest page per
2395          * iteration of the host page (example: an Alpha VM on x86 host
2396          * will have guest psize 8K while host psize 4K).
2397          */
2398         pss->host_page_start = pss->page;
2399         pss->host_page_end = pss->page + 1;
2400     } else {
2401         /*
2402          * The host page spans over multiple guest pages, we send them
2403          * within the same host page iteration.
2404          */
2405         pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2406         pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2407     }
2408 }
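
/*
 * Worked example (illustrative, not part of the original file): with a
 * 2MiB hugepage backed block and 4KiB target pages, guest_pfns == 512.
 * For pss->page == 1000 the boundaries become:
 *
 *     host_page_start = ROUND_DOWN(1000, 512) = 512
 *     host_page_end   = ROUND_UP(1001, 512)   = 1024
 *
 * i.e. target pages [512, 1024) are sent within one host-page iteration.
 */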
2409 
2410 /*
2411  * Whether the page pointed by PSS is within the host page being sent.
2412  * Must be called after a previous pss_host_page_prepare().
2413  */
2414 static bool pss_within_range(PageSearchStatus *pss)
2415 {
2416     ram_addr_t ram_addr;
2417 
2418     assert(pss->host_page_sending);
2419 
2420     /* Over host-page boundary? */
2421     if (pss->page >= pss->host_page_end) {
2422         return false;
2423     }
2424 
2425     ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2426 
2427     return offset_in_ramblock(pss->block, ram_addr);
2428 }
2429 
2430 static void pss_host_page_finish(PageSearchStatus *pss)
2431 {
2432     pss->host_page_sending = false;
2433     /* This is not needed, but just to reset it */
2434     pss->host_page_start = pss->host_page_end = 0;
2435 }
2436 
2437 /*
2438  * Send an urgent host page specified by `pss'.  Needs to be called with
2439  * bitmap_mutex held.
2440  *
2441  * Returns 0 if saving the host page succeeded, negative otherwise.
2442  */
2443 static int ram_save_host_page_urgent(PageSearchStatus *pss)
2444 {
2445     bool page_dirty, sent = false;
2446     RAMState *rs = ram_state;
2447     int ret = 0;
2448 
2449     trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2450     pss_host_page_prepare(pss);
2451 
2452     /*
2453      * If precopy is sending the same page, let it be done in precopy, or
2454      * we could send the same page in two channels and neither of them will
2455      * receive the whole page.
2456      */
2457     if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2458         trace_postcopy_preempt_hit(pss->block->idstr,
2459                                    pss->page << TARGET_PAGE_BITS);
2460         return 0;
2461     }
2462 
2463     do {
2464         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2465 
2466         if (page_dirty) {
2467             /* Be strict to return code; it must be 1, or what else? */
2468             if (migration_ops->ram_save_target_page(rs, pss) != 1) {
2469                 error_report_once("%s: ram_save_target_page failed", __func__);
2470                 ret = -1;
2471                 goto out;
2472             }
2473             sent = true;
2474         }
2475         pss_find_next_dirty(pss);
2476     } while (pss_within_range(pss));
2477 out:
2478     pss_host_page_finish(pss);
2479     /* For urgent requests, flush immediately if sent */
2480     if (sent) {
2481         qemu_fflush(pss->pss_channel);
2482     }
2483     return ret;
2484 }
2485 
2486 /**
2487  * ram_save_host_page: save a whole host page
2488  *
2489  * Starting at *offset send pages up to the end of the current host
2490  * page. It's valid for the initial offset to point into the middle of
2491  * a host page in which case the remainder of the hostpage is sent.
2492  * Only dirty target pages are sent. Note that the host page size may
2493  * be a huge page for this block.
2494  *
2495  * The saving stops at the boundary of the used_length of the block
2496  * if the RAMBlock isn't a multiple of the host page size.
2497  *
2498  * The caller must hold ram_state.bitmap_mutex when calling this
2499  * function.  Note that this function can temporarily release the lock, but
2500  * when the function returns it'll make sure the lock is still held.
2501  *
2502  * Returns the number of pages written or negative on error
2503  *
2504  * @rs: current RAM state
2505  * @pss: data about the page we want to send
2506  */
2507 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2508 {
2509     bool page_dirty, preempt_active = postcopy_preempt_active();
2510     int tmppages, pages = 0;
2511     size_t pagesize_bits =
2512         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2513     unsigned long start_page = pss->page;
2514     int res;
2515 
2516     if (ramblock_is_ignored(pss->block)) {
2517         error_report("block %s should not be migrated !", pss->block->idstr);
2518         return 0;
2519     }
2520 
2521     /* Update host page boundary information */
2522     pss_host_page_prepare(pss);
2523 
2524     do {
2525         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2526 
2527         /* Check if the page is dirty and, if so, send it */
2528         if (page_dirty) {
2529             /*
2530              * Properly yield the lock only in postcopy preempt mode
2531              * because both migration thread and rp-return thread can
2532              * operate on the bitmaps.
2533              */
2534             if (preempt_active) {
2535                 qemu_mutex_unlock(&rs->bitmap_mutex);
2536             }
2537             tmppages = migration_ops->ram_save_target_page(rs, pss);
2538             if (tmppages >= 0) {
2539                 pages += tmppages;
2540                 /*
2541                  * Allow rate limiting to happen in the middle of huge pages if
2542                  * something is sent in the current iteration.
2543                  */
2544                 if (pagesize_bits > 1 && tmppages > 0) {
2545                     migration_rate_limit();
2546                 }
2547             }
2548             if (preempt_active) {
2549                 qemu_mutex_lock(&rs->bitmap_mutex);
2550             }
2551         } else {
2552             tmppages = 0;
2553         }
2554 
2555         if (tmppages < 0) {
2556             pss_host_page_finish(pss);
2557             return tmppages;
2558         }
2559 
2560         pss_find_next_dirty(pss);
2561     } while (pss_within_range(pss));
2562 
2563     pss_host_page_finish(pss);
2564 
2565     res = ram_save_release_protection(rs, pss, start_page);
2566     return (res < 0 ? res : pages);
2567 }
2568 
2569 /**
2570  * ram_find_and_save_block: finds a dirty page and sends it to f
2571  *
2572  * Called within an RCU critical section.
2573  *
2574  * Returns the number of pages written where zero means no dirty pages,
2575  * or negative on error
2576  *
2577  * @rs: current RAM state
2578  *
2579  * On systems where host-page-size > target-page-size it will send all the
2580  * pages in a host page that are dirty.
2581  */
2582 static int ram_find_and_save_block(RAMState *rs)
2583 {
2584     PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
2585     int pages = 0;
2586 
2587     /* No dirty page as there is zero RAM */
2588     if (!rs->ram_bytes_total) {
2589         return pages;
2590     }
2591 
2592     /*
2593      * Always keep last_seen_block/last_page valid during this procedure,
2594      * because find_dirty_block() relies on these values (e.g., we compare
2595      * last_seen_block with pss.block to see whether we searched all the
2596      * ramblocks) to detect the completion of migration.  Having NULL value
2597      * of last_seen_block can conditionally cause the loop below to run forever.
2598      */
2599     if (!rs->last_seen_block) {
2600         rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2601         rs->last_page = 0;
2602     }
2603 
2604     pss_init(pss, rs->last_seen_block, rs->last_page);
2605 
2606     while (true) {
2607         if (!get_queued_page(rs, pss)) {
2608             /* priority queue empty, so just search for something dirty */
2609             int res = find_dirty_block(rs, pss);
2610             if (res != PAGE_DIRTY_FOUND) {
2611                 if (res == PAGE_ALL_CLEAN) {
2612                     break;
2613                 } else if (res == PAGE_TRY_AGAIN) {
2614                     continue;
2615                 } else if (res < 0) {
2616                     pages = res;
2617                     break;
2618                 }
2619             }
2620         }
2621         pages = ram_save_host_page(rs, pss);
2622         if (pages) {
2623             break;
2624         }
2625     }
2626 
2627     rs->last_seen_block = pss->block;
2628     rs->last_page = pss->page;
2629 
2630     return pages;
2631 }
2632 
2633 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2634 {
2635     uint64_t pages = size / TARGET_PAGE_SIZE;
2636 
2637     if (zero) {
2638         stat64_add(&ram_counters.zero_pages, pages);
2639     } else {
2640         stat64_add(&ram_counters.normal_pages, pages);
2641         ram_transferred_add(size);
2642         qemu_file_credit_transfer(f, size);
2643     }
2644 }
2645 
2646 static uint64_t ram_bytes_total_with_ignored(void)
2647 {
2648     RAMBlock *block;
2649     uint64_t total = 0;
2650 
2651     RCU_READ_LOCK_GUARD();
2652 
2653     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2654         total += block->used_length;
2655     }
2656     return total;
2657 }
2658 
2659 uint64_t ram_bytes_total(void)
2660 {
2661     RAMBlock *block;
2662     uint64_t total = 0;
2663 
2664     RCU_READ_LOCK_GUARD();
2665 
2666     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2667         total += block->used_length;
2668     }
2669     return total;
2670 }
2671 
2672 static void xbzrle_load_setup(void)
2673 {
2674     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2675 }
2676 
2677 static void xbzrle_load_cleanup(void)
2678 {
2679     g_free(XBZRLE.decoded_buf);
2680     XBZRLE.decoded_buf = NULL;
2681 }
2682 
2683 static void ram_state_cleanup(RAMState **rsp)
2684 {
2685     if (*rsp) {
2686         migration_page_queue_free(*rsp);
2687         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2688         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2689         g_free(*rsp);
2690         *rsp = NULL;
2691     }
2692 }
2693 
2694 static void xbzrle_cleanup(void)
2695 {
2696     XBZRLE_cache_lock();
2697     if (XBZRLE.cache) {
2698         cache_fini(XBZRLE.cache);
2699         g_free(XBZRLE.encoded_buf);
2700         g_free(XBZRLE.current_buf);
2701         g_free(XBZRLE.zero_target_page);
2702         XBZRLE.cache = NULL;
2703         XBZRLE.encoded_buf = NULL;
2704         XBZRLE.current_buf = NULL;
2705         XBZRLE.zero_target_page = NULL;
2706     }
2707     XBZRLE_cache_unlock();
2708 }
2709 
2710 static void ram_save_cleanup(void *opaque)
2711 {
2712     RAMState **rsp = opaque;
2713     RAMBlock *block;
2714 
2715     /* We don't use dirty log with background snapshots */
2716     if (!migrate_background_snapshot()) {
2717         /* the caller has to hold the iothread lock or be in a bh, so there is
2718          * no writing race against the migration bitmap
2719          */
2720         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2721             /*
2722              * do not stop dirty log without starting it, since
2723              * memory_global_dirty_log_stop will assert that
2724              * memory_global_dirty_log_start/stop are used in pairs
2725              */
2726             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2727         }
2728     }
2729 
2730     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2731         g_free(block->clear_bmap);
2732         block->clear_bmap = NULL;
2733         g_free(block->bmap);
2734         block->bmap = NULL;
2735     }
2736 
2737     xbzrle_cleanup();
2738     compress_threads_save_cleanup();
2739     ram_state_cleanup(rsp);
2740     g_free(migration_ops);
2741     migration_ops = NULL;
2742 }
2743 
2744 static void ram_state_reset(RAMState *rs)
2745 {
2746     int i;
2747 
2748     for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2749         rs->pss[i].last_sent_block = NULL;
2750     }
2751 
2752     rs->last_seen_block = NULL;
2753     rs->last_page = 0;
2754     rs->last_version = ram_list.version;
2755     rs->xbzrle_enabled = false;
2756 }
2757 
2758 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2759 
2760 /* **** functions for postcopy ***** */
2761 
2762 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2763 {
2764     struct RAMBlock *block;
2765 
2766     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2767         unsigned long *bitmap = block->bmap;
2768         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2769         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2770 
2771         while (run_start < range) {
2772             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2773             ram_discard_range(block->idstr,
2774                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2775                               ((ram_addr_t)(run_end - run_start))
2776                                 << TARGET_PAGE_BITS);
2777             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2778         }
2779     }
2780 }
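
/*
 * Worked example (illustrative, not part of the original file): for a
 * bitmap 1 1 0 0 1 0 (1 = still dirty / not yet sent), the zero runs are
 * pages [2, 4) and [5, 6), so ram_discard_range() is called twice to drop
 * the already-migrated pages 2-3 and 5 from the source.
 */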
2781 
2782 /**
2783  * postcopy_send_discard_bm_ram: discard a RAMBlock
2784  *
2785  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2786  *
2787  * @ms: current migration state
2788  * @block: RAMBlock to discard
2789  */
2790 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2791 {
2792     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2793     unsigned long current;
2794     unsigned long *bitmap = block->bmap;
2795 
2796     for (current = 0; current < end; ) {
2797         unsigned long one = find_next_bit(bitmap, end, current);
2798         unsigned long zero, discard_length;
2799 
2800         if (one >= end) {
2801             break;
2802         }
2803 
2804         zero = find_next_zero_bit(bitmap, end, one + 1);
2805 
2806         if (zero >= end) {
2807             discard_length = end - one;
2808         } else {
2809             discard_length = zero - one;
2810         }
2811         postcopy_discard_send_range(ms, one, discard_length);
2812         current = one + discard_length;
2813     }
2814 }
2815 
2816 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2817 
2818 /**
2819  * postcopy_each_ram_send_discard: discard all RAMBlocks
2820  *
2821  * Utility for the outgoing postcopy code.
2822  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2823  *   passing it bitmap indexes and name.
2824  * (qemu_ram_foreach_block ends up passing unscaled lengths
2825  *  which would mean postcopy code would have to deal with target page)
2826  *
2827  * @ms: current migration state
2828  */
2829 static void postcopy_each_ram_send_discard(MigrationState *ms)
2830 {
2831     struct RAMBlock *block;
2832 
2833     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2834         postcopy_discard_send_init(ms, block->idstr);
2835 
2836         /*
2837          * Deal with TPS != HPS and huge pages.  It discards any partially sent
2838          * host-page size chunks and marks any partially dirty host-page size
2839          * chunks as all dirty.  In this case the host-page is the host-page
2840          * for the particular RAMBlock, i.e. it might be a huge page.
2841          */
2842         postcopy_chunk_hostpages_pass(ms, block);
2843 
2844         /*
2845          * Postcopy sends chunks of bitmap over the wire, but it
2846          * just needs indexes at this point, avoids it having
2847          * target page specific code.
2848          */
2849         postcopy_send_discard_bm_ram(ms, block);
2850         postcopy_discard_send_finish(ms);
2851     }
2852 }
2853 
2854 /**
2855  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2856  *
2857  * Helper for postcopy_each_ram_send_discard; it's called once per
2858  * RAMBlock to canonicalize its dirty bitmap.
2860  *
2861  * Postcopy requires that all target pages in a hostpage are dirty or
2862  * clean, not a mix.  This function canonicalizes the bitmaps.
2863  *
2864  * @ms: current migration state
2865  * @block: block that contains the page we want to canonicalize
2866  */
2867 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2868 {
2869     RAMState *rs = ram_state;
2870     unsigned long *bitmap = block->bmap;
2871     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2872     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2873     unsigned long run_start;
2874 
2875     if (block->page_size == TARGET_PAGE_SIZE) {
2876         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2877         return;
2878     }
2879 
2880     /* Find a dirty page */
2881     run_start = find_next_bit(bitmap, pages, 0);
2882 
2883     while (run_start < pages) {
2884 
2885         /*
2886          * If the start of this run of pages is in the middle of a host
2887          * page, then we need to fixup this host page.
2888          */
2889         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2890             /* Find the end of this run */
2891             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2892             /*
2893              * If the end isn't at the start of a host page, then the
2894              * run doesn't finish at the end of a host page
2895              * and we need to discard.
2896              */
2897         }
2898 
2899         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2900             unsigned long page;
2901             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2902                                                              host_ratio);
2903             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2904 
2905             /* Clean up the bitmap */
2906             for (page = fixup_start_addr;
2907                  page < fixup_start_addr + host_ratio; page++) {
2908                 /*
2909                  * Remark them as dirty, updating the count for any pages
2910                  * that weren't previously dirty.
2911                  */
2912                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2913             }
2914         }
2915 
2916         /* Find the next dirty page for the next iteration */
2917         run_start = find_next_bit(bitmap, pages, run_start);
2918     }
2919 }
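
/*
 * Worked example (illustrative, not part of the original file): with
 * host_ratio == 4 (e.g. 16KiB host pages, 4KiB target pages) and a dirty
 * run starting at target page 6, the run is not host-page aligned, so the
 * whole host page covering target pages 4-7 is re-marked dirty and will
 * be handled as one unit.
 */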
2920 
2921 /**
2922  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2923  *
2924  * Transmit the set of pages to be discarded after precopy to the target;
2925  * these are pages that:
2926  *     a) Have been previously transmitted but are now dirty again
2927  *     b) Pages that have never been transmitted, this ensures that
2928  *        any pages on the destination that have been mapped by background
2929  *        tasks get discarded (transparent huge pages are the specific concern)
2930  * Hopefully this is pretty sparse
2931  *
2932  * @ms: current migration state
2933  */
2934 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2935 {
2936     RAMState *rs = ram_state;
2937 
2938     RCU_READ_LOCK_GUARD();
2939 
2940     /* This should be our last sync, the src is now paused */
2941     migration_bitmap_sync(rs);
2942 
2943     /* Easiest way to make sure we don't resume in the middle of a host-page */
2944     rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
2945     rs->last_seen_block = NULL;
2946     rs->last_page = 0;
2947 
2948     postcopy_each_ram_send_discard(ms);
2949 
2950     trace_ram_postcopy_send_discard_bitmap();
2951 }
2952 
2953 /**
2954  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2955  *
2956  * Returns zero on success
2957  *
2958  * @rbname: name of the RAMBlock of the request. NULL means the
2959  *          same as the last one.
2960  * @start: starting offset (in bytes) within the RAMBlock
2961  * @length: length (in bytes) of the range to discard
2962  */
2963 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2964 {
2965     trace_ram_discard_range(rbname, start, length);
2966 
2967     RCU_READ_LOCK_GUARD();
2968     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2969 
2970     if (!rb) {
2971         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2972         return -1;
2973     }
2974 
2975     /*
2976      * On source VM, we don't need to update the received bitmap since
2977      * we don't even have one.
2978      */
2979     if (rb->receivedmap) {
2980         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2981                      length >> qemu_target_page_bits());
2982     }
2983 
2984     return ram_block_discard_range(rb, start, length);
2985 }
2986 
2987 /*
2988  * For every allocation, we will try not to crash the VM if the
2989  * allocation fails.
2990  */
2991 static int xbzrle_init(void)
2992 {
2993     Error *local_err = NULL;
2994 
2995     if (!migrate_xbzrle()) {
2996         return 0;
2997     }
2998 
2999     XBZRLE_cache_lock();
3000 
3001     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3002     if (!XBZRLE.zero_target_page) {
3003         error_report("%s: Error allocating zero page", __func__);
3004         goto err_out;
3005     }
3006 
3007     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3008                               TARGET_PAGE_SIZE, &local_err);
3009     if (!XBZRLE.cache) {
3010         error_report_err(local_err);
3011         goto free_zero_page;
3012     }
3013 
3014     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3015     if (!XBZRLE.encoded_buf) {
3016         error_report("%s: Error allocating encoded_buf", __func__);
3017         goto free_cache;
3018     }
3019 
3020     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3021     if (!XBZRLE.current_buf) {
3022         error_report("%s: Error allocating current_buf", __func__);
3023         goto free_encoded_buf;
3024     }
3025 
3026     /* We are all good */
3027     XBZRLE_cache_unlock();
3028     return 0;
3029 
3030 free_encoded_buf:
3031     g_free(XBZRLE.encoded_buf);
3032     XBZRLE.encoded_buf = NULL;
3033 free_cache:
3034     cache_fini(XBZRLE.cache);
3035     XBZRLE.cache = NULL;
3036 free_zero_page:
3037     g_free(XBZRLE.zero_target_page);
3038     XBZRLE.zero_target_page = NULL;
3039 err_out:
3040     XBZRLE_cache_unlock();
3041     return -ENOMEM;
3042 }
3043 
3044 static int ram_state_init(RAMState **rsp)
3045 {
3046     *rsp = g_try_new0(RAMState, 1);
3047 
3048     if (!*rsp) {
3049         error_report("%s: Init ramstate fail", __func__);
3050         return -1;
3051     }
3052 
3053     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3054     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3055     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3056     (*rsp)->ram_bytes_total = ram_bytes_total();
3057 
3058     /*
3059      * Count the total number of pages used by ram blocks not including any
3060      * gaps due to alignment or unplugs.
3061      * This must match with the initial values of dirty bitmap.
3062      */
3063     (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
3064     ram_state_reset(*rsp);
3065 
3066     return 0;
3067 }
3068 
3069 static void ram_list_init_bitmaps(void)
3070 {
3071     MigrationState *ms = migrate_get_current();
3072     RAMBlock *block;
3073     unsigned long pages;
3074     uint8_t shift;
3075 
3076     /* Skip setting bitmap if there is no RAM */
3077     if (ram_bytes_total()) {
3078         shift = ms->clear_bitmap_shift;
3079         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3080             error_report("clear_bitmap_shift (%u) too big, using "
3081                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3082             shift = CLEAR_BITMAP_SHIFT_MAX;
3083         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3084             error_report("clear_bitmap_shift (%u) too small, using "
3085                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3086             shift = CLEAR_BITMAP_SHIFT_MIN;
3087         }
3088 
3089         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3090             pages = block->max_length >> TARGET_PAGE_BITS;
3091             /*
3092              * The initial dirty bitmap for migration must be set with all
3093              * ones to make sure we'll migrate every guest RAM page to
3094              * the destination.
3095              * Here we set RAMBlock.bmap all to 1 because when restarting a
3096              * new migration after a failed one,
3097              * ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't cover
3098              * the whole guest memory.
3099              */
3100             block->bmap = bitmap_new(pages);
3101             bitmap_set(block->bmap, 0, pages);
3102             block->clear_bmap_shift = shift;
3103             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3104         }
3105     }
3106 }
3107 
3108 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3109 {
3110     unsigned long pages;
3111     RAMBlock *rb;
3112 
3113     RCU_READ_LOCK_GUARD();
3114 
3115     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3116         pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3117         rs->migration_dirty_pages -= pages;
3118     }
3119 }
3120 
3121 static void ram_init_bitmaps(RAMState *rs)
3122 {
3123     /* For memory_global_dirty_log_start below.  */
3124     qemu_mutex_lock_iothread();
3125     qemu_mutex_lock_ramlist();
3126 
3127     WITH_RCU_READ_LOCK_GUARD() {
3128         ram_list_init_bitmaps();
3129         /* We don't use dirty log with background snapshots */
3130         if (!migrate_background_snapshot()) {
3131             memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3132             migration_bitmap_sync_precopy(rs);
3133         }
3134     }
3135     qemu_mutex_unlock_ramlist();
3136     qemu_mutex_unlock_iothread();
3137 
3138     /*
3139      * After an eventual first bitmap sync, fixup the initial bitmap
3140      * containing all 1s to exclude any discarded pages from migration.
3141      */
3142     migration_bitmap_clear_discarded_pages(rs);
3143 }
3144 
3145 static int ram_init_all(RAMState **rsp)
3146 {
3147     if (ram_state_init(rsp)) {
3148         return -1;
3149     }
3150 
3151     if (xbzrle_init()) {
3152         ram_state_cleanup(rsp);
3153         return -1;
3154     }
3155 
3156     ram_init_bitmaps(*rsp);
3157 
3158     return 0;
3159 }
3160 
3161 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3162 {
3163     RAMBlock *block;
3164     uint64_t pages = 0;
3165 
3166     /*
3167      * Postcopy is not using xbzrle/compression, so no need for that.
3168      * Also, since the source is already halted, we don't need to care
3169      * about dirty page logging either.
3170      */
3171 
3172     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3173         pages += bitmap_count_one(block->bmap,
3174                                   block->used_length >> TARGET_PAGE_BITS);
3175     }
3176 
3177     /* This may not be aligned with current bitmaps. Recalculate. */
3178     rs->migration_dirty_pages = pages;
3179 
3180     ram_state_reset(rs);
3181 
3182     /* Update RAMState cache of output QEMUFile */
3183     rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
3184 
3185     trace_ram_state_resume_prepare(pages);
3186 }
3187 
3188 /*
3189  * This function clears bits of the free pages reported by the caller from the
3190  * migration dirty bitmap. @addr is the host address corresponding to the
3191  * start of the continuous guest free pages, and @len is the total bytes of
3192  * those pages.
3193  */
3194 void qemu_guest_free_page_hint(void *addr, size_t len)
3195 {
3196     RAMBlock *block;
3197     ram_addr_t offset;
3198     size_t used_len, start, npages;
3199     MigrationState *s = migrate_get_current();
3200 
3201     /* This function is currently expected to be used during live migration */
3202     if (!migration_is_setup_or_active(s->state)) {
3203         return;
3204     }
3205 
3206     for (; len > 0; len -= used_len, addr += used_len) {
3207         block = qemu_ram_block_from_host(addr, false, &offset);
3208         if (unlikely(!block || offset >= block->used_length)) {
3209             /*
3210              * The implementation might not support RAMBlock resize during
3211              * live migration, but it could happen in theory with future
3212              * updates. So we add a check here to capture that case.
3213              */
3214             error_report_once("%s unexpected error", __func__);
3215             return;
3216         }
3217 
3218         if (len <= block->used_length - offset) {
3219             used_len = len;
3220         } else {
3221             used_len = block->used_length - offset;
3222         }
3223 
3224         start = offset >> TARGET_PAGE_BITS;
3225         npages = used_len >> TARGET_PAGE_BITS;
3226 
3227         qemu_mutex_lock(&ram_state->bitmap_mutex);
3228         /*
3229          * The skipped free pages are equivalent to having been sent from
3230          * clear_bmap's perspective, so clear the bits from the memory region
3231          * bitmap which are initially set. Otherwise those skipped pages will
3232          * be sent in the next round after syncing from the memory region bitmap.
3233          */
3234         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
3235         ram_state->migration_dirty_pages -=
3236                       bitmap_count_one_with_offset(block->bmap, start, npages);
3237         bitmap_clear(block->bmap, start, npages);
3238         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3239     }
3240 }
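/*
 * Illustrative sketch only (not part of the QEMU code): a hypothetical
 * free-page reporting backend that has already translated a guest report
 * into a host virtual address range could forward it to the hint above
 * roughly like this.  The example_* names are made up, and the range is
 * assumed to be TARGET_PAGE_SIZE aligned.
 */
#if 0
static void example_forward_free_page_report(void *report_hva,
                                             size_t report_len)
{
    qemu_guest_free_page_hint(report_hva, report_len);
}
#endif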
3241 
3242 /*
3243  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
3244  * long-running RCU critical section.  When RCU reclaims in the code
3245  * start to become numerous it will be necessary to reduce the
3246  * granularity of these critical sections.
3247  */
3248 
3249 /**
3250  * ram_save_setup: Setup RAM for migration
3251  *
3252  * Returns zero to indicate success and negative for error
3253  *
3254  * @f: QEMUFile where to send the data
3255  * @opaque: RAMState pointer
3256  */
3257 static int ram_save_setup(QEMUFile *f, void *opaque)
3258 {
3259     RAMState **rsp = opaque;
3260     RAMBlock *block;
3261     int ret;
3262 
3263     if (compress_threads_save_setup()) {
3264         return -1;
3265     }
3266 
3267     /* migration has already set up the bitmap, reuse it. */
3268     if (!migration_in_colo_state()) {
3269         if (ram_init_all(rsp) != 0) {
3270             compress_threads_save_cleanup();
3271             return -1;
3272         }
3273     }
3274     (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
3275 
3276     WITH_RCU_READ_LOCK_GUARD() {
3277         qemu_put_be64(f, ram_bytes_total_with_ignored()
3278                          | RAM_SAVE_FLAG_MEM_SIZE);
3279 
3280         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3281             qemu_put_byte(f, strlen(block->idstr));
3282             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3283             qemu_put_be64(f, block->used_length);
3284             if (migrate_postcopy_ram() && block->page_size !=
3285                                           qemu_host_page_size) {
3286                 qemu_put_be64(f, block->page_size);
3287             }
3288             if (migrate_ignore_shared()) {
3289                 qemu_put_be64(f, block->mr->addr);
3290             }
3291         }
3292     }
3293 
3294     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3295     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3296 
3297     migration_ops = g_malloc0(sizeof(MigrationOps));
3298     migration_ops->ram_save_target_page = ram_save_target_page_legacy;
3299     ret = multifd_send_sync_main(f);
3300     if (ret < 0) {
3301         return ret;
3302     }
3303 
3304     if (!migrate_multifd_flush_after_each_section()) {
3305         qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3306     }
3307 
3308     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3309     qemu_fflush(f);
3310 
3311     return 0;
3312 }
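/*
 * For reference, the stream section emitted by ram_save_setup() above
 * looks roughly like this (all integers are big-endian):
 *
 *   be64: ram_bytes_total_with_ignored() | RAM_SAVE_FLAG_MEM_SIZE
 *   for each migratable RAMBlock:
 *       u8:   strlen(idstr)
 *       ...   idstr bytes (no trailing NUL)
 *       be64: used_length
 *       be64: page_size    (only with postcopy-ram and a page size
 *                           different from the host page size)
 *       be64: mr->addr     (only with ignore-shared)
 *   be64: RAM_SAVE_FLAG_MULTIFD_FLUSH  (only when not flushing after
 *                                       each section)
 *   be64: RAM_SAVE_FLAG_EOS
 *
 * The destination parses this in the RAM_SAVE_FLAG_MEM_SIZE case of
 * ram_load_precopy() below.
 */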
3313 
3314 /**
3315  * ram_save_iterate: iterative stage for migration
3316  *
3317  * Returns zero to indicate success and negative for error
3318  *
3319  * @f: QEMUFile where to send the data
3320  * @opaque: RAMState pointer
3321  */
3322 static int ram_save_iterate(QEMUFile *f, void *opaque)
3323 {
3324     RAMState **temp = opaque;
3325     RAMState *rs = *temp;
3326     int ret = 0;
3327     int i;
3328     int64_t t0;
3329     int done = 0;
3330 
3331     if (blk_mig_bulk_active()) {
3332         /* Avoid transferring ram during bulk phase of block migration as
3333          * the bulk phase will usually take a long time and transferring
3334          * ram updates during that time is pointless. */
3335         goto out;
3336     }
3337 
3338     /*
3339      * We'll hold this lock for a while, but that's okay for two reasons.
3340      * Firstly, the only other thread that can take it is the one calling
3341      * qemu_guest_free_page_hint(), which should be rare; secondly, see
3342      * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3343      * guarantees that we release it on a regular basis.
3344      */
3345     qemu_mutex_lock(&rs->bitmap_mutex);
3346     WITH_RCU_READ_LOCK_GUARD() {
3347         if (ram_list.version != rs->last_version) {
3348             ram_state_reset(rs);
3349         }
3350 
3351         /* Read version before ram_list.blocks */
3352         smp_rmb();
3353 
3354         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3355 
3356         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3357         i = 0;
3358         while ((ret = qemu_file_rate_limit(f)) == 0 ||
3359                postcopy_has_request(rs)) {
3360             int pages;
3361 
3362             if (qemu_file_get_error(f)) {
3363                 break;
3364             }
3365 
3366             pages = ram_find_and_save_block(rs);
3367             /* no more pages to send */
3368             if (pages == 0) {
3369                 done = 1;
3370                 break;
3371             }
3372 
3373             if (pages < 0) {
3374                 qemu_file_set_error(f, pages);
3375                 break;
3376             }
3377 
3378             rs->target_page_count += pages;
3379 
3380             /*
3381              * During postcopy, it is necessary to make sure one whole host
3382              * page is sent in one chunk.
3383              */
3384             if (migrate_postcopy_ram()) {
3385                 flush_compressed_data(rs);
3386             }
3387 
3388             /*
3389              * We want to check in the 1st loop, just in case it was the 1st
3390              * time and we had to sync the dirty bitmap.
3391              * qemu_clock_get_ns() is a bit expensive, so we only check once
3392              * every few iterations.
3393              */
3394             if ((i & 63) == 0) {
3395                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3396                               1000000;
3397                 if (t1 > MAX_WAIT) {
3398                     trace_ram_save_iterate_big_wait(t1, i);
3399                     break;
3400                 }
3401             }
3402             i++;
3403         }
3404     }
3405     qemu_mutex_unlock(&rs->bitmap_mutex);
3406 
3407     /*
3408      * Must occur before EOS (or any QEMUFile operation)
3409      * because of RDMA protocol.
3410      */
3411     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3412 
3413 out:
3414     if (ret >= 0
3415         && migration_is_setup_or_active(migrate_get_current()->state)) {
3416         if (migrate_multifd_flush_after_each_section()) {
3417             ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3418             if (ret < 0) {
3419                 return ret;
3420             }
3421         }
3422 
3423         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3424         qemu_fflush(f);
3425         ram_transferred_add(8);
3426 
3427         ret = qemu_file_get_error(f);
3428     }
3429     if (ret < 0) {
3430         return ret;
3431     }
3432 
3433     return done;
3434 }
3435 
3436 /**
3437  * ram_save_complete: function called to send the remaining amount of ram
3438  *
3439  * Returns zero to indicate success or negative on error
3440  *
3441  * Called with iothread lock
3442  *
3443  * @f: QEMUFile where to send the data
3444  * @opaque: RAMState pointer
3445  */
3446 static int ram_save_complete(QEMUFile *f, void *opaque)
3447 {
3448     RAMState **temp = opaque;
3449     RAMState *rs = *temp;
3450     int ret = 0;
3451 
3452     rs->last_stage = !migration_in_colo_state();
3453 
3454     WITH_RCU_READ_LOCK_GUARD() {
3455         if (!migration_in_postcopy()) {
3456             migration_bitmap_sync_precopy(rs);
3457         }
3458 
3459         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3460 
3461         /* try transferring iterative blocks of memory */
3462 
3463         /* flush all remaining blocks regardless of rate limiting */
3464         qemu_mutex_lock(&rs->bitmap_mutex);
3465         while (true) {
3466             int pages;
3467 
3468             pages = ram_find_and_save_block(rs);
3469             /* no more blocks to send */
3470             if (pages == 0) {
3471                 break;
3472             }
3473             if (pages < 0) {
3474                 ret = pages;
3475                 break;
3476             }
3477         }
3478         qemu_mutex_unlock(&rs->bitmap_mutex);
3479 
3480         flush_compressed_data(rs);
3481         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3482     }
3483 
3484     if (ret < 0) {
3485         return ret;
3486     }
3487 
3488     ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3489     if (ret < 0) {
3490         return ret;
3491     }
3492 
3493     if (!migrate_multifd_flush_after_each_section()) {
3494         qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3495     }
3496     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3497     qemu_fflush(f);
3498 
3499     return 0;
3500 }
3501 
3502 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3503                                        uint64_t *can_postcopy)
3504 {
3505     RAMState **temp = opaque;
3506     RAMState *rs = *temp;
3507 
3508     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3509 
3510     if (migrate_postcopy_ram()) {
3511         /* We can do postcopy, and all the data is postcopiable */
3512         *can_postcopy += remaining_size;
3513     } else {
3514         *must_precopy += remaining_size;
3515     }
3516 }
3517 
3518 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3519                                     uint64_t *can_postcopy)
3520 {
3521     MigrationState *s = migrate_get_current();
3522     RAMState **temp = opaque;
3523     RAMState *rs = *temp;
3524 
3525     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3526 
3527     if (!migration_in_postcopy() && remaining_size < s->threshold_size) {
3528         qemu_mutex_lock_iothread();
3529         WITH_RCU_READ_LOCK_GUARD() {
3530             migration_bitmap_sync_precopy(rs);
3531         }
3532         qemu_mutex_unlock_iothread();
3533         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3534     }
3535 
3536     if (migrate_postcopy_ram()) {
3537         /* We can do postcopy, and all the data is postcopiable */
3538         *can_postcopy += remaining_size;
3539     } else {
3540         *must_precopy += remaining_size;
3541     }
3542 }
3543 
3544 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3545 {
3546     unsigned int xh_len;
3547     int xh_flags;
3548     uint8_t *loaded_data;
3549 
3550     /* extract RLE header */
3551     xh_flags = qemu_get_byte(f);
3552     xh_len = qemu_get_be16(f);
3553 
3554     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3555         error_report("Failed to load XBZRLE page - wrong compression!");
3556         return -1;
3557     }
3558 
3559     if (xh_len > TARGET_PAGE_SIZE) {
3560         error_report("Failed to load XBZRLE page - len overflow!");
3561         return -1;
3562     }
3563     loaded_data = XBZRLE.decoded_buf;
3564     /* load data and decode */
3565     /* it can change loaded_data to point to an internal buffer */
3566     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3567 
3568     /* decode RLE */
3569     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3570                              TARGET_PAGE_SIZE) == -1) {
3571         error_report("Failed to load XBZRLE page - decode error!");
3572         return -1;
3573     }
3574 
3575     return 0;
3576 }
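/*
 * For reference, the XBZRLE page payload parsed above is laid out as:
 *
 *   u8:   xh_flags   (must be ENCODING_FLAG_XBZRLE)
 *   be16: xh_len     (encoded length, at most TARGET_PAGE_SIZE)
 *   ...   xh_len bytes of encoded data, applied by xbzrle_decode_buffer()
 *         as a delta on top of the current contents of @host
 */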
3577 
3578 /**
3579  * ram_block_from_stream: read a RAMBlock id from the migration stream
3580  *
3581  * Must be called from within a rcu critical section.
3582  *
3583  * Returns a pointer from within the RCU-protected ram_list.
3584  *
3585  * @mis: the migration incoming state pointer
3586  * @f: QEMUFile where to read the data from
3587  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3588  * @channel: the channel we're using
3589  */
3590 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3591                                               QEMUFile *f, int flags,
3592                                               int channel)
3593 {
3594     RAMBlock *block = mis->last_recv_block[channel];
3595     char id[256];
3596     uint8_t len;
3597 
3598     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3599         if (!block) {
3600             error_report("Ack, bad migration stream!");
3601             return NULL;
3602         }
3603         return block;
3604     }
3605 
3606     len = qemu_get_byte(f);
3607     qemu_get_buffer(f, (uint8_t *)id, len);
3608     id[len] = 0;
3609 
3610     block = qemu_ram_block_by_name(id);
3611     if (!block) {
3612         error_report("Can't find block %s", id);
3613         return NULL;
3614     }
3615 
3616     if (ramblock_is_ignored(block)) {
3617         error_report("block %s should not be migrated !", id);
3618         return NULL;
3619     }
3620 
3621     mis->last_recv_block[channel] = block;
3622 
3623     return block;
3624 }
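/*
 * For reference, the block reference consumed above is encoded on the
 * wire either implicitly, via RAM_SAVE_FLAG_CONTINUE in the page flags
 * (reuse the last block seen on this channel), or, when that flag is
 * absent, explicitly as:
 *
 *   u8:  strlen(idstr)
 *   ...  idstr bytes (no trailing NUL)
 */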
3625 
3626 static inline void *host_from_ram_block_offset(RAMBlock *block,
3627                                                ram_addr_t offset)
3628 {
3629     if (!offset_in_ramblock(block, offset)) {
3630         return NULL;
3631     }
3632 
3633     return block->host + offset;
3634 }
3635 
3636 static void *host_page_from_ram_block_offset(RAMBlock *block,
3637                                              ram_addr_t offset)
3638 {
3639     /* Note: Explicitly no check against offset_in_ramblock(). */
3640     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3641                                    block->page_size);
3642 }
3643 
3644 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3645                                                          ram_addr_t offset)
3646 {
3647     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3648 }
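/*
 * Worked example (assuming block->host is aligned to block->page_size):
 * with a hugetlbfs-backed block using 2 MiB host pages and 4 KiB target
 * pages, offset 0x201000 maps to the host page starting at block offset
 * 0x200000, and host_page_offset_from_ram_block_offset() returns 0x1000,
 * i.e. the second target page within that huge page.
 */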
3649 
3650 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3651                              ram_addr_t offset, bool record_bitmap)
3652 {
3653     if (!offset_in_ramblock(block, offset)) {
3654         return NULL;
3655     }
3656     if (!block->colo_cache) {
3657         error_report("%s: colo_cache is NULL in block :%s",
3658                      __func__, block->idstr);
3659         return NULL;
3660     }
3661 
3662     /*
3663     * During a COLO checkpoint, we need a bitmap of these migrated pages.
3664     * It helps us decide which pages in the RAM cache should be flushed
3665     * into the VM's RAM later.
3666     */
3667     if (record_bitmap &&
3668         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3669         ram_state->migration_dirty_pages++;
3670     }
3671     return block->colo_cache + offset;
3672 }
3673 
3674 /**
3675  * ram_handle_compressed: handle the zero page case
3676  *
3677  * If a page (or a whole RDMA chunk) has been
3678  * determined to be zero, then zap it.
3679  *
3680  * @host: host address for the zero page
3681  * @ch: what the page is filled from.  We only support zero
3682  * @size: size of the zero page
3683  */
3684 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3685 {
3686     if (ch != 0 || !buffer_is_zero(host, size)) {
3687         memset(host, ch, size);
3688     }
3689 }
3690 
3691 /* return the size after decompression, or negative value on error */
3692 static int
3693 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3694                      const uint8_t *source, size_t source_len)
3695 {
3696     int err;
3697 
3698     err = inflateReset(stream);
3699     if (err != Z_OK) {
3700         return -1;
3701     }
3702 
3703     stream->avail_in = source_len;
3704     stream->next_in = (uint8_t *)source;
3705     stream->avail_out = dest_len;
3706     stream->next_out = dest;
3707 
3708     err = inflate(stream, Z_NO_FLUSH);
3709     if (err != Z_STREAM_END) {
3710         return -1;
3711     }
3712 
3713     return stream->total_out;
3714 }
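/*
 * Illustrative sketch only: standalone usage of qemu_uncompress_data()
 * with a freshly initialized z_stream.  The real callers reuse the
 * per-thread streams set up in compress_threads_load_setup(); the
 * example_* name is made up.
 */
#if 0
static int example_decompress_one_page(const uint8_t *src, size_t src_len,
                                       uint8_t *page)
{
    z_stream stream = { 0 };
    int ret;

    if (inflateInit(&stream) != Z_OK) {
        return -1;
    }
    /* Returns the decompressed size, or a negative value on error. */
    ret = qemu_uncompress_data(&stream, page, TARGET_PAGE_SIZE, src, src_len);
    inflateEnd(&stream);
    return ret;
}
#endif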
3715 
3716 static void *do_data_decompress(void *opaque)
3717 {
3718     DecompressParam *param = opaque;
3719     unsigned long pagesize;
3720     uint8_t *des;
3721     int len, ret;
3722 
3723     qemu_mutex_lock(&param->mutex);
3724     while (!param->quit) {
3725         if (param->des) {
3726             des = param->des;
3727             len = param->len;
3728             param->des = 0;
3729             qemu_mutex_unlock(&param->mutex);
3730 
3731             pagesize = TARGET_PAGE_SIZE;
3732 
3733             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3734                                        param->compbuf, len);
3735             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3736                 error_report("decompress data failed");
3737                 qemu_file_set_error(decomp_file, ret);
3738             }
3739 
3740             qemu_mutex_lock(&decomp_done_lock);
3741             param->done = true;
3742             qemu_cond_signal(&decomp_done_cond);
3743             qemu_mutex_unlock(&decomp_done_lock);
3744 
3745             qemu_mutex_lock(&param->mutex);
3746         } else {
3747             qemu_cond_wait(&param->cond, &param->mutex);
3748         }
3749     }
3750     qemu_mutex_unlock(&param->mutex);
3751 
3752     return NULL;
3753 }
3754 
3755 static int wait_for_decompress_done(void)
3756 {
3757     int idx, thread_count;
3758 
3759     if (!migrate_compress()) {
3760         return 0;
3761     }
3762 
3763     thread_count = migrate_decompress_threads();
3764     qemu_mutex_lock(&decomp_done_lock);
3765     for (idx = 0; idx < thread_count; idx++) {
3766         while (!decomp_param[idx].done) {
3767             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3768         }
3769     }
3770     qemu_mutex_unlock(&decomp_done_lock);
3771     return qemu_file_get_error(decomp_file);
3772 }
3773 
3774 static void compress_threads_load_cleanup(void)
3775 {
3776     int i, thread_count;
3777 
3778     if (!migrate_compress()) {
3779         return;
3780     }
3781     thread_count = migrate_decompress_threads();
3782     for (i = 0; i < thread_count; i++) {
3783         /*
3784          * we use it as an indicator of whether the thread has been
3785          * properly initialized or not
3786          */
3787         if (!decomp_param[i].compbuf) {
3788             break;
3789         }
3790 
3791         qemu_mutex_lock(&decomp_param[i].mutex);
3792         decomp_param[i].quit = true;
3793         qemu_cond_signal(&decomp_param[i].cond);
3794         qemu_mutex_unlock(&decomp_param[i].mutex);
3795     }
3796     for (i = 0; i < thread_count; i++) {
3797         if (!decomp_param[i].compbuf) {
3798             break;
3799         }
3800 
3801         qemu_thread_join(decompress_threads + i);
3802         qemu_mutex_destroy(&decomp_param[i].mutex);
3803         qemu_cond_destroy(&decomp_param[i].cond);
3804         inflateEnd(&decomp_param[i].stream);
3805         g_free(decomp_param[i].compbuf);
3806         decomp_param[i].compbuf = NULL;
3807     }
3808     g_free(decompress_threads);
3809     g_free(decomp_param);
3810     decompress_threads = NULL;
3811     decomp_param = NULL;
3812     decomp_file = NULL;
3813 }
3814 
3815 static int compress_threads_load_setup(QEMUFile *f)
3816 {
3817     int i, thread_count;
3818 
3819     if (!migrate_compress()) {
3820         return 0;
3821     }
3822 
3823     thread_count = migrate_decompress_threads();
3824     decompress_threads = g_new0(QemuThread, thread_count);
3825     decomp_param = g_new0(DecompressParam, thread_count);
3826     qemu_mutex_init(&decomp_done_lock);
3827     qemu_cond_init(&decomp_done_cond);
3828     decomp_file = f;
3829     for (i = 0; i < thread_count; i++) {
3830         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3831             goto exit;
3832         }
3833 
3834         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3835         qemu_mutex_init(&decomp_param[i].mutex);
3836         qemu_cond_init(&decomp_param[i].cond);
3837         decomp_param[i].done = true;
3838         decomp_param[i].quit = false;
3839         qemu_thread_create(decompress_threads + i, "decompress",
3840                            do_data_decompress, decomp_param + i,
3841                            QEMU_THREAD_JOINABLE);
3842     }
3843     return 0;
3844 exit:
3845     compress_threads_load_cleanup();
3846     return -1;
3847 }
3848 
3849 static void decompress_data_with_multi_threads(QEMUFile *f,
3850                                                void *host, int len)
3851 {
3852     int idx, thread_count;
3853 
3854     thread_count = migrate_decompress_threads();
3855     QEMU_LOCK_GUARD(&decomp_done_lock);
3856     while (true) {
3857         for (idx = 0; idx < thread_count; idx++) {
3858             if (decomp_param[idx].done) {
3859                 decomp_param[idx].done = false;
3860                 qemu_mutex_lock(&decomp_param[idx].mutex);
3861                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3862                 decomp_param[idx].des = host;
3863                 decomp_param[idx].len = len;
3864                 qemu_cond_signal(&decomp_param[idx].cond);
3865                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3866                 break;
3867             }
3868         }
3869         if (idx < thread_count) {
3870             break;
3871         } else {
3872             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3873         }
3874     }
3875 }
3876 
3877 static void colo_init_ram_state(void)
3878 {
3879     ram_state_init(&ram_state);
3880 }
3881 
3882 /*
3883  * COLO cache: this is for the secondary VM; we cache the whole
3884  * memory of the secondary VM.  The global lock must be held
3885  * to call this helper.
3886  */
3887 int colo_init_ram_cache(void)
3888 {
3889     RAMBlock *block;
3890 
3891     WITH_RCU_READ_LOCK_GUARD() {
3892         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3893             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3894                                                     NULL, false, false);
3895             if (!block->colo_cache) {
3896                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3897                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3898                              block->used_length);
3899                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3900                     if (block->colo_cache) {
3901                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3902                         block->colo_cache = NULL;
3903                     }
3904                 }
3905                 return -errno;
3906             }
3907             if (!machine_dump_guest_core(current_machine)) {
3908                 qemu_madvise(block->colo_cache, block->used_length,
3909                              QEMU_MADV_DONTDUMP);
3910             }
3911         }
3912     }
3913 
3914     /*
3915     * Record the dirty pages sent by the PVM; we use this dirty bitmap to
3916     * decide which pages in the cache should be flushed into the SVM's RAM.
3917     * Here we use the same name 'ram_bitmap' as for migration.
3918     */
3919     if (ram_bytes_total()) {
3920         RAMBlock *block;
3921 
3922         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3923             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3924             block->bmap = bitmap_new(pages);
3925         }
3926     }
3927 
3928     colo_init_ram_state();
3929     return 0;
3930 }
3931 
3932 /* TODO: duplicated with ram_init_bitmaps */
3933 void colo_incoming_start_dirty_log(void)
3934 {
3935     RAMBlock *block = NULL;
3936     /* For memory_global_dirty_log_start below. */
3937     qemu_mutex_lock_iothread();
3938     qemu_mutex_lock_ramlist();
3939 
3940     memory_global_dirty_log_sync();
3941     WITH_RCU_READ_LOCK_GUARD() {
3942         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3943             ramblock_sync_dirty_bitmap(ram_state, block);
3944             /* Discard this dirty bitmap record */
3945             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3946         }
3947         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3948     }
3949     ram_state->migration_dirty_pages = 0;
3950     qemu_mutex_unlock_ramlist();
3951     qemu_mutex_unlock_iothread();
3952 }
3953 
3954 /* The global lock must be held to call this helper */
3955 void colo_release_ram_cache(void)
3956 {
3957     RAMBlock *block;
3958 
3959     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3960     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3961         g_free(block->bmap);
3962         block->bmap = NULL;
3963     }
3964 
3965     WITH_RCU_READ_LOCK_GUARD() {
3966         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3967             if (block->colo_cache) {
3968                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3969                 block->colo_cache = NULL;
3970             }
3971         }
3972     }
3973     ram_state_cleanup(&ram_state);
3974 }
3975 
3976 /**
3977  * ram_load_setup: Setup RAM for migration incoming side
3978  *
3979  * Returns zero to indicate success and negative for error
3980  *
3981  * @f: QEMUFile where to receive the data
3982  * @opaque: RAMState pointer
3983  */
3984 static int ram_load_setup(QEMUFile *f, void *opaque)
3985 {
3986     if (compress_threads_load_setup(f)) {
3987         return -1;
3988     }
3989 
3990     xbzrle_load_setup();
3991     ramblock_recv_map_init();
3992 
3993     return 0;
3994 }
3995 
3996 static int ram_load_cleanup(void *opaque)
3997 {
3998     RAMBlock *rb;
3999 
4000     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4001         qemu_ram_block_writeback(rb);
4002     }
4003 
4004     xbzrle_load_cleanup();
4005     compress_threads_load_cleanup();
4006 
4007     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4008         g_free(rb->receivedmap);
4009         rb->receivedmap = NULL;
4010     }
4011 
4012     return 0;
4013 }
4014 
4015 /**
4016  * ram_postcopy_incoming_init: allocate postcopy data structures
4017  *
4018  * Returns 0 for success and negative if there was one error
4019  *
4020  * @mis: current migration incoming state
4021  *
4022  * Allocate data structures etc needed by incoming migration with
4023  * postcopy-ram. postcopy-ram's similarly named
4024  * postcopy_ram_incoming_init does the work.
4025  */
4026 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4027 {
4028     return postcopy_ram_incoming_init(mis);
4029 }
4030 
4031 /**
4032  * ram_load_postcopy: load a page in postcopy case
4033  *
4034  * Returns 0 for success or -errno in case of error
4035  *
4036  * Called in postcopy mode by ram_load().
4037  * rcu_read_lock is taken prior to this being called.
4038  *
4039  * @f: QEMUFile where to send the data
4040  * @channel: the channel to use for loading
4041  */
4042 int ram_load_postcopy(QEMUFile *f, int channel)
4043 {
4044     int flags = 0, ret = 0;
4045     bool place_needed = false;
4046     bool matches_target_page_size = false;
4047     MigrationIncomingState *mis = migration_incoming_get_current();
4048     PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
4049 
4050     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4051         ram_addr_t addr;
4052         void *page_buffer = NULL;
4053         void *place_source = NULL;
4054         RAMBlock *block = NULL;
4055         uint8_t ch;
4056         int len;
4057 
4058         addr = qemu_get_be64(f);
4059 
4060         /*
4061          * If there is a QEMU file error, we should stop here, as "addr"
4062          * may be invalid
4063          */
4064         ret = qemu_file_get_error(f);
4065         if (ret) {
4066             break;
4067         }
4068 
4069         flags = addr & ~TARGET_PAGE_MASK;
4070         addr &= TARGET_PAGE_MASK;
4071 
4072         trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
4073         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4074                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
4075             block = ram_block_from_stream(mis, f, flags, channel);
4076             if (!block) {
4077                 ret = -EINVAL;
4078                 break;
4079             }
4080 
4081             /*
4082              * Relying on used_length is racy and can result in false positives.
4083              * We might place pages beyond used_length in case RAM was shrunk
4084              * while in postcopy, which is fine - trying to place via
4085              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4086              */
4087             if (!block->host || addr >= block->postcopy_length) {
4088                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4089                 ret = -EINVAL;
4090                 break;
4091             }
4092             tmp_page->target_pages++;
4093             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4094             /*
4095              * Postcopy requires that we place whole host pages atomically;
4096              * these may be huge pages for RAMBlocks that are backed by
4097              * hugetlbfs.
4098              * To make it atomic, the data is read into a temporary page
4099              * that's moved into place later.
4100              * The migration protocol uses possibly smaller target pages;
4101              * however, the source ensures it always sends all the components
4102              * of a host page in one chunk.
4103              */
4104             page_buffer = tmp_page->tmp_huge_page +
4105                           host_page_offset_from_ram_block_offset(block, addr);
4106             /* If all TP are zero then we can optimise the place */
4107             if (tmp_page->target_pages == 1) {
4108                 tmp_page->host_addr =
4109                     host_page_from_ram_block_offset(block, addr);
4110             } else if (tmp_page->host_addr !=
4111                        host_page_from_ram_block_offset(block, addr)) {
4112                 /* not the 1st TP within the HP */
4113                 error_report("Non-same host page detected on channel %d: "
4114                              "Target host page %p, received host page %p "
4115                              "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
4116                              channel, tmp_page->host_addr,
4117                              host_page_from_ram_block_offset(block, addr),
4118                              block->idstr, addr, tmp_page->target_pages);
4119                 ret = -EINVAL;
4120                 break;
4121             }
4122 
4123             /*
4124              * If it's the last part of a host page then we place the host
4125              * page
4126              */
4127             if (tmp_page->target_pages ==
4128                 (block->page_size / TARGET_PAGE_SIZE)) {
4129                 place_needed = true;
4130             }
4131             place_source = tmp_page->tmp_huge_page;
4132         }
4133 
4134         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4135         case RAM_SAVE_FLAG_ZERO:
4136             ch = qemu_get_byte(f);
4137             /*
4138              * We can skip setting page_buffer when
4139              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4140              */
4141             if (ch || !matches_target_page_size) {
4142                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4143             }
4144             if (ch) {
4145                 tmp_page->all_zero = false;
4146             }
4147             break;
4148 
4149         case RAM_SAVE_FLAG_PAGE:
4150             tmp_page->all_zero = false;
4151             if (!matches_target_page_size) {
4152                 /* For huge pages, we always use temporary buffer */
4153                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4154             } else {
4155                 /*
4156                  * For small pages that match the target page size, we
4157                  * avoid the qemu_file copy.  Instead we directly use
4158                  * the buffer of QEMUFile to place the page.  Note: we
4159                  * cannot do any QEMUFile operation before using that
4160                  * buffer to make sure the buffer is valid when
4161                  * placing the page.
4162                  */
4163                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4164                                          TARGET_PAGE_SIZE);
4165             }
4166             break;
4167         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4168             tmp_page->all_zero = false;
4169             len = qemu_get_be32(f);
4170             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4171                 error_report("Invalid compressed data length: %d", len);
4172                 ret = -EINVAL;
4173                 break;
4174             }
4175             decompress_data_with_multi_threads(f, page_buffer, len);
4176             break;
4177         case RAM_SAVE_FLAG_MULTIFD_FLUSH:
4178             multifd_recv_sync_main();
4179             break;
4180         case RAM_SAVE_FLAG_EOS:
4181             /* normal exit */
4182             if (migrate_multifd_flush_after_each_section()) {
4183                 multifd_recv_sync_main();
4184             }
4185             break;
4186         default:
4187             error_report("Unknown combination of migration flags: 0x%x"
4188                          " (postcopy mode)", flags);
4189             ret = -EINVAL;
4190             break;
4191         }
4192 
4193         /* Got the whole host page, wait for decompress before placing. */
4194         if (place_needed) {
4195             ret |= wait_for_decompress_done();
4196         }
4197 
4198         /* Detect any possible file errors */
4199         if (!ret && qemu_file_get_error(f)) {
4200             ret = qemu_file_get_error(f);
4201         }
4202 
4203         if (!ret && place_needed) {
4204             if (tmp_page->all_zero) {
4205                 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
4206             } else {
4207                 ret = postcopy_place_page(mis, tmp_page->host_addr,
4208                                           place_source, block);
4209             }
4210             place_needed = false;
4211             postcopy_temp_page_reset(tmp_page);
4212         }
4213     }
4214 
4215     return ret;
4216 }
4217 
4218 static bool postcopy_is_running(void)
4219 {
4220     PostcopyState ps = postcopy_state_get();
4221     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4222 }
4223 
4224 /*
4225  * Flush content of RAM cache into SVM's memory.
4226  * Only flush the pages that were dirtied by the PVM, the SVM, or both.
4227  */
4228 void colo_flush_ram_cache(void)
4229 {
4230     RAMBlock *block = NULL;
4231     void *dst_host;
4232     void *src_host;
4233     unsigned long offset = 0;
4234 
4235     memory_global_dirty_log_sync();
4236     WITH_RCU_READ_LOCK_GUARD() {
4237         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4238             ramblock_sync_dirty_bitmap(ram_state, block);
4239         }
4240     }
4241 
4242     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4243     WITH_RCU_READ_LOCK_GUARD() {
4244         block = QLIST_FIRST_RCU(&ram_list.blocks);
4245 
4246         while (block) {
4247             unsigned long num = 0;
4248 
4249             offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
4250             if (!offset_in_ramblock(block,
4251                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
4252                 offset = 0;
4253                 num = 0;
4254                 block = QLIST_NEXT_RCU(block, next);
4255             } else {
4256                 unsigned long i = 0;
4257 
4258                 for (i = 0; i < num; i++) {
4259                     migration_bitmap_clear_dirty(ram_state, block, offset + i);
4260                 }
4261                 dst_host = block->host
4262                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4263                 src_host = block->colo_cache
4264                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4265                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4266                 offset += num;
4267             }
4268         }
4269     }
4270     trace_colo_flush_ram_cache_end();
4271 }
4272 
4273 /**
4274  * ram_load_precopy: load pages in precopy case
4275  *
4276  * Returns 0 for success or -errno in case of error
4277  *
4278  * Called in precopy mode by ram_load().
4279  * rcu_read_lock is taken prior to this being called.
4280  *
4281  * @f: QEMUFile where to send the data
4282  */
4283 static int ram_load_precopy(QEMUFile *f)
4284 {
4285     MigrationIncomingState *mis = migration_incoming_get_current();
4286     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4287     /* ADVISE is earlier, it shows the source has the postcopy capability on */
4288     bool postcopy_advised = migration_incoming_postcopy_advised();
4289     if (!migrate_compress()) {
4290         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4291     }
4292 
4293     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4294         ram_addr_t addr, total_ram_bytes;
4295         void *host = NULL, *host_bak = NULL;
4296         uint8_t ch;
4297 
4298         /*
4299          * Yield periodically to let the main loop run, but an iteration of
4300          * the main loop is expensive, so only do it every few iterations
4301          */
4302         if ((i & 32767) == 0 && qemu_in_coroutine()) {
4303             aio_co_schedule(qemu_get_current_aio_context(),
4304                             qemu_coroutine_self());
4305             qemu_coroutine_yield();
4306         }
4307         i++;
4308 
4309         addr = qemu_get_be64(f);
4310         flags = addr & ~TARGET_PAGE_MASK;
4311         addr &= TARGET_PAGE_MASK;
4312 
4313         if (flags & invalid_flags) {
4314             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4315                 error_report("Received an unexpected compressed page");
4316             }
4317 
4318             ret = -EINVAL;
4319             break;
4320         }
4321 
4322         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4323                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4324             RAMBlock *block = ram_block_from_stream(mis, f, flags,
4325                                                     RAM_CHANNEL_PRECOPY);
4326 
4327             host = host_from_ram_block_offset(block, addr);
4328             /*
4329              * After entering the COLO stage, we should not load pages into
4330              * the SVM's memory directly; we put them into colo_cache first.
4331              * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
4332              * Previously, we copied all of this memory in the COLO preparing
4333              * stage, which required stopping the VM and was time-consuming.
4334              * Here we optimize it with a trick: back up every page during the
4335              * migration process while COLO is enabled. Although this affects
4336              * the speed of the migration, it clearly reduces the downtime of
4337              * backing up all of the SVM's memory in the COLO preparing stage.
4338              */
4339             if (migration_incoming_colo_enabled()) {
4340                 if (migration_incoming_in_colo_state()) {
4341                     /* In COLO stage, put all pages into cache temporarily */
4342                     host = colo_cache_from_block_offset(block, addr, true);
4343                 } else {
4344                    /*
4345                     * In migration stage but before COLO stage,
4346                     * Put all pages into both cache and SVM's memory.
4347                     */
4348                     host_bak = colo_cache_from_block_offset(block, addr, false);
4349                 }
4350             }
4351             if (!host) {
4352                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4353                 ret = -EINVAL;
4354                 break;
4355             }
4356             if (!migration_incoming_in_colo_state()) {
4357                 ramblock_recv_bitmap_set(block, host);
4358             }
4359 
4360             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4361         }
4362 
4363         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4364         case RAM_SAVE_FLAG_MEM_SIZE:
4365             /* Synchronize RAM block list */
4366             total_ram_bytes = addr;
4367             while (!ret && total_ram_bytes) {
4368                 RAMBlock *block;
4369                 char id[256];
4370                 ram_addr_t length;
4371 
4372                 len = qemu_get_byte(f);
4373                 qemu_get_buffer(f, (uint8_t *)id, len);
4374                 id[len] = 0;
4375                 length = qemu_get_be64(f);
4376 
4377                 block = qemu_ram_block_by_name(id);
4378                 if (block && !qemu_ram_is_migratable(block)) {
4379                     error_report("block %s should not be migrated !", id);
4380                     ret = -EINVAL;
4381                 } else if (block) {
4382                     if (length != block->used_length) {
4383                         Error *local_err = NULL;
4384 
4385                         ret = qemu_ram_resize(block, length,
4386                                               &local_err);
4387                         if (local_err) {
4388                             error_report_err(local_err);
4389                         }
4390                     }
4391                     /* For postcopy we need to check hugepage sizes match */
4392                     if (postcopy_advised && migrate_postcopy_ram() &&
4393                         block->page_size != qemu_host_page_size) {
4394                         uint64_t remote_page_size = qemu_get_be64(f);
4395                         if (remote_page_size != block->page_size) {
4396                             error_report("Mismatched RAM page size %s "
4397                                          "(local) %zd != %" PRId64,
4398                                          id, block->page_size,
4399                                          remote_page_size);
4400                             ret = -EINVAL;
4401                         }
4402                     }
4403                     if (migrate_ignore_shared()) {
4404                         hwaddr addr = qemu_get_be64(f);
4405                         if (ramblock_is_ignored(block) &&
4406                             block->mr->addr != addr) {
4407                             error_report("Mismatched GPAs for block %s "
4408                                          "%" PRId64 "!= %" PRId64,
4409                                          id, (uint64_t)addr,
4410                                          (uint64_t)block->mr->addr);
4411                             ret = -EINVAL;
4412                         }
4413                     }
4414                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4415                                           block->idstr);
4416                 } else {
4417                     error_report("Unknown ramblock \"%s\", cannot "
4418                                  "accept migration", id);
4419                     ret = -EINVAL;
4420                 }
4421 
4422                 total_ram_bytes -= length;
4423             }
4424             break;
4425 
4426         case RAM_SAVE_FLAG_ZERO:
4427             ch = qemu_get_byte(f);
4428             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4429             break;
4430 
4431         case RAM_SAVE_FLAG_PAGE:
4432             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4433             break;
4434 
4435         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4436             len = qemu_get_be32(f);
4437             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4438                 error_report("Invalid compressed data length: %d", len);
4439                 ret = -EINVAL;
4440                 break;
4441             }
4442             decompress_data_with_multi_threads(f, host, len);
4443             break;
4444 
4445         case RAM_SAVE_FLAG_XBZRLE:
4446             if (load_xbzrle(f, addr, host) < 0) {
4447                 error_report("Failed to decompress XBZRLE page at "
4448                              RAM_ADDR_FMT, addr);
4449                 ret = -EINVAL;
4450                 break;
4451             }
4452             break;
4453         case RAM_SAVE_FLAG_MULTIFD_FLUSH:
4454             multifd_recv_sync_main();
4455             break;
4456         case RAM_SAVE_FLAG_EOS:
4457             /* normal exit */
4458             if (migrate_multifd_flush_after_each_section()) {
4459                 multifd_recv_sync_main();
4460             }
4461             break;
4462         default:
4463             if (flags & RAM_SAVE_FLAG_HOOK) {
4464                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4465             } else {
4466                 error_report("Unknown combination of migration flags: 0x%x",
4467                              flags);
4468                 ret = -EINVAL;
4469             }
4470         }
4471         if (!ret) {
4472             ret = qemu_file_get_error(f);
4473         }
4474         if (!ret && host_bak) {
4475             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4476         }
4477     }
4478 
4479     ret |= wait_for_decompress_done();
4480     return ret;
4481 }
4482 
4483 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4484 {
4485     int ret = 0;
4486     static uint64_t seq_iter;
4487     /*
4488      * If the system is running in postcopy mode, page inserts into host
4489      * memory must be atomic
4490      */
4491     bool postcopy_running = postcopy_is_running();
4492 
4493     seq_iter++;
4494 
4495     if (version_id != 4) {
4496         return -EINVAL;
4497     }
4498 
4499     /*
4500      * This RCU critical section can be very long running.
4501      * When RCU reclaims in the code start to become numerous,
4502      * it will be necessary to reduce the granularity of this
4503      * critical section.
4504      */
4505     WITH_RCU_READ_LOCK_GUARD() {
4506         if (postcopy_running) {
4507             /*
4508              * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
4509              * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4510              * service fast page faults.
4511              */
4512             ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4513         } else {
4514             ret = ram_load_precopy(f);
4515         }
4516     }
4517     trace_ram_load_complete(ret, seq_iter);
4518 
4519     return ret;
4520 }
4521 
4522 static bool ram_has_postcopy(void *opaque)
4523 {
4524     RAMBlock *rb;
4525     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4526         if (ramblock_is_pmem(rb)) {
4527             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4528                          "is not supported now!", rb->idstr, rb->host);
4529             return false;
4530         }
4531     }
4532 
4533     return migrate_postcopy_ram();
4534 }
4535 
4536 /* Sync all the dirty bitmap with destination VM.  */
4537 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4538 {
4539     RAMBlock *block;
4540     QEMUFile *file = s->to_dst_file;
4541     int ramblock_count = 0;
4542 
4543     trace_ram_dirty_bitmap_sync_start();
4544 
4545     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4546         qemu_savevm_send_recv_bitmap(file, block->idstr);
4547         trace_ram_dirty_bitmap_request(block->idstr);
4548         ramblock_count++;
4549     }
4550 
4551     trace_ram_dirty_bitmap_sync_wait();
4552 
4553     /* Wait until all the ramblocks' dirty bitmaps are synced */
4554     while (ramblock_count--) {
4555         qemu_sem_wait(&s->rp_state.rp_sem);
4556     }
4557 
4558     trace_ram_dirty_bitmap_sync_complete();
4559 
4560     return 0;
4561 }
4562 
4563 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4564 {
4565     qemu_sem_post(&s->rp_state.rp_sem);
4566 }
4567 
4568 /*
4569  * Read the received bitmap and invert it to form the initial dirty bitmap.
4570  * This is only used when the postcopy migration is paused but wants
4571  * to resume from a middle point.
4572  */
4573 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4574 {
4575     int ret = -EINVAL;
4576     /* from_dst_file is always valid because we're within rp_thread */
4577     QEMUFile *file = s->rp_state.from_dst_file;
4578     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4579     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4580     uint64_t size, end_mark;
4581 
4582     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4583 
4584     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4585         error_report("%s: incorrect state %s", __func__,
4586                      MigrationStatus_str(s->state));
4587         return -EINVAL;
4588     }
4589 
4590     /*
4591      * Note: see comments in ramblock_recv_bitmap_send() on why we
4592      * need the endianness conversion, and the paddings.
4593      */
4594     local_size = ROUND_UP(local_size, 8);
4595 
4596     /* Add paddings */
4597     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4598 
4599     size = qemu_get_be64(file);
4600 
4601     /* The size of the bitmap should match with our ramblock */
4602     if (size != local_size) {
4603         error_report("%s: ramblock '%s' bitmap size mismatch "
4604                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4605                      block->idstr, size, local_size);
4606         ret = -EINVAL;
4607         goto out;
4608     }
4609 
4610     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4611     end_mark = qemu_get_be64(file);
4612 
4613     ret = qemu_file_get_error(file);
4614     if (ret || size != local_size) {
4615         error_report("%s: read bitmap failed for ramblock '%s': %d"
4616                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4617                      __func__, block->idstr, ret, local_size, size);
4618         ret = -EIO;
4619         goto out;
4620     }
4621 
4622     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4623         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4624                      __func__, block->idstr, end_mark);
4625         ret = -EINVAL;
4626         goto out;
4627     }
4628 
4629     /*
4630      * Endianness conversion. We are during postcopy (though paused).
4631      * The dirty bitmap won't change. We can directly modify it.
4632      */
4633     bitmap_from_le(block->bmap, le_bitmap, nbits);
4634 
4635     /*
4636      * What we received is the "received bitmap". Invert it to form the
4637      * initial dirty bitmap for this ramblock.
4638      */
4639     bitmap_complement(block->bmap, block->bmap, nbits);
4640 
4641     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4642     ramblock_dirty_bitmap_clear_discarded_pages(block);
4643 
4644     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4645     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4646 
4647     /*
4648      * We successfully synced the bitmap for this ramblock. If this is
4649      * the last one to sync, we need to notify the main send thread.
4650      */
4651     ram_dirty_bitmap_reload_notify(s);
4652 
4653     ret = 0;
4654 out:
4655     g_free(le_bitmap);
4656     return ret;
4657 }
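/*
 * For reference, the bitmap message parsed above (sent by the
 * destination's ramblock_recv_bitmap_send()) is laid out as:
 *
 *   be64: size of the bitmap in bytes (bitmap bits rounded up to 64,
 *         then divided by 8)
 *   ...   'size' bytes of little-endian bitmap data
 *   be64: RAMBLOCK_RECV_BITMAP_ENDING
 */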
4658 
4659 static int ram_resume_prepare(MigrationState *s, void *opaque)
4660 {
4661     RAMState *rs = *(RAMState **)opaque;
4662     int ret;
4663 
4664     ret = ram_dirty_bitmap_sync_all(s, rs);
4665     if (ret) {
4666         return ret;
4667     }
4668 
4669     ram_state_resume_prepare(rs, s->to_dst_file);
4670 
4671     return 0;
4672 }
4673 
4674 void postcopy_preempt_shutdown_file(MigrationState *s)
4675 {
4676     qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4677     qemu_fflush(s->postcopy_qemufile_src);
4678 }
4679 
4680 static SaveVMHandlers savevm_ram_handlers = {
4681     .save_setup = ram_save_setup,
4682     .save_live_iterate = ram_save_iterate,
4683     .save_live_complete_postcopy = ram_save_complete,
4684     .save_live_complete_precopy = ram_save_complete,
4685     .has_postcopy = ram_has_postcopy,
4686     .state_pending_exact = ram_state_pending_exact,
4687     .state_pending_estimate = ram_state_pending_estimate,
4688     .load_state = ram_load,
4689     .save_cleanup = ram_save_cleanup,
4690     .load_setup = ram_load_setup,
4691     .load_cleanup = ram_load_cleanup,
4692     .resume_prepare = ram_resume_prepare,
4693 };
4694 
4695 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4696                                       size_t old_size, size_t new_size)
4697 {
4698     PostcopyState ps = postcopy_state_get();
4699     ram_addr_t offset;
4700     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4701     Error *err = NULL;
4702 
4703     if (ramblock_is_ignored(rb)) {
4704         return;
4705     }
4706 
4707     if (!migration_is_idle()) {
4708         /*
4709          * Precopy code on the source cannot deal with the size of RAM blocks
4710          * changing at random points in time - especially after sending the
4711          * RAM block sizes in the migration stream, they must no longer change.
4712          * Abort and indicate a proper reason.
4713          */
4714         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4715         migration_cancel(err);
4716         error_free(err);
4717     }
4718 
4719     switch (ps) {
4720     case POSTCOPY_INCOMING_ADVISE:
4721         /*
4722          * Update what ram_postcopy_incoming_init()->init_range() does at the
4723          * time postcopy was advised. Syncing RAM blocks with the source will
4724          * result in RAM resizes.
4725          */
4726         if (old_size < new_size) {
4727             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4728                 error_report("RAM block '%s' discard of resized RAM failed",
4729                              rb->idstr);
4730             }
4731         }
4732         rb->postcopy_length = new_size;
4733         break;
4734     case POSTCOPY_INCOMING_NONE:
4735     case POSTCOPY_INCOMING_RUNNING:
4736     case POSTCOPY_INCOMING_END:
4737         /*
4738          * Once our guest is running, postcopy no longer cares about
4739          * resizes. When growing, the new memory was not available on the
4740          * source, no handler needed.
4741          */
4742         break;
4743     default:
4744         error_report("RAM block '%s' resized during postcopy state: %d",
4745                      rb->idstr, ps);
4746         exit(-1);
4747     }
4748 }
4749 
4750 static RAMBlockNotifier ram_mig_ram_notifier = {
4751     .ram_block_resized = ram_mig_ram_block_resized,
4752 };
4753 
4754 void ram_mig_init(void)
4755 {
4756     qemu_mutex_init(&XBZRLE.lock);
4757     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4758     ram_block_notifier_add(&ram_mig_ram_notifier);
4759 }
4760