xref: /qemu/migration/ram.c (revision a976a99a)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/madvise.h"
34 #include "qemu/main-loop.h"
35 #include "io/channel-null.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/error.h"
46 #include "qapi/qapi-types-migration.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
49 #include "trace.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
54 #include "block.h"
55 #include "sysemu/cpu-throttle.h"
56 #include "savevm.h"
57 #include "qemu/iov.h"
58 #include "multifd.h"
59 #include "sysemu/runstate.h"
60 
61 #include "hw/boards.h" /* for machine_dump_guest_core() */
62 
63 #if defined(__linux__)
64 #include "qemu/userfaultfd.h"
65 #endif /* defined(__linux__) */
66 
67 /***********************************************************/
68 /* ram save/restore */
69 
/*
 * Flag bits that are OR-ed into the low bits of the page offset written
 * by save_page_header().
 *
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value, and renamed it to avoid
 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
/* Same block as the previous header: save_page_header() omits the block id */
#define RAM_SAVE_FLAG_CONTINUE 0x20
/* Page payload is XBZRLE encoded, see save_xbzrle_page() */
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
85 
/*
 * XBZRLE statistics.  save_xbzrle_page() updates the cache_miss, pages,
 * overflow and bytes members.
 */
XBZRLECacheStats xbzrle_counters;

/*
 * XBZRLE working state: the page cache plus the scratch buffers used
 * while encoding/decoding, and a static page of zeros.
 * NOTE(review): allocation and teardown of these members is not visible
 * in this part of the file.
 */
static struct {
    /* buffer used for XBZRLE encoding (output of xbzrle_encode_buffer()) */
    uint8_t *encoded_buf;
    /* buffer for storing page content (private copy taken before encoding) */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* taken/released through XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
103 
104 static void XBZRLE_cache_lock(void)
105 {
106     if (migrate_use_xbzrle()) {
107         qemu_mutex_lock(&XBZRLE.lock);
108     }
109 }
110 
111 static void XBZRLE_cache_unlock(void)
112 {
113     if (migrate_use_xbzrle()) {
114         qemu_mutex_unlock(&XBZRLE.lock);
115     }
116 }
117 
118 /**
119  * xbzrle_cache_resize: resize the xbzrle cache
120  *
121  * This function is called from migrate_params_apply in main
122  * thread, possibly while a migration is in progress.  A running
123  * migration may be using the cache and might finish during this call,
124  * hence changes to the cache are protected by XBZRLE.lock().
125  *
126  * Returns 0 for success or -1 for error
127  *
128  * @new_size: new cache size
129  * @errp: set *errp if the check failed, with reason
130  */
131 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
132 {
133     PageCache *new_cache;
134     int64_t ret = 0;
135 
136     /* Check for truncation */
137     if (new_size != (size_t)new_size) {
138         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
139                    "exceeding address space");
140         return -1;
141     }
142 
143     if (new_size == migrate_xbzrle_cache_size()) {
144         /* nothing to do */
145         return 0;
146     }
147 
148     XBZRLE_cache_lock();
149 
150     if (XBZRLE.cache != NULL) {
151         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
152         if (!new_cache) {
153             ret = -1;
154             goto out;
155         }
156 
157         cache_fini(XBZRLE.cache);
158         XBZRLE.cache = new_cache;
159     }
160 out:
161     XBZRLE_cache_unlock();
162     return ret;
163 }
164 
165 bool ramblock_is_ignored(RAMBlock *block)
166 {
167     return !qemu_ram_is_migratable(block) ||
168            (migrate_ignore_shared() && qemu_ram_is_shared(block));
169 }
170 
171 #undef RAMBLOCK_FOREACH
172 
173 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
174 {
175     RAMBlock *block;
176     int ret = 0;
177 
178     RCU_READ_LOCK_GUARD();
179 
180     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
181         ret = func(block, opaque);
182         if (ret) {
183             break;
184         }
185     }
186     return ret;
187 }
188 
189 static void ramblock_recv_map_init(void)
190 {
191     RAMBlock *rb;
192 
193     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
194         assert(!rb->receivedmap);
195         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
196     }
197 }
198 
199 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
200 {
201     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
202                     rb->receivedmap);
203 }
204 
205 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
206 {
207     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
208 }
209 
210 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
211 {
212     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
213 }
214 
215 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
216                                     size_t nr)
217 {
218     bitmap_set_atomic(rb->receivedmap,
219                       ramblock_recv_bitmap_offset(host_addr, rb),
220                       nr);
221 }
222 
/* End marker that ramblock_recv_bitmap_send() writes after the bitmap */
#define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
224 
225 /*
226  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
227  *
228  * Returns >0 if success with sent bytes, or <0 if error.
229  */
230 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
231                                   const char *block_name)
232 {
233     RAMBlock *block = qemu_ram_block_by_name(block_name);
234     unsigned long *le_bitmap, nbits;
235     uint64_t size;
236 
237     if (!block) {
238         error_report("%s: invalid block name: %s", __func__, block_name);
239         return -1;
240     }
241 
242     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
243 
244     /*
245      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
246      * machines we may need 4 more bytes for padding (see below
247      * comment). So extend it a bit before hand.
248      */
249     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
250 
251     /*
252      * Always use little endian when sending the bitmap. This is
253      * required that when source and destination VMs are not using the
254      * same endianness. (Note: big endian won't work.)
255      */
256     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
257 
258     /* Size of the bitmap, in bytes */
259     size = DIV_ROUND_UP(nbits, 8);
260 
261     /*
262      * size is always aligned to 8 bytes for 64bit machines, but it
263      * may not be true for 32bit machines. We need this padding to
264      * make sure the migration can survive even between 32bit and
265      * 64bit machines.
266      */
267     size = ROUND_UP(size, 8);
268 
269     qemu_put_be64(file, size);
270     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
271     /*
272      * Mark as an end, in case the middle part is screwed up due to
273      * some "mysterious" reason.
274      */
275     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
276     qemu_fflush(file);
277 
278     g_free(le_bitmap);
279 
280     if (qemu_file_get_error(file)) {
281         return qemu_file_get_error(file);
282     }
283 
284     return size + sizeof(size);
285 }
286 
/*
 * An outstanding page request, on the source, having been received
 * and queued (entries live on RAMState.src_page_requests)
 */
struct RAMSrcPageRequest {
    /* RAMBlock the request refers to */
    RAMBlock *rb;
    /* NOTE(review): presumably byte offset and length within rb - confirm */
    hwaddr    offset;
    hwaddr    len;

    /* queue linkage */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
298 
/*
 * Postcopy preemption bookkeeping, embedded in RAMState and zeroed by
 * postcopy_preempt_reset().
 */
typedef struct {
    /*
     * Cached ramblock/offset values if preempted.  They're only meaningful if
     * preempted==true below.
     */
    RAMBlock *ram_block;
    unsigned long ram_page;
    /*
     * Whether a postcopy preemption just happened.  Will be reset after
     * precopy recovered to background migration.
     */
    bool preempted;
} PostcopyPreemptState;
312 
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have seen too many dirty pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Start using XBZRLE (e.g., after the first round). */
    bool xbzrle_enabled;
    /* Are we on the last stage of migration */
    bool last_stage;
    /* compression statistics since the beginning of the period */
    /* number of times there was no free thread to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;

    /* Postcopy preemption information */
    PostcopyPreemptState postcopy_preempt_state;
    /*
     * Current channel we're using on src VM.  Only valid if postcopy-preempt
     * is enabled.
     */
    unsigned int postcopy_channel;
};
typedef struct RAMState RAMState;
377 
/* Current RAM migration state; may be NULL (see ram_bytes_remaining()) */
static RAMState *ram_state;

/* Notifiers invoked by precopy_notify() */
static NotifierWithReturnList precopy_notifier_list;
381 
382 static void postcopy_preempt_reset(RAMState *rs)
383 {
384     memset(&rs->postcopy_preempt_state, 0, sizeof(PostcopyPreemptState));
385 }
386 
387 /* Whether postcopy has queued requests? */
388 static bool postcopy_has_request(RAMState *rs)
389 {
390     return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
391 }
392 
/* Initialise the (file-local) list of precopy notifiers. */
void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}
397 
/* Register @n on the precopy notifier list used by precopy_notify(). */
void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}
402 
/* Unregister the notifier @n (delegates to notifier_with_return_remove()). */
void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}
407 
408 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
409 {
410     PrecopyNotifyData pnd;
411     pnd.reason = reason;
412     pnd.errp = errp;
413 
414     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
415 }
416 
417 uint64_t ram_bytes_remaining(void)
418 {
419     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
420                        0;
421 }
422 
/* RAM migration statistics; byte counters are fed by ram_transferred_add() */
MigrationStats ram_counters;
424 
425 static void ram_transferred_add(uint64_t bytes)
426 {
427     if (runstate_is_running()) {
428         ram_counters.precopy_bytes += bytes;
429     } else if (migration_in_postcopy()) {
430         ram_counters.postcopy_bytes += bytes;
431     } else {
432         ram_counters.downtime_bytes += bytes;
433     }
434     ram_counters.transferred += bytes;
435 }
436 
437 void dirty_sync_missed_zero_copy(void)
438 {
439     ram_counters.dirty_sync_missed_zero_copy++;
440 }
441 
/* Cursor used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
    /*
     * [POSTCOPY-ONLY] Whether current page is explicitly requested by
     * postcopy.  When set, the request is "urgent" because the dest QEMU
     * threads are waiting for us.
     */
    bool         postcopy_requested;
    /*
     * [POSTCOPY-ONLY] The target channel to use to send current page.
     *
     * Note: This may _not_ match the value in postcopy_requested
     * above. Let's imagine the case where the postcopy request is exactly
     * the page that we're sending in progress during precopy. In this case
     * we'll have postcopy_requested set to true but the target channel
     * will be the precopy channel (so that we don't split brain on that
     * specific page since the precopy channel already contains part of
     * that page data).
     *
     * Besides that specific use case, postcopy_target_channel should
     * always be equal to postcopy_requested, because by default we send
     * postcopy pages via postcopy preempt channel.
     */
    bool         postcopy_target_channel;
};
typedef struct PageSearchStatus PageSearchStatus;
474 
/*
 * Statistics of the compression path.
 * NOTE(review): the code updating them is not in this part of the file
 * (a later comment points at update_compress_thread_counts()) - confirm.
 */
CompressionStats compression_counters;
476 
/* Per-thread state of one compression worker (see do_data_compress()) */
struct CompressParam {
    /* set to true by the worker, under comp_done_lock, when a page is done */
    bool done;
    /* set under mutex to ask the worker to leave its loop */
    bool quit;
    /* result of do_compress_ram_page() for the last page handled */
    bool zero_page;
    /* dummy buffer QEMUFile; also serves as the "thread is set up" marker */
    QEMUFile *file;
    /* protects block/offset/quit handoff; cond is waited on with it held */
    QemuMutex mutex;
    QemuCond cond;
    /* page to compress; block is reset to NULL once the worker picked it up */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
492 
/*
 * Per-thread state of one decompression worker.
 * NOTE(review): the code using this struct is not in this part of the
 * file; field roles below are inferred from the names - confirm.
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably the destination of the decompressed data */
    void *des;
    /* presumably the compressed input and its length */
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
504 
/*
 * Compression side: one CompressParam and one QemuThread per
 * compression thread, allocated by compress_threads_save_setup() and
 * released by compress_threads_save_cleanup().
 */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;

/*
 * Decompression side counterparts.
 * NOTE(review): their users are not in this part of the file.
 */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;
519 
/* Forward declarations for helpers defined later in the file */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);

static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
                                     bool postcopy_requested);
525 
526 static void *do_data_compress(void *opaque)
527 {
528     CompressParam *param = opaque;
529     RAMBlock *block;
530     ram_addr_t offset;
531     bool zero_page;
532 
533     qemu_mutex_lock(&param->mutex);
534     while (!param->quit) {
535         if (param->block) {
536             block = param->block;
537             offset = param->offset;
538             param->block = NULL;
539             qemu_mutex_unlock(&param->mutex);
540 
541             zero_page = do_compress_ram_page(param->file, &param->stream,
542                                              block, offset, param->originbuf);
543 
544             qemu_mutex_lock(&comp_done_lock);
545             param->done = true;
546             param->zero_page = zero_page;
547             qemu_cond_signal(&comp_done_cond);
548             qemu_mutex_unlock(&comp_done_lock);
549 
550             qemu_mutex_lock(&param->mutex);
551         } else {
552             qemu_cond_wait(&param->cond, &param->mutex);
553         }
554     }
555     qemu_mutex_unlock(&param->mutex);
556 
557     return NULL;
558 }
559 
560 static void compress_threads_save_cleanup(void)
561 {
562     int i, thread_count;
563 
564     if (!migrate_use_compression() || !comp_param) {
565         return;
566     }
567 
568     thread_count = migrate_compress_threads();
569     for (i = 0; i < thread_count; i++) {
570         /*
571          * we use it as a indicator which shows if the thread is
572          * properly init'd or not
573          */
574         if (!comp_param[i].file) {
575             break;
576         }
577 
578         qemu_mutex_lock(&comp_param[i].mutex);
579         comp_param[i].quit = true;
580         qemu_cond_signal(&comp_param[i].cond);
581         qemu_mutex_unlock(&comp_param[i].mutex);
582 
583         qemu_thread_join(compress_threads + i);
584         qemu_mutex_destroy(&comp_param[i].mutex);
585         qemu_cond_destroy(&comp_param[i].cond);
586         deflateEnd(&comp_param[i].stream);
587         g_free(comp_param[i].originbuf);
588         qemu_fclose(comp_param[i].file);
589         comp_param[i].file = NULL;
590     }
591     qemu_mutex_destroy(&comp_done_lock);
592     qemu_cond_destroy(&comp_done_cond);
593     g_free(compress_threads);
594     g_free(comp_param);
595     compress_threads = NULL;
596     comp_param = NULL;
597 }
598 
599 static int compress_threads_save_setup(void)
600 {
601     int i, thread_count;
602 
603     if (!migrate_use_compression()) {
604         return 0;
605     }
606     thread_count = migrate_compress_threads();
607     compress_threads = g_new0(QemuThread, thread_count);
608     comp_param = g_new0(CompressParam, thread_count);
609     qemu_cond_init(&comp_done_cond);
610     qemu_mutex_init(&comp_done_lock);
611     for (i = 0; i < thread_count; i++) {
612         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
613         if (!comp_param[i].originbuf) {
614             goto exit;
615         }
616 
617         if (deflateInit(&comp_param[i].stream,
618                         migrate_compress_level()) != Z_OK) {
619             g_free(comp_param[i].originbuf);
620             goto exit;
621         }
622 
623         /* comp_param[i].file is just used as a dummy buffer to save data,
624          * set its ops to empty.
625          */
626         comp_param[i].file = qemu_file_new_output(
627             QIO_CHANNEL(qio_channel_null_new()));
628         comp_param[i].done = true;
629         comp_param[i].quit = false;
630         qemu_mutex_init(&comp_param[i].mutex);
631         qemu_cond_init(&comp_param[i].cond);
632         qemu_thread_create(compress_threads + i, "compress",
633                            do_data_compress, comp_param + i,
634                            QEMU_THREAD_JOINABLE);
635     }
636     return 0;
637 
638 exit:
639     compress_threads_save_cleanup();
640     return -1;
641 }
642 
643 /**
644  * save_page_header: write page header to wire
645  *
646  * If this is the 1st block, it also writes the block identification
647  *
648  * Returns the number of bytes written
649  *
650  * @f: QEMUFile where to send the data
651  * @block: block that contains the page we want to send
652  * @offset: offset inside the block for the page
653  *          in the lower bits, it contains flags
654  */
655 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
656                                ram_addr_t offset)
657 {
658     size_t size, len;
659 
660     if (block == rs->last_sent_block) {
661         offset |= RAM_SAVE_FLAG_CONTINUE;
662     }
663     qemu_put_be64(f, offset);
664     size = 8;
665 
666     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
667         len = strlen(block->idstr);
668         qemu_put_byte(f, len);
669         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
670         size += 1 + len;
671         rs->last_sent_block = block;
672     }
673     return size;
674 }
675 
676 /**
677  * mig_throttle_guest_down: throttle down the guest
678  *
679  * Reduce amount of guest cpu execution to hopefully slow down memory
680  * writes. If guest dirty memory rate is reduced below the rate at
681  * which we can transfer pages to the destination then we should be
682  * able to complete migration. Some workloads dirty memory way too
683  * fast and will not effectively converge, even with auto-converge.
684  */
685 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
686                                     uint64_t bytes_dirty_threshold)
687 {
688     MigrationState *s = migrate_get_current();
689     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
690     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
691     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
692     int pct_max = s->parameters.max_cpu_throttle;
693 
694     uint64_t throttle_now = cpu_throttle_get_percentage();
695     uint64_t cpu_now, cpu_ideal, throttle_inc;
696 
697     /* We have not started throttling yet. Let's start it. */
698     if (!cpu_throttle_active()) {
699         cpu_throttle_set(pct_initial);
700     } else {
701         /* Throttling already on, just increase the rate */
702         if (!pct_tailslow) {
703             throttle_inc = pct_increment;
704         } else {
705             /* Compute the ideal CPU percentage used by Guest, which may
706              * make the dirty rate match the dirty rate threshold. */
707             cpu_now = 100 - throttle_now;
708             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
709                         bytes_dirty_period);
710             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
711         }
712         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
713     }
714 }
715 
716 void mig_throttle_counter_reset(void)
717 {
718     RAMState *rs = ram_state;
719 
720     rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
721     rs->num_dirty_pages_period = 0;
722     rs->bytes_xfer_prev = ram_counters.transferred;
723 }
724 
725 /**
726  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
727  *
728  * @rs: current RAM state
729  * @current_addr: address for the zero page
730  *
731  * Update the xbzrle cache to reflect a page that's been sent as all 0.
732  * The important thing is that a stale (not-yet-0'd) page be replaced
733  * by the new data.
734  * As a bonus, if the page wasn't in the cache it gets added so that
735  * when a small write is made into the 0'd page it gets XBZRLE sent.
736  */
737 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
738 {
739     if (!rs->xbzrle_enabled) {
740         return;
741     }
742 
743     /* We don't care if this fails to allocate a new cache page
744      * as long as it updated an old one */
745     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
746                  ram_counters.dirty_sync_count);
747 }
748 
/* Encoding byte that save_xbzrle_page() writes right after the page header */
#define ENCODING_FLAG_XBZRLE 0x1
750 
751 /**
752  * save_xbzrle_page: compress and send current page
753  *
754  * Returns: 1 means that we wrote the page
755  *          0 means that page is identical to the one already sent
756  *          -1 means that xbzrle would be longer than normal
757  *
758  * @rs: current RAM state
759  * @current_data: pointer to the address of the page contents
760  * @current_addr: addr of the page
761  * @block: block that contains the page we want to send
762  * @offset: offset inside the block for the page
763  */
764 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
765                             ram_addr_t current_addr, RAMBlock *block,
766                             ram_addr_t offset)
767 {
768     int encoded_len = 0, bytes_xbzrle;
769     uint8_t *prev_cached_page;
770 
771     if (!cache_is_cached(XBZRLE.cache, current_addr,
772                          ram_counters.dirty_sync_count)) {
773         xbzrle_counters.cache_miss++;
774         if (!rs->last_stage) {
775             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
776                              ram_counters.dirty_sync_count) == -1) {
777                 return -1;
778             } else {
779                 /* update *current_data when the page has been
780                    inserted into cache */
781                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
782             }
783         }
784         return -1;
785     }
786 
787     /*
788      * Reaching here means the page has hit the xbzrle cache, no matter what
789      * encoding result it is (normal encoding, overflow or skipping the page),
790      * count the page as encoded. This is used to calculate the encoding rate.
791      *
792      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
793      * 2nd page turns out to be skipped (i.e. no new bytes written to the
794      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
795      * skipped page included. In this way, the encoding rate can tell if the
796      * guest page is good for xbzrle encoding.
797      */
798     xbzrle_counters.pages++;
799     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
800 
801     /* save current buffer into memory */
802     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
803 
804     /* XBZRLE encoding (if there is no overflow) */
805     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
806                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
807                                        TARGET_PAGE_SIZE);
808 
809     /*
810      * Update the cache contents, so that it corresponds to the data
811      * sent, in all cases except where we skip the page.
812      */
813     if (!rs->last_stage && encoded_len != 0) {
814         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
815         /*
816          * In the case where we couldn't compress, ensure that the caller
817          * sends the data from the cache, since the guest might have
818          * changed the RAM since we copied it.
819          */
820         *current_data = prev_cached_page;
821     }
822 
823     if (encoded_len == 0) {
824         trace_save_xbzrle_page_skipping();
825         return 0;
826     } else if (encoded_len == -1) {
827         trace_save_xbzrle_page_overflow();
828         xbzrle_counters.overflow++;
829         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
830         return -1;
831     }
832 
833     /* Send XBZRLE based compressed page */
834     bytes_xbzrle = save_page_header(rs, rs->f, block,
835                                     offset | RAM_SAVE_FLAG_XBZRLE);
836     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
837     qemu_put_be16(rs->f, encoded_len);
838     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
839     bytes_xbzrle += encoded_len + 1 + 2;
840     /*
841      * Like compressed_size (please see update_compress_thread_counts),
842      * the xbzrle encoded bytes don't count the 8 byte header with
843      * RAM_SAVE_FLAG_CONTINUE.
844      */
845     xbzrle_counters.bytes += bytes_xbzrle - 8;
846     ram_transferred_add(bytes_xbzrle);
847 
848     return 1;
849 }
850 
851 /**
852  * migration_bitmap_find_dirty: find the next dirty page from start
853  *
854  * Returns the page offset within memory region of the start of a dirty page
855  *
856  * @rs: current RAM state
857  * @rb: RAMBlock where to search for dirty pages
858  * @start: page where we start the search
859  */
860 static inline
861 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
862                                           unsigned long start)
863 {
864     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
865     unsigned long *bitmap = rb->bmap;
866 
867     if (ramblock_is_ignored(rb)) {
868         return size;
869     }
870 
871     return find_next_bit(bitmap, size, start);
872 }
873 
874 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
875                                                        unsigned long page)
876 {
877     uint8_t shift;
878     hwaddr size, start;
879 
880     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
881         return;
882     }
883 
884     shift = rb->clear_bmap_shift;
885     /*
886      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
887      * can make things easier sometimes since then start address
888      * of the small chunk will always be 64 pages aligned so the
889      * bitmap will always be aligned to unsigned long. We should
890      * even be able to remove this restriction but I'm simply
891      * keeping it.
892      */
893     assert(shift >= 6);
894 
895     size = 1ULL << (TARGET_PAGE_BITS + shift);
896     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
897     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
898     memory_region_clear_dirty_bitmap(rb->mr, start, size);
899 }
900 
901 static void
902 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
903                                                  unsigned long start,
904                                                  unsigned long npages)
905 {
906     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
907     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
908     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
909 
910     /*
911      * Clear pages from start to start + npages - 1, so the end boundary is
912      * exclusive.
913      */
914     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
915         migration_clear_memory_region_dirty_bitmap(rb, i);
916     }
917 }
918 
919 /*
920  * colo_bitmap_find_diry:find contiguous dirty pages from start
921  *
922  * Returns the page offset within memory region of the start of the contiguout
923  * dirty page
924  *
925  * @rs: current RAM state
926  * @rb: RAMBlock where to search for dirty pages
927  * @start: page where we start the search
928  * @num: the number of contiguous dirty pages
929  */
930 static inline
931 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
932                                      unsigned long start, unsigned long *num)
933 {
934     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
935     unsigned long *bitmap = rb->bmap;
936     unsigned long first, next;
937 
938     *num = 0;
939 
940     if (ramblock_is_ignored(rb)) {
941         return size;
942     }
943 
944     first = find_next_bit(bitmap, size, start);
945     if (first >= size) {
946         return first;
947     }
948     next = find_next_zero_bit(bitmap, size, first + 1);
949     assert(next >= first);
950     *num = next - first;
951     return first;
952 }
953 
954 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
955                                                 RAMBlock *rb,
956                                                 unsigned long page)
957 {
958     bool ret;
959 
960     /*
961      * Clear dirty bitmap if needed.  This _must_ be called before we
962      * send any of the page in the chunk because we need to make sure
963      * we can capture further page content changes when we sync dirty
964      * log the next time.  So as long as we are going to send any of
965      * the page in the chunk we clear the remote dirty bitmap for all.
966      * Clearing it earlier won't be a problem, but too late will.
967      */
968     migration_clear_memory_region_dirty_bitmap(rb, page);
969 
970     ret = test_and_clear_bit(page, rb->bmap);
971     if (ret) {
972         rs->migration_dirty_pages--;
973     }
974 
975     return ret;
976 }
977 
978 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
979                                        void *opaque)
980 {
981     const hwaddr offset = section->offset_within_region;
982     const hwaddr size = int128_get64(section->size);
983     const unsigned long start = offset >> TARGET_PAGE_BITS;
984     const unsigned long npages = size >> TARGET_PAGE_BITS;
985     RAMBlock *rb = section->mr->ram_block;
986     uint64_t *cleared_bits = opaque;
987 
988     /*
989      * We don't grab ram_state->bitmap_mutex because we expect to run
990      * only when starting migration or during postcopy recovery where
991      * we don't have concurrent access.
992      */
993     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
994         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
995     }
996     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
997     bitmap_clear(rb->bmap, start, npages);
998 }
999 
1000 /*
1001  * Exclude all dirty pages from migration that fall into a discarded range as
1002  * managed by a RamDiscardManager responsible for the mapped memory region of
1003  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1004  *
1005  * Discarded pages ("logically unplugged") have undefined content and must
1006  * not get migrated, because even reading these pages for migration might
1007  * result in undesired behavior.
1008  *
1009  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1010  *
1011  * Note: The result is only stable while migrating (precopy/postcopy).
1012  */
1013 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1014 {
1015     uint64_t cleared_bits = 0;
1016 
1017     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1018         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1019         MemoryRegionSection section = {
1020             .mr = rb->mr,
1021             .offset_within_region = 0,
1022             .size = int128_make64(qemu_ram_get_used_length(rb)),
1023         };
1024 
1025         ram_discard_manager_replay_discarded(rdm, &section,
1026                                              dirty_bitmap_clear_section,
1027                                              &cleared_bits);
1028     }
1029     return cleared_bits;
1030 }
1031 
1032 /*
1033  * Check if a host-page aligned page falls into a discarded range as managed by
1034  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1035  *
1036  * Note: The result is only stable while migrating (precopy/postcopy).
1037  */
1038 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1039 {
1040     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1041         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1042         MemoryRegionSection section = {
1043             .mr = rb->mr,
1044             .offset_within_region = start,
1045             .size = int128_make64(qemu_ram_pagesize(rb)),
1046         };
1047 
1048         return !ram_discard_manager_is_populated(rdm, &section);
1049     }
1050     return false;
1051 }
1052 
1053 /* Called with RCU critical section */
1054 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1055 {
1056     uint64_t new_dirty_pages =
1057         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1058 
1059     rs->migration_dirty_pages += new_dirty_pages;
1060     rs->num_dirty_pages_period += new_dirty_pages;
1061 }
1062 
1063 /**
1064  * ram_pagesize_summary: calculate all the pagesizes of a VM
1065  *
1066  * Returns a summary bitmap of the page sizes of all RAMBlocks
1067  *
1068  * For VMs with just normal pages this is equivalent to the host page
1069  * size. If it's got some huge pages then it's the OR of all the
1070  * different page sizes.
1071  */
1072 uint64_t ram_pagesize_summary(void)
1073 {
1074     RAMBlock *block;
1075     uint64_t summary = 0;
1076 
1077     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1078         summary |= block->page_size;
1079     }
1080 
1081     return summary;
1082 }
1083 
1084 uint64_t ram_get_total_transferred_pages(void)
1085 {
1086     return  ram_counters.normal + ram_counters.duplicate +
1087                 compression_counters.pages + xbzrle_counters.pages;
1088 }
1089 
1090 static void migration_update_rates(RAMState *rs, int64_t end_time)
1091 {
1092     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1093     double compressed_size;
1094 
1095     /* calculate period counters */
1096     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1097                 / (end_time - rs->time_last_bitmap_sync);
1098 
1099     if (!page_count) {
1100         return;
1101     }
1102 
1103     if (migrate_use_xbzrle()) {
1104         double encoded_size, unencoded_size;
1105 
1106         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1107             rs->xbzrle_cache_miss_prev) / page_count;
1108         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1109         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1110                          TARGET_PAGE_SIZE;
1111         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1112         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1113             xbzrle_counters.encoding_rate = 0;
1114         } else {
1115             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1116         }
1117         rs->xbzrle_pages_prev = xbzrle_counters.pages;
1118         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1119     }
1120 
1121     if (migrate_use_compression()) {
1122         compression_counters.busy_rate = (double)(compression_counters.busy -
1123             rs->compress_thread_busy_prev) / page_count;
1124         rs->compress_thread_busy_prev = compression_counters.busy;
1125 
1126         compressed_size = compression_counters.compressed_size -
1127                           rs->compressed_size_prev;
1128         if (compressed_size) {
1129             double uncompressed_size = (compression_counters.pages -
1130                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1131 
1132             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1133             compression_counters.compression_rate =
1134                                         uncompressed_size / compressed_size;
1135 
1136             rs->compress_pages_prev = compression_counters.pages;
1137             rs->compressed_size_prev = compression_counters.compressed_size;
1138         }
1139     }
1140 }
1141 
1142 static void migration_trigger_throttle(RAMState *rs)
1143 {
1144     MigrationState *s = migrate_get_current();
1145     uint64_t threshold = s->parameters.throttle_trigger_threshold;
1146 
1147     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1148     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1149     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1150 
1151     /* During block migration the auto-converge logic incorrectly detects
1152      * that ram migration makes no progress. Avoid this by disabling the
1153      * throttling logic during the bulk phase of block migration. */
1154     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1155         /* The following detection logic can be refined later. For now:
1156            Check to see if the ratio between dirtied bytes and the approx.
1157            amount of bytes that just got transferred since the last time
1158            we were in this routine reaches the threshold. If that happens
1159            twice, start or increase throttling. */
1160 
1161         if ((bytes_dirty_period > bytes_dirty_threshold) &&
1162             (++rs->dirty_rate_high_cnt >= 2)) {
1163             trace_migration_throttle();
1164             rs->dirty_rate_high_cnt = 0;
1165             mig_throttle_guest_down(bytes_dirty_period,
1166                                     bytes_dirty_threshold);
1167         }
1168     }
1169 }
1170 
1171 static void migration_bitmap_sync(RAMState *rs)
1172 {
1173     RAMBlock *block;
1174     int64_t end_time;
1175 
1176     ram_counters.dirty_sync_count++;
1177 
1178     if (!rs->time_last_bitmap_sync) {
1179         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1180     }
1181 
1182     trace_migration_bitmap_sync_start();
1183     memory_global_dirty_log_sync();
1184 
1185     qemu_mutex_lock(&rs->bitmap_mutex);
1186     WITH_RCU_READ_LOCK_GUARD() {
1187         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1188             ramblock_sync_dirty_bitmap(rs, block);
1189         }
1190         ram_counters.remaining = ram_bytes_remaining();
1191     }
1192     qemu_mutex_unlock(&rs->bitmap_mutex);
1193 
1194     memory_global_after_dirty_log_sync();
1195     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1196 
1197     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1198 
1199     /* more than 1 second = 1000 millisecons */
1200     if (end_time > rs->time_last_bitmap_sync + 1000) {
1201         migration_trigger_throttle(rs);
1202 
1203         migration_update_rates(rs, end_time);
1204 
1205         rs->target_page_count_prev = rs->target_page_count;
1206 
1207         /* reset period counters */
1208         rs->time_last_bitmap_sync = end_time;
1209         rs->num_dirty_pages_period = 0;
1210         rs->bytes_xfer_prev = ram_counters.transferred;
1211     }
1212     if (migrate_use_events()) {
1213         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1214     }
1215 }
1216 
1217 static void migration_bitmap_sync_precopy(RAMState *rs)
1218 {
1219     Error *local_err = NULL;
1220 
1221     /*
1222      * The current notifier usage is just an optimization to migration, so we
1223      * don't stop the normal migration process in the error case.
1224      */
1225     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1226         error_report_err(local_err);
1227         local_err = NULL;
1228     }
1229 
1230     migration_bitmap_sync(rs);
1231 
1232     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1233         error_report_err(local_err);
1234     }
1235 }
1236 
1237 static void ram_release_page(const char *rbname, uint64_t offset)
1238 {
1239     if (!migrate_release_ram() || !migration_in_postcopy()) {
1240         return;
1241     }
1242 
1243     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1244 }
1245 
1246 /**
1247  * save_zero_page_to_file: send the zero page to the file
1248  *
1249  * Returns the size of data written to the file, 0 means the page is not
1250  * a zero page
1251  *
1252  * @rs: current RAM state
1253  * @file: the file where the data is saved
1254  * @block: block that contains the page we want to send
1255  * @offset: offset inside the block for the page
1256  */
1257 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1258                                   RAMBlock *block, ram_addr_t offset)
1259 {
1260     uint8_t *p = block->host + offset;
1261     int len = 0;
1262 
1263     if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1264         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1265         qemu_put_byte(file, 0);
1266         len += 1;
1267         ram_release_page(block->idstr, offset);
1268     }
1269     return len;
1270 }
1271 
1272 /**
1273  * save_zero_page: send the zero page to the stream
1274  *
1275  * Returns the number of pages written.
1276  *
1277  * @rs: current RAM state
1278  * @block: block that contains the page we want to send
1279  * @offset: offset inside the block for the page
1280  */
1281 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1282 {
1283     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1284 
1285     if (len) {
1286         ram_counters.duplicate++;
1287         ram_transferred_add(len);
1288         return 1;
1289     }
1290     return -1;
1291 }
1292 
1293 /*
1294  * @pages: the number of pages written by the control path,
1295  *        < 0 - error
1296  *        > 0 - number of pages written
1297  *
1298  * Return true if the pages has been saved, otherwise false is returned.
1299  */
1300 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1301                               int *pages)
1302 {
1303     uint64_t bytes_xmit = 0;
1304     int ret;
1305 
1306     *pages = -1;
1307     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1308                                 &bytes_xmit);
1309     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1310         return false;
1311     }
1312 
1313     if (bytes_xmit) {
1314         ram_transferred_add(bytes_xmit);
1315         *pages = 1;
1316     }
1317 
1318     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1319         return true;
1320     }
1321 
1322     if (bytes_xmit > 0) {
1323         ram_counters.normal++;
1324     } else if (bytes_xmit == 0) {
1325         ram_counters.duplicate++;
1326     }
1327 
1328     return true;
1329 }
1330 
1331 /*
1332  * directly send the page to the stream
1333  *
1334  * Returns the number of pages written.
1335  *
1336  * @rs: current RAM state
1337  * @block: block that contains the page we want to send
1338  * @offset: offset inside the block for the page
1339  * @buf: the page to be sent
1340  * @async: send to page asyncly
1341  */
1342 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1343                             uint8_t *buf, bool async)
1344 {
1345     ram_transferred_add(save_page_header(rs, rs->f, block,
1346                                          offset | RAM_SAVE_FLAG_PAGE));
1347     if (async) {
1348         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1349                               migrate_release_ram() &&
1350                               migration_in_postcopy());
1351     } else {
1352         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1353     }
1354     ram_transferred_add(TARGET_PAGE_SIZE);
1355     ram_counters.normal++;
1356     return 1;
1357 }
1358 
1359 /**
1360  * ram_save_page: send the given page to the stream
1361  *
1362  * Returns the number of pages written.
1363  *          < 0 - error
1364  *          >=0 - Number of pages written - this might legally be 0
1365  *                if xbzrle noticed the page was the same.
1366  *
1367  * @rs: current RAM state
1368  * @block: block that contains the page we want to send
1369  * @offset: offset inside the block for the page
1370  */
1371 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1372 {
1373     int pages = -1;
1374     uint8_t *p;
1375     bool send_async = true;
1376     RAMBlock *block = pss->block;
1377     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1378     ram_addr_t current_addr = block->offset + offset;
1379 
1380     p = block->host + offset;
1381     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1382 
1383     XBZRLE_cache_lock();
1384     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1385         pages = save_xbzrle_page(rs, &p, current_addr, block,
1386                                  offset);
1387         if (!rs->last_stage) {
1388             /* Can't send this cached data async, since the cache page
1389              * might get updated before it gets to the wire
1390              */
1391             send_async = false;
1392         }
1393     }
1394 
1395     /* XBZRLE overflow or normal page */
1396     if (pages == -1) {
1397         pages = save_normal_page(rs, block, offset, p, send_async);
1398     }
1399 
1400     XBZRLE_cache_unlock();
1401 
1402     return pages;
1403 }
1404 
1405 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1406                                  ram_addr_t offset)
1407 {
1408     if (multifd_queue_page(rs->f, block, offset) < 0) {
1409         return -1;
1410     }
1411     ram_counters.normal++;
1412 
1413     return 1;
1414 }
1415 
1416 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1417                                  ram_addr_t offset, uint8_t *source_buf)
1418 {
1419     RAMState *rs = ram_state;
1420     uint8_t *p = block->host + offset;
1421     int ret;
1422 
1423     if (save_zero_page_to_file(rs, f, block, offset)) {
1424         return true;
1425     }
1426 
1427     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1428 
1429     /*
1430      * copy it to a internal buffer to avoid it being modified by VM
1431      * so that we can catch up the error during compression and
1432      * decompression
1433      */
1434     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1435     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1436     if (ret < 0) {
1437         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1438         error_report("compressed data failed!");
1439     }
1440     return false;
1441 }
1442 
1443 static void
1444 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1445 {
1446     ram_transferred_add(bytes_xmit);
1447 
1448     if (param->zero_page) {
1449         ram_counters.duplicate++;
1450         return;
1451     }
1452 
1453     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1454     compression_counters.compressed_size += bytes_xmit - 8;
1455     compression_counters.pages++;
1456 }
1457 
1458 static bool save_page_use_compression(RAMState *rs);
1459 
1460 static void flush_compressed_data(RAMState *rs)
1461 {
1462     int idx, len, thread_count;
1463 
1464     if (!save_page_use_compression(rs)) {
1465         return;
1466     }
1467     thread_count = migrate_compress_threads();
1468 
1469     qemu_mutex_lock(&comp_done_lock);
1470     for (idx = 0; idx < thread_count; idx++) {
1471         while (!comp_param[idx].done) {
1472             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1473         }
1474     }
1475     qemu_mutex_unlock(&comp_done_lock);
1476 
1477     for (idx = 0; idx < thread_count; idx++) {
1478         qemu_mutex_lock(&comp_param[idx].mutex);
1479         if (!comp_param[idx].quit) {
1480             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1481             /*
1482              * it's safe to fetch zero_page without holding comp_done_lock
1483              * as there is no further request submitted to the thread,
1484              * i.e, the thread should be waiting for a request at this point.
1485              */
1486             update_compress_thread_counts(&comp_param[idx], len);
1487         }
1488         qemu_mutex_unlock(&comp_param[idx].mutex);
1489     }
1490 }
1491 
1492 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1493                                        ram_addr_t offset)
1494 {
1495     param->block = block;
1496     param->offset = offset;
1497 }
1498 
1499 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1500                                            ram_addr_t offset)
1501 {
1502     int idx, thread_count, bytes_xmit = -1, pages = -1;
1503     bool wait = migrate_compress_wait_thread();
1504 
1505     thread_count = migrate_compress_threads();
1506     qemu_mutex_lock(&comp_done_lock);
1507 retry:
1508     for (idx = 0; idx < thread_count; idx++) {
1509         if (comp_param[idx].done) {
1510             comp_param[idx].done = false;
1511             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1512             qemu_mutex_lock(&comp_param[idx].mutex);
1513             set_compress_params(&comp_param[idx], block, offset);
1514             qemu_cond_signal(&comp_param[idx].cond);
1515             qemu_mutex_unlock(&comp_param[idx].mutex);
1516             pages = 1;
1517             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1518             break;
1519         }
1520     }
1521 
1522     /*
1523      * wait for the free thread if the user specifies 'compress-wait-thread',
1524      * otherwise we will post the page out in the main thread as normal page.
1525      */
1526     if (pages < 0 && wait) {
1527         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1528         goto retry;
1529     }
1530     qemu_mutex_unlock(&comp_done_lock);
1531 
1532     return pages;
1533 }
1534 
1535 /**
1536  * find_dirty_block: find the next dirty page and update any state
1537  * associated with the search process.
1538  *
1539  * Returns true if a page is found
1540  *
1541  * @rs: current RAM state
1542  * @pss: data about the state of the current dirty page scan
1543  * @again: set to false if the search has scanned the whole of RAM
1544  */
1545 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1546 {
1547     /*
1548      * This is not a postcopy requested page, mark it "not urgent", and use
1549      * precopy channel to send it.
1550      */
1551     pss->postcopy_requested = false;
1552     pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
1553 
1554     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1555     if (pss->complete_round && pss->block == rs->last_seen_block &&
1556         pss->page >= rs->last_page) {
1557         /*
1558          * We've been once around the RAM and haven't found anything.
1559          * Give up.
1560          */
1561         *again = false;
1562         return false;
1563     }
1564     if (!offset_in_ramblock(pss->block,
1565                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1566         /* Didn't find anything in this RAM Block */
1567         pss->page = 0;
1568         pss->block = QLIST_NEXT_RCU(pss->block, next);
1569         if (!pss->block) {
1570             /*
1571              * If memory migration starts over, we will meet a dirtied page
1572              * which may still exists in compression threads's ring, so we
1573              * should flush the compressed data to make sure the new page
1574              * is not overwritten by the old one in the destination.
1575              *
1576              * Also If xbzrle is on, stop using the data compression at this
1577              * point. In theory, xbzrle can do better than compression.
1578              */
1579             flush_compressed_data(rs);
1580 
1581             /* Hit the end of the list */
1582             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1583             /* Flag that we've looped */
1584             pss->complete_round = true;
1585             /* After the first round, enable XBZRLE. */
1586             if (migrate_use_xbzrle()) {
1587                 rs->xbzrle_enabled = true;
1588             }
1589         }
1590         /* Didn't find anything this time, but try again on the new block */
1591         *again = true;
1592         return false;
1593     } else {
1594         /* Can go around again, but... */
1595         *again = true;
1596         /* We've found something so probably don't need to */
1597         return true;
1598     }
1599 }
1600 
1601 /**
1602  * unqueue_page: gets a page of the queue
1603  *
1604  * Helper for 'get_queued_page' - gets a page off the queue
1605  *
1606  * Returns the block of the page (or NULL if none available)
1607  *
1608  * @rs: current RAM state
1609  * @offset: used to return the offset within the RAMBlock
1610  */
1611 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1612 {
1613     struct RAMSrcPageRequest *entry;
1614     RAMBlock *block = NULL;
1615 
1616     if (!postcopy_has_request(rs)) {
1617         return NULL;
1618     }
1619 
1620     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1621 
1622     /*
1623      * This should _never_ change even after we take the lock, because no one
1624      * should be taking anything off the request list other than us.
1625      */
1626     assert(postcopy_has_request(rs));
1627 
1628     entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1629     block = entry->rb;
1630     *offset = entry->offset;
1631 
1632     if (entry->len > TARGET_PAGE_SIZE) {
1633         entry->len -= TARGET_PAGE_SIZE;
1634         entry->offset += TARGET_PAGE_SIZE;
1635     } else {
1636         memory_region_unref(block->mr);
1637         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1638         g_free(entry);
1639         migration_consume_urgent_request();
1640     }
1641 
1642     return block;
1643 }
1644 
1645 #if defined(__linux__)
1646 /**
1647  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1648  *   is found, return RAM block pointer and page offset
1649  *
1650  * Returns pointer to the RAMBlock containing faulting page,
1651  *   NULL if no write faults are pending
1652  *
1653  * @rs: current RAM state
1654  * @offset: page offset from the beginning of the block
1655  */
1656 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1657 {
1658     struct uffd_msg uffd_msg;
1659     void *page_address;
1660     RAMBlock *block;
1661     int res;
1662 
1663     if (!migrate_background_snapshot()) {
1664         return NULL;
1665     }
1666 
1667     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1668     if (res <= 0) {
1669         return NULL;
1670     }
1671 
1672     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1673     block = qemu_ram_block_from_host(page_address, false, offset);
1674     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1675     return block;
1676 }
1677 
1678 /**
1679  * ram_save_release_protection: release UFFD write protection after
1680  *   a range of pages has been saved
1681  *
1682  * @rs: current RAM state
1683  * @pss: page-search-status structure
1684  * @start_page: index of the first page in the range relative to pss->block
1685  *
1686  * Returns 0 on success, negative value in case of an error
1687 */
1688 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1689         unsigned long start_page)
1690 {
1691     int res = 0;
1692 
1693     /* Check if page is from UFFD-managed region. */
1694     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1695         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1696         uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1697 
1698         /* Flush async buffers before un-protect. */
1699         qemu_fflush(rs->f);
1700         /* Un-protect memory range. */
1701         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1702                 false, false);
1703     }
1704 
1705     return res;
1706 }
1707 
1708 /* ram_write_tracking_available: check if kernel supports required UFFD features
1709  *
1710  * Returns true if supports, false otherwise
1711  */
1712 bool ram_write_tracking_available(void)
1713 {
1714     uint64_t uffd_features;
1715     int res;
1716 
1717     res = uffd_query_features(&uffd_features);
1718     return (res == 0 &&
1719             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1720 }
1721 
1722 /* ram_write_tracking_compatible: check if guest configuration is
1723  *   compatible with 'write-tracking'
1724  *
1725  * Returns true if compatible, false otherwise
1726  */
1727 bool ram_write_tracking_compatible(void)
1728 {
1729     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1730     int uffd_fd;
1731     RAMBlock *block;
1732     bool ret = false;
1733 
1734     /* Open UFFD file descriptor */
1735     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1736     if (uffd_fd < 0) {
1737         return false;
1738     }
1739 
1740     RCU_READ_LOCK_GUARD();
1741 
1742     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1743         uint64_t uffd_ioctls;
1744 
1745         /* Nothing to do with read-only and MMIO-writable regions */
1746         if (block->mr->readonly || block->mr->rom_device) {
1747             continue;
1748         }
1749         /* Try to register block memory via UFFD-IO to track writes */
1750         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1751                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1752             goto out;
1753         }
1754         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1755             goto out;
1756         }
1757     }
1758     ret = true;
1759 
1760 out:
1761     uffd_close_fd(uffd_fd);
1762     return ret;
1763 }
1764 
1765 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1766                                        ram_addr_t size)
1767 {
1768     /*
1769      * We read one byte of each page; this will preallocate page tables if
1770      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1771      * where no page was populated yet. This might require adaption when
1772      * supporting other mappings, like shmem.
1773      */
1774     for (; offset < size; offset += block->page_size) {
1775         char tmp = *((char *)block->host + offset);
1776 
1777         /* Don't optimize the read out */
1778         asm volatile("" : "+r" (tmp));
1779     }
1780 }
1781 
1782 static inline int populate_read_section(MemoryRegionSection *section,
1783                                         void *opaque)
1784 {
1785     const hwaddr size = int128_get64(section->size);
1786     hwaddr offset = section->offset_within_region;
1787     RAMBlock *block = section->mr->ram_block;
1788 
1789     populate_read_range(block, offset, size);
1790     return 0;
1791 }
1792 
1793 /*
1794  * ram_block_populate_read: preallocate page tables and populate pages in the
1795  *   RAM block by reading a byte of each page.
1796  *
1797  * Since it's solely used for userfault_fd WP feature, here we just
1798  *   hardcode page size to qemu_real_host_page_size.
1799  *
1800  * @block: RAM block to populate
1801  */
1802 static void ram_block_populate_read(RAMBlock *rb)
1803 {
1804     /*
1805      * Skip populating all pages that fall into a discarded range as managed by
1806      * a RamDiscardManager responsible for the mapped memory region of the
1807      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1808      * must not get populated automatically. We don't have to track
1809      * modifications via userfaultfd WP reliably, because these pages will
1810      * not be part of the migration stream either way -- see
1811      * ramblock_dirty_bitmap_exclude_discarded_pages().
1812      *
1813      * Note: The result is only stable while migrating (precopy/postcopy).
1814      */
1815     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1816         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1817         MemoryRegionSection section = {
1818             .mr = rb->mr,
1819             .offset_within_region = 0,
1820             .size = rb->mr->size,
1821         };
1822 
1823         ram_discard_manager_replay_populated(rdm, &section,
1824                                              populate_read_section, NULL);
1825     } else {
1826         populate_read_range(rb, 0, rb->used_length);
1827     }
1828 }
1829 
1830 /*
1831  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1832  */
1833 void ram_write_tracking_prepare(void)
1834 {
1835     RAMBlock *block;
1836 
1837     RCU_READ_LOCK_GUARD();
1838 
1839     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1840         /* Nothing to do with read-only and MMIO-writable regions */
1841         if (block->mr->readonly || block->mr->rom_device) {
1842             continue;
1843         }
1844 
1845         /*
1846          * Populate pages of the RAM block before enabling userfault_fd
1847          * write protection.
1848          *
1849          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1850          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1851          * pages with pte_none() entries in page table.
1852          */
1853         ram_block_populate_read(block);
1854     }
1855 }
1856 
1857 /*
1858  * ram_write_tracking_start: start UFFD-WP memory tracking
1859  *
1860  * Returns 0 for success or negative value in case of error
1861  */
1862 int ram_write_tracking_start(void)
1863 {
1864     int uffd_fd;
1865     RAMState *rs = ram_state;
1866     RAMBlock *block;
1867 
1868     /* Open UFFD file descriptor */
1869     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1870     if (uffd_fd < 0) {
1871         return uffd_fd;
1872     }
1873     rs->uffdio_fd = uffd_fd;
1874 
1875     RCU_READ_LOCK_GUARD();
1876 
1877     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1878         /* Nothing to do with read-only and MMIO-writable regions */
1879         if (block->mr->readonly || block->mr->rom_device) {
1880             continue;
1881         }
1882 
1883         /* Register block memory with UFFD to track writes */
1884         if (uffd_register_memory(rs->uffdio_fd, block->host,
1885                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1886             goto fail;
1887         }
1888         /* Apply UFFD write protection to the block memory range */
1889         if (uffd_change_protection(rs->uffdio_fd, block->host,
1890                 block->max_length, true, false)) {
1891             goto fail;
1892         }
1893         block->flags |= RAM_UF_WRITEPROTECT;
1894         memory_region_ref(block->mr);
1895 
1896         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1897                 block->host, block->max_length);
1898     }
1899 
1900     return 0;
1901 
1902 fail:
1903     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1904 
1905     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1906         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1907             continue;
1908         }
1909         /*
1910          * In case some memory block failed to be write-protected
1911          * remove protection and unregister all succeeded RAM blocks
1912          */
1913         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1914                 false, false);
1915         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1916         /* Cleanup flags and remove reference */
1917         block->flags &= ~RAM_UF_WRITEPROTECT;
1918         memory_region_unref(block->mr);
1919     }
1920 
1921     uffd_close_fd(uffd_fd);
1922     rs->uffdio_fd = -1;
1923     return -1;
1924 }
1925 
1926 /**
1927  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1928  */
1929 void ram_write_tracking_stop(void)
1930 {
1931     RAMState *rs = ram_state;
1932     RAMBlock *block;
1933 
1934     RCU_READ_LOCK_GUARD();
1935 
1936     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1937         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1938             continue;
1939         }
1940         /* Remove protection and unregister all affected RAM blocks */
1941         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1942                 false, false);
1943         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1944 
1945         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1946                 block->host, block->max_length);
1947 
1948         /* Cleanup flags and remove reference */
1949         block->flags &= ~RAM_UF_WRITEPROTECT;
1950         memory_region_unref(block->mr);
1951     }
1952 
1953     /* Finally close UFFD file descriptor */
1954     uffd_close_fd(rs->uffdio_fd);
1955     rs->uffdio_fd = -1;
1956 }
1957 
1958 #else
1959 /* No target OS support, stubs just fail or ignore */
1960 
1961 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1962 {
1963     (void) rs;
1964     (void) offset;
1965 
1966     return NULL;
1967 }
1968 
1969 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1970         unsigned long start_page)
1971 {
1972     (void) rs;
1973     (void) pss;
1974     (void) start_page;
1975 
1976     return 0;
1977 }
1978 
/* Stub used when the target OS is not supported: tracking is never available */
bool ram_write_tracking_available(void)
{
    const bool available = false;

    return available;
}
1983 
/*
 * Stub used when the target OS is not supported. It is not expected to be
 * called at all: the assertion fires on any call. The return statement is
 * only reached when assertions are compiled out.
 */
bool ram_write_tracking_compatible(void)
{
    assert(0);
    return false;
}
1989 
/*
 * Stub used when the target OS is not supported. It is not expected to be
 * called at all: the assertion fires on any call. With assertions compiled
 * out it reports failure (-1).
 */
int ram_write_tracking_start(void)
{
    assert(0);
    return -1;
}
1995 
/*
 * Stub used when the target OS is not supported. It is not expected to be
 * called at all: the assertion fires on any call.
 */
void ram_write_tracking_stop(void)
{
    assert(0);
}
2000 #endif /* defined(__linux__) */
2001 
2002 /*
2003  * Check whether two addr/offset of the ramblock falls onto the same host huge
2004  * page.  Returns true if so, false otherwise.
2005  */
2006 static bool offset_on_same_huge_page(RAMBlock *rb, uint64_t addr1,
2007                                      uint64_t addr2)
2008 {
2009     size_t page_size = qemu_ram_pagesize(rb);
2010 
2011     addr1 = ROUND_DOWN(addr1, page_size);
2012     addr2 = ROUND_DOWN(addr2, page_size);
2013 
2014     return addr1 == addr2;
2015 }
2016 
2017 /*
2018  * Whether a previous preempted precopy huge page contains current requested
2019  * page?  Returns true if so, false otherwise.
2020  *
2021  * This should really happen very rarely, because it means when we were sending
2022  * during background migration for postcopy we're sending exactly the page that
2023  * some vcpu got faulted on on dest node.  When it happens, we probably don't
2024  * need to do much but drop the request, because we know right after we restore
2025  * the precopy stream it'll be serviced.  It'll slightly affect the order of
2026  * postcopy requests to be serviced (e.g. it'll be the same as we move current
2027  * request to the end of the queue) but it shouldn't be a big deal.  The most
2028  * imporant thing is we can _never_ try to send a partial-sent huge page on the
2029  * POSTCOPY channel again, otherwise that huge page will got "split brain" on
2030  * two channels (PRECOPY, POSTCOPY).
2031  */
2032 static bool postcopy_preempted_contains(RAMState *rs, RAMBlock *block,
2033                                         ram_addr_t offset)
2034 {
2035     PostcopyPreemptState *state = &rs->postcopy_preempt_state;
2036 
2037     /* No preemption at all? */
2038     if (!state->preempted) {
2039         return false;
2040     }
2041 
2042     /* Not even the same ramblock? */
2043     if (state->ram_block != block) {
2044         return false;
2045     }
2046 
2047     return offset_on_same_huge_page(block, offset,
2048                                     state->ram_page << TARGET_PAGE_BITS);
2049 }
2050 
2051 /**
2052  * get_queued_page: unqueue a page from the postcopy requests
2053  *
2054  * Skips pages that are already sent (!dirty)
2055  *
2056  * Returns true if a queued page is found
2057  *
2058  * @rs: current RAM state
2059  * @pss: data about the state of the current dirty page scan
2060  */
2061 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2062 {
2063     RAMBlock  *block;
2064     ram_addr_t offset;
2065     bool dirty;
2066 
2067     do {
2068         block = unqueue_page(rs, &offset);
2069         /*
2070          * We're sending this page, and since it's postcopy nothing else
2071          * will dirty it, and we must make sure it doesn't get sent again
2072          * even if this queue request was received after the background
2073          * search already sent it.
2074          */
2075         if (block) {
2076             unsigned long page;
2077 
2078             page = offset >> TARGET_PAGE_BITS;
2079             dirty = test_bit(page, block->bmap);
2080             if (!dirty) {
2081                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2082                                                 page);
2083             } else {
2084                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2085             }
2086         }
2087 
2088     } while (block && !dirty);
2089 
2090     if (block) {
2091         /* See comment above postcopy_preempted_contains() */
2092         if (postcopy_preempted_contains(rs, block, offset)) {
2093             trace_postcopy_preempt_hit(block->idstr, offset);
2094             /*
2095              * If what we preempted previously was exactly what we're
2096              * requesting right now, restore the preempted precopy
2097              * immediately, boosting its priority as it's requested by
2098              * postcopy.
2099              */
2100             postcopy_preempt_restore(rs, pss, true);
2101             return true;
2102         }
2103     } else {
2104         /*
2105          * Poll write faults too if background snapshot is enabled; that's
2106          * when we have vcpus got blocked by the write protected pages.
2107          */
2108         block = poll_fault_page(rs, &offset);
2109     }
2110 
2111     if (block) {
2112         /*
2113          * We want the background search to continue from the queued page
2114          * since the guest is likely to want other pages near to the page
2115          * it just requested.
2116          */
2117         pss->block = block;
2118         pss->page = offset >> TARGET_PAGE_BITS;
2119 
2120         /*
2121          * This unqueued page would break the "one round" check, even is
2122          * really rare.
2123          */
2124         pss->complete_round = false;
2125         /* Mark it an urgent request, meanwhile using POSTCOPY channel */
2126         pss->postcopy_requested = true;
2127         pss->postcopy_target_channel = RAM_CHANNEL_POSTCOPY;
2128     }
2129 
2130     return !!block;
2131 }
2132 
2133 /**
2134  * migration_page_queue_free: drop any remaining pages in the ram
2135  * request queue
2136  *
2137  * It should be empty at the end anyway, but in error cases there may
2138  * be some left.  in case that there is any page left, we drop it.
2139  *
2140  */
2141 static void migration_page_queue_free(RAMState *rs)
2142 {
2143     struct RAMSrcPageRequest *mspr, *next_mspr;
2144     /* This queue generally should be empty - but in the case of a failed
2145      * migration might have some droppings in.
2146      */
2147     RCU_READ_LOCK_GUARD();
2148     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2149         memory_region_unref(mspr->rb->mr);
2150         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2151         g_free(mspr);
2152     }
2153 }
2154 
2155 /**
2156  * ram_save_queue_pages: queue the page for transmission
2157  *
2158  * A request from postcopy destination for example.
2159  *
2160  * Returns zero on success or negative on error
2161  *
2162  * @rbname: Name of the RAMBLock of the request. NULL means the
2163  *          same that last one.
2164  * @start: starting address from the start of the RAMBlock
2165  * @len: length (in bytes) to send
2166  */
2167 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2168 {
2169     RAMBlock *ramblock;
2170     RAMState *rs = ram_state;
2171 
2172     ram_counters.postcopy_requests++;
2173     RCU_READ_LOCK_GUARD();
2174 
2175     if (!rbname) {
2176         /* Reuse last RAMBlock */
2177         ramblock = rs->last_req_rb;
2178 
2179         if (!ramblock) {
2180             /*
2181              * Shouldn't happen, we can't reuse the last RAMBlock if
2182              * it's the 1st request.
2183              */
2184             error_report("ram_save_queue_pages no previous block");
2185             return -1;
2186         }
2187     } else {
2188         ramblock = qemu_ram_block_by_name(rbname);
2189 
2190         if (!ramblock) {
2191             /* We shouldn't be asked for a non-existent RAMBlock */
2192             error_report("ram_save_queue_pages no block '%s'", rbname);
2193             return -1;
2194         }
2195         rs->last_req_rb = ramblock;
2196     }
2197     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2198     if (!offset_in_ramblock(ramblock, start + len - 1)) {
2199         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2200                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2201                      __func__, start, len, ramblock->used_length);
2202         return -1;
2203     }
2204 
2205     struct RAMSrcPageRequest *new_entry =
2206         g_new0(struct RAMSrcPageRequest, 1);
2207     new_entry->rb = ramblock;
2208     new_entry->offset = start;
2209     new_entry->len = len;
2210 
2211     memory_region_ref(ramblock->mr);
2212     qemu_mutex_lock(&rs->src_page_req_mutex);
2213     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2214     migration_make_urgent_request();
2215     qemu_mutex_unlock(&rs->src_page_req_mutex);
2216 
2217     return 0;
2218 }
2219 
2220 static bool save_page_use_compression(RAMState *rs)
2221 {
2222     if (!migrate_use_compression()) {
2223         return false;
2224     }
2225 
2226     /*
2227      * If xbzrle is enabled (e.g., after first round of migration), stop
2228      * using the data compression. In theory, xbzrle can do better than
2229      * compression.
2230      */
2231     if (rs->xbzrle_enabled) {
2232         return false;
2233     }
2234 
2235     return true;
2236 }
2237 
2238 /*
2239  * try to compress the page before posting it out, return true if the page
2240  * has been properly handled by compression, otherwise needs other
2241  * paths to handle it
2242  */
2243 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2244 {
2245     if (!save_page_use_compression(rs)) {
2246         return false;
2247     }
2248 
2249     /*
2250      * When starting the process of a new block, the first page of
2251      * the block should be sent out before other pages in the same
2252      * block, and all the pages in last block should have been sent
2253      * out, keeping this order is important, because the 'cont' flag
2254      * is used to avoid resending the block name.
2255      *
2256      * We post the fist page as normal page as compression will take
2257      * much CPU resource.
2258      */
2259     if (block != rs->last_sent_block) {
2260         flush_compressed_data(rs);
2261         return false;
2262     }
2263 
2264     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2265         return true;
2266     }
2267 
2268     compression_counters.busy++;
2269     return false;
2270 }
2271 
2272 /**
2273  * ram_save_target_page: save one target page
2274  *
2275  * Returns the number of pages written
2276  *
2277  * @rs: current RAM state
2278  * @pss: data about the page we want to send
2279  */
2280 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
2281 {
2282     RAMBlock *block = pss->block;
2283     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2284     int res;
2285 
2286     if (control_save_page(rs, block, offset, &res)) {
2287         return res;
2288     }
2289 
2290     if (save_compress_page(rs, block, offset)) {
2291         return 1;
2292     }
2293 
2294     res = save_zero_page(rs, block, offset);
2295     if (res > 0) {
2296         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2297          * page would be stale
2298          */
2299         if (!save_page_use_compression(rs)) {
2300             XBZRLE_cache_lock();
2301             xbzrle_cache_zero_page(rs, block->offset + offset);
2302             XBZRLE_cache_unlock();
2303         }
2304         return res;
2305     }
2306 
2307     /*
2308      * Do not use multifd for:
2309      * 1. Compression as the first page in the new block should be posted out
2310      *    before sending the compressed page
2311      * 2. In postcopy as one whole host page should be placed
2312      */
2313     if (!save_page_use_compression(rs) && migrate_use_multifd()
2314         && !migration_in_postcopy()) {
2315         return ram_save_multifd_page(rs, block, offset);
2316     }
2317 
2318     return ram_save_page(rs, pss);
2319 }
2320 
2321 static bool postcopy_needs_preempt(RAMState *rs, PageSearchStatus *pss)
2322 {
2323     MigrationState *ms = migrate_get_current();
2324 
2325     /* Not enabled eager preempt?  Then never do that. */
2326     if (!migrate_postcopy_preempt()) {
2327         return false;
2328     }
2329 
2330     /* If the user explicitly disabled breaking of huge page, skip */
2331     if (!ms->postcopy_preempt_break_huge) {
2332         return false;
2333     }
2334 
2335     /* If the ramblock we're sending is a small page?  Never bother. */
2336     if (qemu_ram_pagesize(pss->block) == TARGET_PAGE_SIZE) {
2337         return false;
2338     }
2339 
2340     /* Not in postcopy at all? */
2341     if (!migration_in_postcopy()) {
2342         return false;
2343     }
2344 
2345     /*
2346      * If we're already handling a postcopy request, don't preempt as this page
2347      * has got the same high priority.
2348      */
2349     if (pss->postcopy_requested) {
2350         return false;
2351     }
2352 
2353     /* If there's postcopy requests, then check it up! */
2354     return postcopy_has_request(rs);
2355 }
2356 
2357 /* Returns true if we preempted precopy, false otherwise */
2358 static void postcopy_do_preempt(RAMState *rs, PageSearchStatus *pss)
2359 {
2360     PostcopyPreemptState *p_state = &rs->postcopy_preempt_state;
2361 
2362     trace_postcopy_preempt_triggered(pss->block->idstr, pss->page);
2363 
2364     /*
2365      * Time to preempt precopy. Cache current PSS into preempt state, so that
2366      * after handling the postcopy pages we can recover to it.  We need to do
2367      * so because the dest VM will have partial of the precopy huge page kept
2368      * over in its tmp huge page caches; better move on with it when we can.
2369      */
2370     p_state->ram_block = pss->block;
2371     p_state->ram_page = pss->page;
2372     p_state->preempted = true;
2373 }
2374 
2375 /* Whether we're preempted by a postcopy request during sending a huge page */
2376 static bool postcopy_preempt_triggered(RAMState *rs)
2377 {
2378     return rs->postcopy_preempt_state.preempted;
2379 }
2380 
2381 static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
2382                                      bool postcopy_requested)
2383 {
2384     PostcopyPreemptState *state = &rs->postcopy_preempt_state;
2385 
2386     assert(state->preempted);
2387 
2388     pss->block = state->ram_block;
2389     pss->page = state->ram_page;
2390 
2391     /* Whether this is a postcopy request? */
2392     pss->postcopy_requested = postcopy_requested;
2393     /*
2394      * When restoring a preempted page, the old data resides in PRECOPY
2395      * slow channel, even if postcopy_requested is set.  So always use
2396      * PRECOPY channel here.
2397      */
2398     pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
2399 
2400     trace_postcopy_preempt_restored(pss->block->idstr, pss->page);
2401 
2402     /* Reset preempt state, most importantly, set preempted==false */
2403     postcopy_preempt_reset(rs);
2404 }
2405 
2406 static void postcopy_preempt_choose_channel(RAMState *rs, PageSearchStatus *pss)
2407 {
2408     MigrationState *s = migrate_get_current();
2409     unsigned int channel = pss->postcopy_target_channel;
2410     QEMUFile *next;
2411 
2412     if (channel != rs->postcopy_channel) {
2413         if (channel == RAM_CHANNEL_PRECOPY) {
2414             next = s->to_dst_file;
2415         } else {
2416             next = s->postcopy_qemufile_src;
2417         }
2418         /* Update and cache the current channel */
2419         rs->f = next;
2420         rs->postcopy_channel = channel;
2421 
2422         /*
2423          * If channel switched, reset last_sent_block since the old sent block
2424          * may not be on the same channel.
2425          */
2426         rs->last_sent_block = NULL;
2427 
2428         trace_postcopy_preempt_switch_channel(channel);
2429     }
2430 
2431     trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2432 }
2433 
2434 /* We need to make sure rs->f always points to the default channel elsewhere */
2435 static void postcopy_preempt_reset_channel(RAMState *rs)
2436 {
2437     if (migrate_postcopy_preempt() && migration_in_postcopy()) {
2438         rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
2439         rs->f = migrate_get_current()->to_dst_file;
2440         trace_postcopy_preempt_reset_channel();
2441     }
2442 }
2443 
2444 /**
2445  * ram_save_host_page: save a whole host page
2446  *
2447  * Starting at *offset send pages up to the end of the current host
2448  * page. It's valid for the initial offset to point into the middle of
2449  * a host page in which case the remainder of the hostpage is sent.
2450  * Only dirty target pages are sent. Note that the host page size may
2451  * be a huge page for this block.
2452  * The saving stops at the boundary of the used_length of the block
2453  * if the RAMBlock isn't a multiple of the host page size.
2454  *
2455  * Returns the number of pages written or negative on error
2456  *
2457  * @rs: current RAM state
2458  * @pss: data about the page we want to send
2459  */
2460 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2461 {
2462     int tmppages, pages = 0;
2463     size_t pagesize_bits =
2464         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2465     unsigned long hostpage_boundary =
2466         QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2467     unsigned long start_page = pss->page;
2468     int res;
2469 
2470     if (ramblock_is_ignored(pss->block)) {
2471         error_report("block %s should not be migrated !", pss->block->idstr);
2472         return 0;
2473     }
2474 
2475     if (migrate_postcopy_preempt() && migration_in_postcopy()) {
2476         postcopy_preempt_choose_channel(rs, pss);
2477     }
2478 
2479     do {
2480         if (postcopy_needs_preempt(rs, pss)) {
2481             postcopy_do_preempt(rs, pss);
2482             break;
2483         }
2484 
2485         /* Check the pages is dirty and if it is send it */
2486         if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2487             tmppages = ram_save_target_page(rs, pss);
2488             if (tmppages < 0) {
2489                 return tmppages;
2490             }
2491 
2492             pages += tmppages;
2493             /*
2494              * Allow rate limiting to happen in the middle of huge pages if
2495              * something is sent in the current iteration.
2496              */
2497             if (pagesize_bits > 1 && tmppages > 0) {
2498                 migration_rate_limit();
2499             }
2500         }
2501         pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2502     } while ((pss->page < hostpage_boundary) &&
2503              offset_in_ramblock(pss->block,
2504                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2505     /* The offset we leave with is the min boundary of host page and block */
2506     pss->page = MIN(pss->page, hostpage_boundary);
2507 
2508     /*
2509      * When with postcopy preempt mode, flush the data as soon as possible for
2510      * postcopy requests, because we've already sent a whole huge page, so the
2511      * dst node should already have enough resource to atomically filling in
2512      * the current missing page.
2513      *
2514      * More importantly, when using separate postcopy channel, we must do
2515      * explicit flush or it won't flush until the buffer is full.
2516      */
2517     if (migrate_postcopy_preempt() && pss->postcopy_requested) {
2518         qemu_fflush(rs->f);
2519     }
2520 
2521     res = ram_save_release_protection(rs, pss, start_page);
2522     return (res < 0 ? res : pages);
2523 }
2524 
2525 /**
2526  * ram_find_and_save_block: finds a dirty page and sends it to f
2527  *
2528  * Called within an RCU critical section.
2529  *
2530  * Returns the number of pages written where zero means no dirty pages,
2531  * or negative on error
2532  *
2533  * @rs: current RAM state
2534  *
2535  * On systems where host-page-size > target-page-size it will send all the
2536  * pages in a host page that are dirty.
2537  */
2538 static int ram_find_and_save_block(RAMState *rs)
2539 {
2540     PageSearchStatus pss;
2541     int pages = 0;
2542     bool again, found;
2543 
2544     /* No dirty page as there is zero RAM */
2545     if (!ram_bytes_total()) {
2546         return pages;
2547     }
2548 
2549     pss.block = rs->last_seen_block;
2550     pss.page = rs->last_page;
2551     pss.complete_round = false;
2552 
2553     if (!pss.block) {
2554         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2555     }
2556 
2557     do {
2558         again = true;
2559         found = get_queued_page(rs, &pss);
2560 
2561         if (!found) {
2562             /*
2563              * Recover previous precopy ramblock/offset if postcopy has
2564              * preempted precopy.  Otherwise find the next dirty bit.
2565              */
2566             if (postcopy_preempt_triggered(rs)) {
2567                 postcopy_preempt_restore(rs, &pss, false);
2568                 found = true;
2569             } else {
2570                 /* priority queue empty, so just search for something dirty */
2571                 found = find_dirty_block(rs, &pss, &again);
2572             }
2573         }
2574 
2575         if (found) {
2576             pages = ram_save_host_page(rs, &pss);
2577         }
2578     } while (!pages && again);
2579 
2580     rs->last_seen_block = pss.block;
2581     rs->last_page = pss.page;
2582 
2583     return pages;
2584 }
2585 
2586 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2587 {
2588     uint64_t pages = size / TARGET_PAGE_SIZE;
2589 
2590     if (zero) {
2591         ram_counters.duplicate += pages;
2592     } else {
2593         ram_counters.normal += pages;
2594         ram_transferred_add(size);
2595         qemu_file_credit_transfer(f, size);
2596     }
2597 }
2598 
2599 static uint64_t ram_bytes_total_common(bool count_ignored)
2600 {
2601     RAMBlock *block;
2602     uint64_t total = 0;
2603 
2604     RCU_READ_LOCK_GUARD();
2605 
2606     if (count_ignored) {
2607         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2608             total += block->used_length;
2609         }
2610     } else {
2611         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2612             total += block->used_length;
2613         }
2614     }
2615     return total;
2616 }
2617 
/* Total used length of the RAM blocks, as summed with count_ignored == false */
uint64_t ram_bytes_total(void)
{
    const bool count_ignored = false;

    return ram_bytes_total_common(count_ignored);
}
2622 
2623 static void xbzrle_load_setup(void)
2624 {
2625     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2626 }
2627 
2628 static void xbzrle_load_cleanup(void)
2629 {
2630     g_free(XBZRLE.decoded_buf);
2631     XBZRLE.decoded_buf = NULL;
2632 }
2633 
2634 static void ram_state_cleanup(RAMState **rsp)
2635 {
2636     if (*rsp) {
2637         migration_page_queue_free(*rsp);
2638         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2639         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2640         g_free(*rsp);
2641         *rsp = NULL;
2642     }
2643 }
2644 
2645 static void xbzrle_cleanup(void)
2646 {
2647     XBZRLE_cache_lock();
2648     if (XBZRLE.cache) {
2649         cache_fini(XBZRLE.cache);
2650         g_free(XBZRLE.encoded_buf);
2651         g_free(XBZRLE.current_buf);
2652         g_free(XBZRLE.zero_target_page);
2653         XBZRLE.cache = NULL;
2654         XBZRLE.encoded_buf = NULL;
2655         XBZRLE.current_buf = NULL;
2656         XBZRLE.zero_target_page = NULL;
2657     }
2658     XBZRLE_cache_unlock();
2659 }
2660 
2661 static void ram_save_cleanup(void *opaque)
2662 {
2663     RAMState **rsp = opaque;
2664     RAMBlock *block;
2665 
2666     /* We don't use dirty log with background snapshots */
2667     if (!migrate_background_snapshot()) {
2668         /* caller have hold iothread lock or is in a bh, so there is
2669          * no writing race against the migration bitmap
2670          */
2671         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2672             /*
2673              * do not stop dirty log without starting it, since
2674              * memory_global_dirty_log_stop will assert that
2675              * memory_global_dirty_log_start/stop used in pairs
2676              */
2677             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2678         }
2679     }
2680 
2681     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2682         g_free(block->clear_bmap);
2683         block->clear_bmap = NULL;
2684         g_free(block->bmap);
2685         block->bmap = NULL;
2686     }
2687 
2688     xbzrle_cleanup();
2689     compress_threads_save_cleanup();
2690     ram_state_cleanup(rsp);
2691 }
2692 
2693 static void ram_state_reset(RAMState *rs)
2694 {
2695     rs->last_seen_block = NULL;
2696     rs->last_sent_block = NULL;
2697     rs->last_page = 0;
2698     rs->last_version = ram_list.version;
2699     rs->xbzrle_enabled = false;
2700     postcopy_preempt_reset(rs);
2701     rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
2702 }
2703 
2704 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2705 
2706 /* **** functions for postcopy ***** */
2707 
2708 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2709 {
2710     struct RAMBlock *block;
2711 
2712     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2713         unsigned long *bitmap = block->bmap;
2714         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2715         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2716 
2717         while (run_start < range) {
2718             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2719             ram_discard_range(block->idstr,
2720                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2721                               ((ram_addr_t)(run_end - run_start))
2722                                 << TARGET_PAGE_BITS);
2723             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2724         }
2725     }
2726 }
2727 
2728 /**
2729  * postcopy_send_discard_bm_ram: discard a RAMBlock
2730  *
2731  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2732  *
2733  * @ms: current migration state
2734  * @block: RAMBlock to discard
2735  */
2736 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2737 {
2738     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2739     unsigned long current;
2740     unsigned long *bitmap = block->bmap;
2741 
2742     for (current = 0; current < end; ) {
2743         unsigned long one = find_next_bit(bitmap, end, current);
2744         unsigned long zero, discard_length;
2745 
2746         if (one >= end) {
2747             break;
2748         }
2749 
2750         zero = find_next_zero_bit(bitmap, end, one + 1);
2751 
2752         if (zero >= end) {
2753             discard_length = end - one;
2754         } else {
2755             discard_length = zero - one;
2756         }
2757         postcopy_discard_send_range(ms, one, discard_length);
2758         current = one + discard_length;
2759     }
2760 }
2761 
/* Forward declaration: the definition follows postcopy_each_ram_send_discard() */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2763 
2764 /**
2765  * postcopy_each_ram_send_discard: discard all RAMBlocks
2766  *
2767  * Utility for the outgoing postcopy code.
2768  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2769  *   passing it bitmap indexes and name.
2770  * (qemu_ram_foreach_block ends up passing unscaled lengths
2771  *  which would mean postcopy code would have to deal with target page)
2772  *
2773  * @ms: current migration state
2774  */
2775 static void postcopy_each_ram_send_discard(MigrationState *ms)
2776 {
2777     struct RAMBlock *block;
2778 
2779     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2780         postcopy_discard_send_init(ms, block->idstr);
2781 
2782         /*
2783          * Deal with TPS != HPS and huge pages.  It discard any partially sent
2784          * host-page size chunks, mark any partially dirty host-page size
2785          * chunks as all dirty.  In this case the host-page is the host-page
2786          * for the particular RAMBlock, i.e. it might be a huge page.
2787          */
2788         postcopy_chunk_hostpages_pass(ms, block);
2789 
2790         /*
2791          * Postcopy sends chunks of bitmap over the wire, but it
2792          * just needs indexes at this point, avoids it having
2793          * target page specific code.
2794          */
2795         postcopy_send_discard_bm_ram(ms, block);
2796         postcopy_discard_send_finish(ms);
2797     }
2798 }
2799 
2800 /**
2801  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2802  *
2803  * Helper for postcopy_chunk_hostpages; it's called twice to
2804  * canonicalize the two bitmaps, that are similar, but one is
2805  * inverted.
2806  *
2807  * Postcopy requires that all target pages in a hostpage are dirty or
2808  * clean, not a mix.  This function canonicalizes the bitmaps.
2809  *
2810  * @ms: current migration state
2811  * @block: block that contains the page we want to canonicalize
2812  */
2813 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2814 {
2815     RAMState *rs = ram_state;
2816     unsigned long *bitmap = block->bmap;
2817     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2818     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2819     unsigned long run_start;
2820 
2821     if (block->page_size == TARGET_PAGE_SIZE) {
2822         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2823         return;
2824     }
2825 
2826     /* Find a dirty page */
2827     run_start = find_next_bit(bitmap, pages, 0);
2828 
2829     while (run_start < pages) {
2830 
2831         /*
2832          * If the start of this run of pages is in the middle of a host
2833          * page, then we need to fixup this host page.
2834          */
2835         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2836             /* Find the end of this run */
2837             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2838             /*
2839              * If the end isn't at the start of a host page, then the
2840              * run doesn't finish at the end of a host page
2841              * and we need to discard.
2842              */
2843         }
2844 
2845         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2846             unsigned long page;
2847             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2848                                                              host_ratio);
2849             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2850 
2851             /* Clean up the bitmap */
2852             for (page = fixup_start_addr;
2853                  page < fixup_start_addr + host_ratio; page++) {
2854                 /*
2855                  * Remark them as dirty, updating the count for any pages
2856                  * that weren't previously dirty.
2857                  */
2858                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2859             }
2860         }
2861 
2862         /* Find the next dirty page for the next iteration */
2863         run_start = find_next_bit(bitmap, pages, run_start);
2864     }
2865 }
2866 
2867 /**
2868  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2869  *
2870  * Transmit the set of pages to be discarded after precopy to the target
2871  * these are pages that:
2872  *     a) Have been previously transmitted but are now dirty again
2873  *     b) Pages that have never been transmitted, this ensures that
2874  *        any pages on the destination that have been mapped by background
2875  *        tasks get discarded (transparent huge pages is the specific concern)
2876  * Hopefully this is pretty sparse
2877  *
2878  * @ms: current migration state
2879  */
2880 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2881 {
2882     RAMState *rs = ram_state;
2883 
2884     RCU_READ_LOCK_GUARD();
2885 
2886     /* This should be our last sync, the src is now paused */
2887     migration_bitmap_sync(rs);
2888 
2889     /* Easiest way to make sure we don't resume in the middle of a host-page */
2890     rs->last_seen_block = NULL;
2891     rs->last_sent_block = NULL;
2892     rs->last_page = 0;
2893 
2894     postcopy_each_ram_send_discard(ms);
2895 
2896     trace_ram_postcopy_send_discard_bitmap();
2897 }
2898 
2899 /**
2900  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2901  *
2902  * Returns zero on success
2903  *
2904  * @rbname: name of the RAMBlock of the request. NULL means the
2905  *          same that last one.
2906  * @start: RAMBlock starting page
2907  * @length: RAMBlock size
2908  */
2909 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2910 {
2911     trace_ram_discard_range(rbname, start, length);
2912 
2913     RCU_READ_LOCK_GUARD();
2914     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2915 
2916     if (!rb) {
2917         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2918         return -1;
2919     }
2920 
2921     /*
2922      * On source VM, we don't need to update the received bitmap since
2923      * we don't even have one.
2924      */
2925     if (rb->receivedmap) {
2926         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2927                      length >> qemu_target_page_bits());
2928     }
2929 
2930     return ram_block_discard_range(rb, start, length);
2931 }
2932 
2933 /*
2934  * For every allocation, we will try not to crash the VM if the
2935  * allocation failed.
2936  */
2937 static int xbzrle_init(void)
2938 {
2939     Error *local_err = NULL;
2940 
2941     if (!migrate_use_xbzrle()) {
2942         return 0;
2943     }
2944 
2945     XBZRLE_cache_lock();
2946 
2947     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2948     if (!XBZRLE.zero_target_page) {
2949         error_report("%s: Error allocating zero page", __func__);
2950         goto err_out;
2951     }
2952 
2953     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2954                               TARGET_PAGE_SIZE, &local_err);
2955     if (!XBZRLE.cache) {
2956         error_report_err(local_err);
2957         goto free_zero_page;
2958     }
2959 
2960     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2961     if (!XBZRLE.encoded_buf) {
2962         error_report("%s: Error allocating encoded_buf", __func__);
2963         goto free_cache;
2964     }
2965 
2966     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2967     if (!XBZRLE.current_buf) {
2968         error_report("%s: Error allocating current_buf", __func__);
2969         goto free_encoded_buf;
2970     }
2971 
2972     /* We are all good */
2973     XBZRLE_cache_unlock();
2974     return 0;
2975 
2976 free_encoded_buf:
2977     g_free(XBZRLE.encoded_buf);
2978     XBZRLE.encoded_buf = NULL;
2979 free_cache:
2980     cache_fini(XBZRLE.cache);
2981     XBZRLE.cache = NULL;
2982 free_zero_page:
2983     g_free(XBZRLE.zero_target_page);
2984     XBZRLE.zero_target_page = NULL;
2985 err_out:
2986     XBZRLE_cache_unlock();
2987     return -ENOMEM;
2988 }
2989 
2990 static int ram_state_init(RAMState **rsp)
2991 {
2992     *rsp = g_try_new0(RAMState, 1);
2993 
2994     if (!*rsp) {
2995         error_report("%s: Init ramstate fail", __func__);
2996         return -1;
2997     }
2998 
2999     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3000     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3001     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3002 
3003     /*
3004      * Count the total number of pages used by ram blocks not including any
3005      * gaps due to alignment or unplugs.
3006      * This must match with the initial values of dirty bitmap.
3007      */
3008     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3009     ram_state_reset(*rsp);
3010 
3011     return 0;
3012 }
3013 
3014 static void ram_list_init_bitmaps(void)
3015 {
3016     MigrationState *ms = migrate_get_current();
3017     RAMBlock *block;
3018     unsigned long pages;
3019     uint8_t shift;
3020 
3021     /* Skip setting bitmap if there is no RAM */
3022     if (ram_bytes_total()) {
3023         shift = ms->clear_bitmap_shift;
3024         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3025             error_report("clear_bitmap_shift (%u) too big, using "
3026                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3027             shift = CLEAR_BITMAP_SHIFT_MAX;
3028         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3029             error_report("clear_bitmap_shift (%u) too small, using "
3030                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3031             shift = CLEAR_BITMAP_SHIFT_MIN;
3032         }
3033 
3034         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3035             pages = block->max_length >> TARGET_PAGE_BITS;
3036             /*
3037              * The initial dirty bitmap for migration must be set with all
3038              * ones to make sure we'll migrate every guest RAM page to
3039              * destination.
3040              * Here we set RAMBlock.bmap all to 1 because when rebegin a
3041              * new migration after a failed migration, ram_list.
3042              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
3043              * guest memory.
3044              */
3045             block->bmap = bitmap_new(pages);
3046             bitmap_set(block->bmap, 0, pages);
3047             block->clear_bmap_shift = shift;
3048             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3049         }
3050     }
3051 }
3052 
3053 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3054 {
3055     unsigned long pages;
3056     RAMBlock *rb;
3057 
3058     RCU_READ_LOCK_GUARD();
3059 
3060     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3061             pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3062             rs->migration_dirty_pages -= pages;
3063     }
3064 }
3065 
3066 static void ram_init_bitmaps(RAMState *rs)
3067 {
3068     /* For memory_global_dirty_log_start below.  */
3069     qemu_mutex_lock_iothread();
3070     qemu_mutex_lock_ramlist();
3071 
3072     WITH_RCU_READ_LOCK_GUARD() {
3073         ram_list_init_bitmaps();
3074         /* We don't use dirty log with background snapshots */
3075         if (!migrate_background_snapshot()) {
3076             memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3077             migration_bitmap_sync_precopy(rs);
3078         }
3079     }
3080     qemu_mutex_unlock_ramlist();
3081     qemu_mutex_unlock_iothread();
3082 
3083     /*
3084      * After an eventual first bitmap sync, fixup the initial bitmap
3085      * containing all 1s to exclude any discarded pages from migration.
3086      */
3087     migration_bitmap_clear_discarded_pages(rs);
3088 }
3089 
3090 static int ram_init_all(RAMState **rsp)
3091 {
3092     if (ram_state_init(rsp)) {
3093         return -1;
3094     }
3095 
3096     if (xbzrle_init()) {
3097         ram_state_cleanup(rsp);
3098         return -1;
3099     }
3100 
3101     ram_init_bitmaps(*rsp);
3102 
3103     return 0;
3104 }
3105 
3106 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3107 {
3108     RAMBlock *block;
3109     uint64_t pages = 0;
3110 
3111     /*
3112      * Postcopy is not using xbzrle/compression, so no need for that.
3113      * Also, since source are already halted, we don't need to care
3114      * about dirty page logging as well.
3115      */
3116 
3117     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3118         pages += bitmap_count_one(block->bmap,
3119                                   block->used_length >> TARGET_PAGE_BITS);
3120     }
3121 
3122     /* This may not be aligned with current bitmaps. Recalculate. */
3123     rs->migration_dirty_pages = pages;
3124 
3125     ram_state_reset(rs);
3126 
3127     /* Update RAMState cache of output QEMUFile */
3128     rs->f = out;
3129 
3130     trace_ram_state_resume_prepare(pages);
3131 }
3132 
3133 /*
3134  * This function clears bits of the free pages reported by the caller from the
3135  * migration dirty bitmap. @addr is the host address corresponding to the
3136  * start of the continuous guest free pages, and @len is the total bytes of
3137  * those pages.
3138  */
3139 void qemu_guest_free_page_hint(void *addr, size_t len)
3140 {
3141     RAMBlock *block;
3142     ram_addr_t offset;
3143     size_t used_len, start, npages;
3144     MigrationState *s = migrate_get_current();
3145 
3146     /* This function is currently expected to be used during live migration */
3147     if (!migration_is_setup_or_active(s->state)) {
3148         return;
3149     }
3150 
3151     for (; len > 0; len -= used_len, addr += used_len) {
3152         block = qemu_ram_block_from_host(addr, false, &offset);
3153         if (unlikely(!block || offset >= block->used_length)) {
3154             /*
3155              * The implementation might not support RAMBlock resize during
3156              * live migration, but it could happen in theory with future
3157              * updates. So we add a check here to capture that case.
3158              */
3159             error_report_once("%s unexpected error", __func__);
3160             return;
3161         }
3162 
3163         if (len <= block->used_length - offset) {
3164             used_len = len;
3165         } else {
3166             used_len = block->used_length - offset;
3167         }
3168 
3169         start = offset >> TARGET_PAGE_BITS;
3170         npages = used_len >> TARGET_PAGE_BITS;
3171 
3172         qemu_mutex_lock(&ram_state->bitmap_mutex);
3173         /*
3174          * The skipped free pages are equavalent to be sent from clear_bmap's
3175          * perspective, so clear the bits from the memory region bitmap which
3176          * are initially set. Otherwise those skipped pages will be sent in
3177          * the next round after syncing from the memory region bitmap.
3178          */
3179         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
3180         ram_state->migration_dirty_pages -=
3181                       bitmap_count_one_with_offset(block->bmap, start, npages);
3182         bitmap_clear(block->bmap, start, npages);
3183         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3184     }
3185 }
3186 
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */
3193 
3194 /**
3195  * ram_save_setup: Setup RAM for migration
3196  *
3197  * Returns zero to indicate success and negative for error
3198  *
3199  * @f: QEMUFile where to send the data
3200  * @opaque: RAMState pointer
3201  */
3202 static int ram_save_setup(QEMUFile *f, void *opaque)
3203 {
3204     RAMState **rsp = opaque;
3205     RAMBlock *block;
3206     int ret;
3207 
3208     if (compress_threads_save_setup()) {
3209         return -1;
3210     }
3211 
3212     /* migration has already setup the bitmap, reuse it. */
3213     if (!migration_in_colo_state()) {
3214         if (ram_init_all(rsp) != 0) {
3215             compress_threads_save_cleanup();
3216             return -1;
3217         }
3218     }
3219     (*rsp)->f = f;
3220 
3221     WITH_RCU_READ_LOCK_GUARD() {
3222         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3223 
3224         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3225             qemu_put_byte(f, strlen(block->idstr));
3226             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3227             qemu_put_be64(f, block->used_length);
3228             if (migrate_postcopy_ram() && block->page_size !=
3229                                           qemu_host_page_size) {
3230                 qemu_put_be64(f, block->page_size);
3231             }
3232             if (migrate_ignore_shared()) {
3233                 qemu_put_be64(f, block->mr->addr);
3234             }
3235         }
3236     }
3237 
3238     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3239     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3240 
3241     ret =  multifd_send_sync_main(f);
3242     if (ret < 0) {
3243         return ret;
3244     }
3245 
3246     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3247     qemu_fflush(f);
3248 
3249     return 0;
3250 }
3251 
3252 /**
3253  * ram_save_iterate: iterative stage for migration
3254  *
3255  * Returns zero to indicate success and negative for error
3256  *
3257  * @f: QEMUFile where to send the data
3258  * @opaque: RAMState pointer
3259  */
3260 static int ram_save_iterate(QEMUFile *f, void *opaque)
3261 {
3262     RAMState **temp = opaque;
3263     RAMState *rs = *temp;
3264     int ret = 0;
3265     int i;
3266     int64_t t0;
3267     int done = 0;
3268 
3269     if (blk_mig_bulk_active()) {
3270         /* Avoid transferring ram during bulk phase of block migration as
3271          * the bulk phase will usually take a long time and transferring
3272          * ram updates during that time is pointless. */
3273         goto out;
3274     }
3275 
3276     /*
3277      * We'll take this lock a little bit long, but it's okay for two reasons.
3278      * Firstly, the only possible other thread to take it is who calls
3279      * qemu_guest_free_page_hint(), which should be rare; secondly, see
3280      * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3281      * guarantees that we'll at least released it in a regular basis.
3282      */
3283     qemu_mutex_lock(&rs->bitmap_mutex);
3284     WITH_RCU_READ_LOCK_GUARD() {
3285         if (ram_list.version != rs->last_version) {
3286             ram_state_reset(rs);
3287         }
3288 
3289         /* Read version before ram_list.blocks */
3290         smp_rmb();
3291 
3292         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3293 
3294         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3295         i = 0;
3296         while ((ret = qemu_file_rate_limit(f)) == 0 ||
3297                postcopy_has_request(rs)) {
3298             int pages;
3299 
3300             if (qemu_file_get_error(f)) {
3301                 break;
3302             }
3303 
3304             pages = ram_find_and_save_block(rs);
3305             /* no more pages to sent */
3306             if (pages == 0) {
3307                 done = 1;
3308                 break;
3309             }
3310 
3311             if (pages < 0) {
3312                 qemu_file_set_error(f, pages);
3313                 break;
3314             }
3315 
3316             rs->target_page_count += pages;
3317 
3318             /*
3319              * During postcopy, it is necessary to make sure one whole host
3320              * page is sent in one chunk.
3321              */
3322             if (migrate_postcopy_ram()) {
3323                 flush_compressed_data(rs);
3324             }
3325 
3326             /*
3327              * we want to check in the 1st loop, just in case it was the 1st
3328              * time and we had to sync the dirty bitmap.
3329              * qemu_clock_get_ns() is a bit expensive, so we only check each
3330              * some iterations
3331              */
3332             if ((i & 63) == 0) {
3333                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3334                               1000000;
3335                 if (t1 > MAX_WAIT) {
3336                     trace_ram_save_iterate_big_wait(t1, i);
3337                     break;
3338                 }
3339             }
3340             i++;
3341         }
3342     }
3343     qemu_mutex_unlock(&rs->bitmap_mutex);
3344 
3345     postcopy_preempt_reset_channel(rs);
3346 
3347     /*
3348      * Must occur before EOS (or any QEMUFile operation)
3349      * because of RDMA protocol.
3350      */
3351     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3352 
3353 out:
3354     if (ret >= 0
3355         && migration_is_setup_or_active(migrate_get_current()->state)) {
3356         ret = multifd_send_sync_main(rs->f);
3357         if (ret < 0) {
3358             return ret;
3359         }
3360 
3361         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3362         qemu_fflush(f);
3363         ram_transferred_add(8);
3364 
3365         ret = qemu_file_get_error(f);
3366     }
3367     if (ret < 0) {
3368         return ret;
3369     }
3370 
3371     return done;
3372 }
3373 
3374 /**
3375  * ram_save_complete: function called to send the remaining amount of ram
3376  *
3377  * Returns zero to indicate success or negative on error
3378  *
3379  * Called with iothread lock
3380  *
3381  * @f: QEMUFile where to send the data
3382  * @opaque: RAMState pointer
3383  */
3384 static int ram_save_complete(QEMUFile *f, void *opaque)
3385 {
3386     RAMState **temp = opaque;
3387     RAMState *rs = *temp;
3388     int ret = 0;
3389 
3390     rs->last_stage = !migration_in_colo_state();
3391 
3392     WITH_RCU_READ_LOCK_GUARD() {
3393         if (!migration_in_postcopy()) {
3394             migration_bitmap_sync_precopy(rs);
3395         }
3396 
3397         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3398 
3399         /* try transferring iterative blocks of memory */
3400 
3401         /* flush all remaining blocks regardless of rate limiting */
3402         while (true) {
3403             int pages;
3404 
3405             pages = ram_find_and_save_block(rs);
3406             /* no more blocks to sent */
3407             if (pages == 0) {
3408                 break;
3409             }
3410             if (pages < 0) {
3411                 ret = pages;
3412                 break;
3413             }
3414         }
3415 
3416         flush_compressed_data(rs);
3417         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3418     }
3419 
3420     if (ret < 0) {
3421         return ret;
3422     }
3423 
3424     postcopy_preempt_reset_channel(rs);
3425 
3426     ret = multifd_send_sync_main(rs->f);
3427     if (ret < 0) {
3428         return ret;
3429     }
3430 
3431     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3432     qemu_fflush(f);
3433 
3434     return 0;
3435 }
3436 
3437 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3438                              uint64_t *res_precopy_only,
3439                              uint64_t *res_compatible,
3440                              uint64_t *res_postcopy_only)
3441 {
3442     RAMState **temp = opaque;
3443     RAMState *rs = *temp;
3444     uint64_t remaining_size;
3445 
3446     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3447 
3448     if (!migration_in_postcopy() &&
3449         remaining_size < max_size) {
3450         qemu_mutex_lock_iothread();
3451         WITH_RCU_READ_LOCK_GUARD() {
3452             migration_bitmap_sync_precopy(rs);
3453         }
3454         qemu_mutex_unlock_iothread();
3455         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3456     }
3457 
3458     if (migrate_postcopy_ram()) {
3459         /* We can do postcopy, and all the data is postcopiable */
3460         *res_compatible += remaining_size;
3461     } else {
3462         *res_precopy_only += remaining_size;
3463     }
3464 }
3465 
3466 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3467 {
3468     unsigned int xh_len;
3469     int xh_flags;
3470     uint8_t *loaded_data;
3471 
3472     /* extract RLE header */
3473     xh_flags = qemu_get_byte(f);
3474     xh_len = qemu_get_be16(f);
3475 
3476     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3477         error_report("Failed to load XBZRLE page - wrong compression!");
3478         return -1;
3479     }
3480 
3481     if (xh_len > TARGET_PAGE_SIZE) {
3482         error_report("Failed to load XBZRLE page - len overflow!");
3483         return -1;
3484     }
3485     loaded_data = XBZRLE.decoded_buf;
3486     /* load data and decode */
3487     /* it can change loaded_data to point to an internal buffer */
3488     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3489 
3490     /* decode RLE */
3491     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3492                              TARGET_PAGE_SIZE) == -1) {
3493         error_report("Failed to load XBZRLE page - decode error!");
3494         return -1;
3495     }
3496 
3497     return 0;
3498 }
3499 
3500 /**
3501  * ram_block_from_stream: read a RAMBlock id from the migration stream
3502  *
3503  * Must be called from within a rcu critical section.
3504  *
3505  * Returns a pointer from within the RCU-protected ram_list.
3506  *
3507  * @mis: the migration incoming state pointer
3508  * @f: QEMUFile where to read the data from
3509  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3510  * @channel: the channel we're using
3511  */
3512 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3513                                               QEMUFile *f, int flags,
3514                                               int channel)
3515 {
3516     RAMBlock *block = mis->last_recv_block[channel];
3517     char id[256];
3518     uint8_t len;
3519 
3520     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3521         if (!block) {
3522             error_report("Ack, bad migration stream!");
3523             return NULL;
3524         }
3525         return block;
3526     }
3527 
3528     len = qemu_get_byte(f);
3529     qemu_get_buffer(f, (uint8_t *)id, len);
3530     id[len] = 0;
3531 
3532     block = qemu_ram_block_by_name(id);
3533     if (!block) {
3534         error_report("Can't find block %s", id);
3535         return NULL;
3536     }
3537 
3538     if (ramblock_is_ignored(block)) {
3539         error_report("block %s should not be migrated !", id);
3540         return NULL;
3541     }
3542 
3543     mis->last_recv_block[channel] = block;
3544 
3545     return block;
3546 }
3547 
3548 static inline void *host_from_ram_block_offset(RAMBlock *block,
3549                                                ram_addr_t offset)
3550 {
3551     if (!offset_in_ramblock(block, offset)) {
3552         return NULL;
3553     }
3554 
3555     return block->host + offset;
3556 }
3557 
3558 static void *host_page_from_ram_block_offset(RAMBlock *block,
3559                                              ram_addr_t offset)
3560 {
3561     /* Note: Explicitly no check against offset_in_ramblock(). */
3562     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3563                                    block->page_size);
3564 }
3565 
3566 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3567                                                          ram_addr_t offset)
3568 {
3569     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3570 }
3571 
3572 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3573                              ram_addr_t offset, bool record_bitmap)
3574 {
3575     if (!offset_in_ramblock(block, offset)) {
3576         return NULL;
3577     }
3578     if (!block->colo_cache) {
3579         error_report("%s: colo_cache is NULL in block :%s",
3580                      __func__, block->idstr);
3581         return NULL;
3582     }
3583 
3584     /*
3585     * During colo checkpoint, we need bitmap of these migrated pages.
3586     * It help us to decide which pages in ram cache should be flushed
3587     * into VM's RAM later.
3588     */
3589     if (record_bitmap &&
3590         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3591         ram_state->migration_dirty_pages++;
3592     }
3593     return block->colo_cache + offset;
3594 }
3595 
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.  Memory that already reads as
 * zero is left untouched when @ch is zero.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Nothing to write when zero is requested and it is zero already */
    if (ch == 0 && buffer_is_zero(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3612 
3613 /* return the size after decompression, or negative value on error */
3614 static int
3615 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3616                      const uint8_t *source, size_t source_len)
3617 {
3618     int err;
3619 
3620     err = inflateReset(stream);
3621     if (err != Z_OK) {
3622         return -1;
3623     }
3624 
3625     stream->avail_in = source_len;
3626     stream->next_in = (uint8_t *)source;
3627     stream->avail_out = dest_len;
3628     stream->next_out = dest;
3629 
3630     err = inflate(stream, Z_NO_FLUSH);
3631     if (err != Z_STREAM_END) {
3632         return -1;
3633     }
3634 
3635     return stream->total_out;
3636 }
3637 
3638 static void *do_data_decompress(void *opaque)
3639 {
3640     DecompressParam *param = opaque;
3641     unsigned long pagesize;
3642     uint8_t *des;
3643     int len, ret;
3644 
3645     qemu_mutex_lock(&param->mutex);
3646     while (!param->quit) {
3647         if (param->des) {
3648             des = param->des;
3649             len = param->len;
3650             param->des = 0;
3651             qemu_mutex_unlock(&param->mutex);
3652 
3653             pagesize = TARGET_PAGE_SIZE;
3654 
3655             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3656                                        param->compbuf, len);
3657             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3658                 error_report("decompress data failed");
3659                 qemu_file_set_error(decomp_file, ret);
3660             }
3661 
3662             qemu_mutex_lock(&decomp_done_lock);
3663             param->done = true;
3664             qemu_cond_signal(&decomp_done_cond);
3665             qemu_mutex_unlock(&decomp_done_lock);
3666 
3667             qemu_mutex_lock(&param->mutex);
3668         } else {
3669             qemu_cond_wait(&param->cond, &param->mutex);
3670         }
3671     }
3672     qemu_mutex_unlock(&param->mutex);
3673 
3674     return NULL;
3675 }
3676 
3677 static int wait_for_decompress_done(void)
3678 {
3679     int idx, thread_count;
3680 
3681     if (!migrate_use_compression()) {
3682         return 0;
3683     }
3684 
3685     thread_count = migrate_decompress_threads();
3686     qemu_mutex_lock(&decomp_done_lock);
3687     for (idx = 0; idx < thread_count; idx++) {
3688         while (!decomp_param[idx].done) {
3689             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3690         }
3691     }
3692     qemu_mutex_unlock(&decomp_done_lock);
3693     return qemu_file_get_error(decomp_file);
3694 }
3695 
3696 static void compress_threads_load_cleanup(void)
3697 {
3698     int i, thread_count;
3699 
3700     if (!migrate_use_compression()) {
3701         return;
3702     }
3703     thread_count = migrate_decompress_threads();
3704     for (i = 0; i < thread_count; i++) {
3705         /*
3706          * we use it as a indicator which shows if the thread is
3707          * properly init'd or not
3708          */
3709         if (!decomp_param[i].compbuf) {
3710             break;
3711         }
3712 
3713         qemu_mutex_lock(&decomp_param[i].mutex);
3714         decomp_param[i].quit = true;
3715         qemu_cond_signal(&decomp_param[i].cond);
3716         qemu_mutex_unlock(&decomp_param[i].mutex);
3717     }
3718     for (i = 0; i < thread_count; i++) {
3719         if (!decomp_param[i].compbuf) {
3720             break;
3721         }
3722 
3723         qemu_thread_join(decompress_threads + i);
3724         qemu_mutex_destroy(&decomp_param[i].mutex);
3725         qemu_cond_destroy(&decomp_param[i].cond);
3726         inflateEnd(&decomp_param[i].stream);
3727         g_free(decomp_param[i].compbuf);
3728         decomp_param[i].compbuf = NULL;
3729     }
3730     g_free(decompress_threads);
3731     g_free(decomp_param);
3732     decompress_threads = NULL;
3733     decomp_param = NULL;
3734     decomp_file = NULL;
3735 }
3736 
3737 static int compress_threads_load_setup(QEMUFile *f)
3738 {
3739     int i, thread_count;
3740 
3741     if (!migrate_use_compression()) {
3742         return 0;
3743     }
3744 
3745     thread_count = migrate_decompress_threads();
3746     decompress_threads = g_new0(QemuThread, thread_count);
3747     decomp_param = g_new0(DecompressParam, thread_count);
3748     qemu_mutex_init(&decomp_done_lock);
3749     qemu_cond_init(&decomp_done_cond);
3750     decomp_file = f;
3751     for (i = 0; i < thread_count; i++) {
3752         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3753             goto exit;
3754         }
3755 
3756         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3757         qemu_mutex_init(&decomp_param[i].mutex);
3758         qemu_cond_init(&decomp_param[i].cond);
3759         decomp_param[i].done = true;
3760         decomp_param[i].quit = false;
3761         qemu_thread_create(decompress_threads + i, "decompress",
3762                            do_data_decompress, decomp_param + i,
3763                            QEMU_THREAD_JOINABLE);
3764     }
3765     return 0;
3766 exit:
3767     compress_threads_load_cleanup();
3768     return -1;
3769 }
3770 
3771 static void decompress_data_with_multi_threads(QEMUFile *f,
3772                                                void *host, int len)
3773 {
3774     int idx, thread_count;
3775 
3776     thread_count = migrate_decompress_threads();
3777     QEMU_LOCK_GUARD(&decomp_done_lock);
3778     while (true) {
3779         for (idx = 0; idx < thread_count; idx++) {
3780             if (decomp_param[idx].done) {
3781                 decomp_param[idx].done = false;
3782                 qemu_mutex_lock(&decomp_param[idx].mutex);
3783                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3784                 decomp_param[idx].des = host;
3785                 decomp_param[idx].len = len;
3786                 qemu_cond_signal(&decomp_param[idx].cond);
3787                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3788                 break;
3789             }
3790         }
3791         if (idx < thread_count) {
3792             break;
3793         } else {
3794             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3795         }
3796     }
3797 }
3798 
3799 static void colo_init_ram_state(void)
3800 {
3801     ram_state_init(&ram_state);
3802 }
3803 
3804 /*
3805  * colo cache: this is for secondary VM, we cache the whole
3806  * memory of the secondary VM, it is need to hold the global lock
3807  * to call this helper.
3808  */
3809 int colo_init_ram_cache(void)
3810 {
3811     RAMBlock *block;
3812 
3813     WITH_RCU_READ_LOCK_GUARD() {
3814         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3815             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3816                                                     NULL, false, false);
3817             if (!block->colo_cache) {
3818                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3819                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3820                              block->used_length);
3821                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3822                     if (block->colo_cache) {
3823                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3824                         block->colo_cache = NULL;
3825                     }
3826                 }
3827                 return -errno;
3828             }
3829             if (!machine_dump_guest_core(current_machine)) {
3830                 qemu_madvise(block->colo_cache, block->used_length,
3831                              QEMU_MADV_DONTDUMP);
3832             }
3833         }
3834     }
3835 
3836     /*
3837     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3838     * with to decide which page in cache should be flushed into SVM's RAM. Here
3839     * we use the same name 'ram_bitmap' as for migration.
3840     */
3841     if (ram_bytes_total()) {
3842         RAMBlock *block;
3843 
3844         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3845             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3846             block->bmap = bitmap_new(pages);
3847         }
3848     }
3849 
3850     colo_init_ram_state();
3851     return 0;
3852 }
3853 
3854 /* TODO: duplicated with ram_init_bitmaps */
3855 void colo_incoming_start_dirty_log(void)
3856 {
3857     RAMBlock *block = NULL;
3858     /* For memory_global_dirty_log_start below. */
3859     qemu_mutex_lock_iothread();
3860     qemu_mutex_lock_ramlist();
3861 
3862     memory_global_dirty_log_sync();
3863     WITH_RCU_READ_LOCK_GUARD() {
3864         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3865             ramblock_sync_dirty_bitmap(ram_state, block);
3866             /* Discard this dirty bitmap record */
3867             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3868         }
3869         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3870     }
3871     ram_state->migration_dirty_pages = 0;
3872     qemu_mutex_unlock_ramlist();
3873     qemu_mutex_unlock_iothread();
3874 }
3875 
3876 /* It is need to hold the global lock to call this helper */
3877 void colo_release_ram_cache(void)
3878 {
3879     RAMBlock *block;
3880 
3881     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3882     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3883         g_free(block->bmap);
3884         block->bmap = NULL;
3885     }
3886 
3887     WITH_RCU_READ_LOCK_GUARD() {
3888         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3889             if (block->colo_cache) {
3890                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3891                 block->colo_cache = NULL;
3892             }
3893         }
3894     }
3895     ram_state_cleanup(&ram_state);
3896 }
3897 
3898 /**
3899  * ram_load_setup: Setup RAM for migration incoming side
3900  *
3901  * Returns zero to indicate success and negative for error
3902  *
3903  * @f: QEMUFile where to receive the data
3904  * @opaque: RAMState pointer
3905  */
3906 static int ram_load_setup(QEMUFile *f, void *opaque)
3907 {
3908     if (compress_threads_load_setup(f)) {
3909         return -1;
3910     }
3911 
3912     xbzrle_load_setup();
3913     ramblock_recv_map_init();
3914 
3915     return 0;
3916 }
3917 
3918 static int ram_load_cleanup(void *opaque)
3919 {
3920     RAMBlock *rb;
3921 
3922     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3923         qemu_ram_block_writeback(rb);
3924     }
3925 
3926     xbzrle_load_cleanup();
3927     compress_threads_load_cleanup();
3928 
3929     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3930         g_free(rb->receivedmap);
3931         rb->receivedmap = NULL;
3932     }
3933 
3934     return 0;
3935 }
3936 
3937 /**
3938  * ram_postcopy_incoming_init: allocate postcopy data structures
3939  *
3940  * Returns 0 for success and negative if there was one error
3941  *
3942  * @mis: current migration incoming state
3943  *
3944  * Allocate data structures etc needed by incoming migration with
3945  * postcopy-ram. postcopy-ram's similarly names
3946  * postcopy_ram_incoming_init does the work.
3947  */
3948 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3949 {
3950     return postcopy_ram_incoming_init(mis);
3951 }
3952 
3953 /**
3954  * ram_load_postcopy: load a page in postcopy case
3955  *
3956  * Returns 0 for success or -errno in case of error
3957  *
3958  * Called in postcopy mode by ram_load().
3959  * rcu_read_lock is taken prior to this being called.
3960  *
3961  * @f: QEMUFile where to send the data
3962  * @channel: the channel to use for loading
3963  */
3964 int ram_load_postcopy(QEMUFile *f, int channel)
3965 {
3966     int flags = 0, ret = 0;
3967     bool place_needed = false;
3968     bool matches_target_page_size = false;
3969     MigrationIncomingState *mis = migration_incoming_get_current();
3970     PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
3971 
3972     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3973         ram_addr_t addr;
3974         void *page_buffer = NULL;
3975         void *place_source = NULL;
3976         RAMBlock *block = NULL;
3977         uint8_t ch;
3978         int len;
3979 
3980         addr = qemu_get_be64(f);
3981 
3982         /*
3983          * If qemu file error, we should stop here, and then "addr"
3984          * may be invalid
3985          */
3986         ret = qemu_file_get_error(f);
3987         if (ret) {
3988             break;
3989         }
3990 
3991         flags = addr & ~TARGET_PAGE_MASK;
3992         addr &= TARGET_PAGE_MASK;
3993 
3994         trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
3995         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3996                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3997             block = ram_block_from_stream(mis, f, flags, channel);
3998             if (!block) {
3999                 ret = -EINVAL;
4000                 break;
4001             }
4002 
4003             /*
4004              * Relying on used_length is racy and can result in false positives.
4005              * We might place pages beyond used_length in case RAM was shrunk
4006              * while in postcopy, which is fine - trying to place via
4007              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4008              */
4009             if (!block->host || addr >= block->postcopy_length) {
4010                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4011                 ret = -EINVAL;
4012                 break;
4013             }
4014             tmp_page->target_pages++;
4015             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4016             /*
4017              * Postcopy requires that we place whole host pages atomically;
4018              * these may be huge pages for RAMBlocks that are backed by
4019              * hugetlbfs.
4020              * To make it atomic, the data is read into a temporary page
4021              * that's moved into place later.
4022              * The migration protocol uses,  possibly smaller, target-pages
4023              * however the source ensures it always sends all the components
4024              * of a host page in one chunk.
4025              */
4026             page_buffer = tmp_page->tmp_huge_page +
4027                           host_page_offset_from_ram_block_offset(block, addr);
4028             /* If all TP are zero then we can optimise the place */
4029             if (tmp_page->target_pages == 1) {
4030                 tmp_page->host_addr =
4031                     host_page_from_ram_block_offset(block, addr);
4032             } else if (tmp_page->host_addr !=
4033                        host_page_from_ram_block_offset(block, addr)) {
4034                 /* not the 1st TP within the HP */
4035                 error_report("Non-same host page detected on channel %d: "
4036                              "Target host page %p, received host page %p "
4037                              "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
4038                              channel, tmp_page->host_addr,
4039                              host_page_from_ram_block_offset(block, addr),
4040                              block->idstr, addr, tmp_page->target_pages);
4041                 ret = -EINVAL;
4042                 break;
4043             }
4044 
4045             /*
4046              * If it's the last part of a host page then we place the host
4047              * page
4048              */
4049             if (tmp_page->target_pages ==
4050                 (block->page_size / TARGET_PAGE_SIZE)) {
4051                 place_needed = true;
4052             }
4053             place_source = tmp_page->tmp_huge_page;
4054         }
4055 
4056         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4057         case RAM_SAVE_FLAG_ZERO:
4058             ch = qemu_get_byte(f);
4059             /*
4060              * Can skip to set page_buffer when
4061              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4062              */
4063             if (ch || !matches_target_page_size) {
4064                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4065             }
4066             if (ch) {
4067                 tmp_page->all_zero = false;
4068             }
4069             break;
4070 
4071         case RAM_SAVE_FLAG_PAGE:
4072             tmp_page->all_zero = false;
4073             if (!matches_target_page_size) {
4074                 /* For huge pages, we always use temporary buffer */
4075                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4076             } else {
4077                 /*
4078                  * For small pages that matches target page size, we
4079                  * avoid the qemu_file copy.  Instead we directly use
4080                  * the buffer of QEMUFile to place the page.  Note: we
4081                  * cannot do any QEMUFile operation before using that
4082                  * buffer to make sure the buffer is valid when
4083                  * placing the page.
4084                  */
4085                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4086                                          TARGET_PAGE_SIZE);
4087             }
4088             break;
4089         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4090             tmp_page->all_zero = false;
4091             len = qemu_get_be32(f);
4092             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4093                 error_report("Invalid compressed data length: %d", len);
4094                 ret = -EINVAL;
4095                 break;
4096             }
4097             decompress_data_with_multi_threads(f, page_buffer, len);
4098             break;
4099 
4100         case RAM_SAVE_FLAG_EOS:
4101             /* normal exit */
4102             multifd_recv_sync_main();
4103             break;
4104         default:
4105             error_report("Unknown combination of migration flags: 0x%x"
4106                          " (postcopy mode)", flags);
4107             ret = -EINVAL;
4108             break;
4109         }
4110 
4111         /* Got the whole host page, wait for decompress before placing. */
4112         if (place_needed) {
4113             ret |= wait_for_decompress_done();
4114         }
4115 
4116         /* Detect for any possible file errors */
4117         if (!ret && qemu_file_get_error(f)) {
4118             ret = qemu_file_get_error(f);
4119         }
4120 
4121         if (!ret && place_needed) {
4122             if (tmp_page->all_zero) {
4123                 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
4124             } else {
4125                 ret = postcopy_place_page(mis, tmp_page->host_addr,
4126                                           place_source, block);
4127             }
4128             place_needed = false;
4129             postcopy_temp_page_reset(tmp_page);
4130         }
4131     }
4132 
4133     return ret;
4134 }
4135 
4136 static bool postcopy_is_advised(void)
4137 {
4138     PostcopyState ps = postcopy_state_get();
4139     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4140 }
4141 
4142 static bool postcopy_is_running(void)
4143 {
4144     PostcopyState ps = postcopy_state_get();
4145     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4146 }
4147 
4148 /*
4149  * Flush content of RAM cache into SVM's memory.
4150  * Only flush the pages that be dirtied by PVM or SVM or both.
4151  */
4152 void colo_flush_ram_cache(void)
4153 {
4154     RAMBlock *block = NULL;
4155     void *dst_host;
4156     void *src_host;
4157     unsigned long offset = 0;
4158 
4159     memory_global_dirty_log_sync();
4160     WITH_RCU_READ_LOCK_GUARD() {
4161         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4162             ramblock_sync_dirty_bitmap(ram_state, block);
4163         }
4164     }
4165 
4166     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4167     WITH_RCU_READ_LOCK_GUARD() {
4168         block = QLIST_FIRST_RCU(&ram_list.blocks);
4169 
4170         while (block) {
4171             unsigned long num = 0;
4172 
4173             offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
4174             if (!offset_in_ramblock(block,
4175                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
4176                 offset = 0;
4177                 num = 0;
4178                 block = QLIST_NEXT_RCU(block, next);
4179             } else {
4180                 unsigned long i = 0;
4181 
4182                 for (i = 0; i < num; i++) {
4183                     migration_bitmap_clear_dirty(ram_state, block, offset + i);
4184                 }
4185                 dst_host = block->host
4186                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4187                 src_host = block->colo_cache
4188                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4189                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4190                 offset += num;
4191             }
4192         }
4193     }
4194     trace_colo_flush_ram_cache_end();
4195 }
4196 
4197 /**
4198  * ram_load_precopy: load pages in precopy case
4199  *
4200  * Returns 0 for success or -errno in case of error
4201  *
4202  * Called in precopy mode by ram_load().
4203  * rcu_read_lock is taken prior to this being called.
4204  *
4205  * @f: QEMUFile where to send the data
4206  */
4207 static int ram_load_precopy(QEMUFile *f)
4208 {
4209     MigrationIncomingState *mis = migration_incoming_get_current();
4210     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4211     /* ADVISE is earlier, it shows the source has the postcopy capability on */
4212     bool postcopy_advised = postcopy_is_advised();
4213     if (!migrate_use_compression()) {
4214         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4215     }
4216 
4217     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4218         ram_addr_t addr, total_ram_bytes;
4219         void *host = NULL, *host_bak = NULL;
4220         uint8_t ch;
4221 
4222         /*
4223          * Yield periodically to let main loop run, but an iteration of
4224          * the main loop is expensive, so do it each some iterations
4225          */
4226         if ((i & 32767) == 0 && qemu_in_coroutine()) {
4227             aio_co_schedule(qemu_get_current_aio_context(),
4228                             qemu_coroutine_self());
4229             qemu_coroutine_yield();
4230         }
4231         i++;
4232 
4233         addr = qemu_get_be64(f);
4234         flags = addr & ~TARGET_PAGE_MASK;
4235         addr &= TARGET_PAGE_MASK;
4236 
4237         if (flags & invalid_flags) {
4238             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4239                 error_report("Received an unexpected compressed page");
4240             }
4241 
4242             ret = -EINVAL;
4243             break;
4244         }
4245 
4246         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4247                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4248             RAMBlock *block = ram_block_from_stream(mis, f, flags,
4249                                                     RAM_CHANNEL_PRECOPY);
4250 
4251             host = host_from_ram_block_offset(block, addr);
4252             /*
4253              * After going into COLO stage, we should not load the page
4254              * into SVM's memory directly, we put them into colo_cache firstly.
4255              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
4256              * Previously, we copied all these memory in preparing stage of COLO
4257              * while we need to stop VM, which is a time-consuming process.
4258              * Here we optimize it by a trick, back-up every page while in
4259              * migration process while COLO is enabled, though it affects the
4260              * speed of the migration, but it obviously reduce the downtime of
4261              * back-up all SVM'S memory in COLO preparing stage.
4262              */
4263             if (migration_incoming_colo_enabled()) {
4264                 if (migration_incoming_in_colo_state()) {
4265                     /* In COLO stage, put all pages into cache temporarily */
4266                     host = colo_cache_from_block_offset(block, addr, true);
4267                 } else {
4268                    /*
4269                     * In migration stage but before COLO stage,
4270                     * Put all pages into both cache and SVM's memory.
4271                     */
4272                     host_bak = colo_cache_from_block_offset(block, addr, false);
4273                 }
4274             }
4275             if (!host) {
4276                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4277                 ret = -EINVAL;
4278                 break;
4279             }
4280             if (!migration_incoming_in_colo_state()) {
4281                 ramblock_recv_bitmap_set(block, host);
4282             }
4283 
4284             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4285         }
4286 
4287         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4288         case RAM_SAVE_FLAG_MEM_SIZE:
4289             /* Synchronize RAM block list */
4290             total_ram_bytes = addr;
4291             while (!ret && total_ram_bytes) {
4292                 RAMBlock *block;
4293                 char id[256];
4294                 ram_addr_t length;
4295 
4296                 len = qemu_get_byte(f);
4297                 qemu_get_buffer(f, (uint8_t *)id, len);
4298                 id[len] = 0;
4299                 length = qemu_get_be64(f);
4300 
4301                 block = qemu_ram_block_by_name(id);
4302                 if (block && !qemu_ram_is_migratable(block)) {
4303                     error_report("block %s should not be migrated !", id);
4304                     ret = -EINVAL;
4305                 } else if (block) {
4306                     if (length != block->used_length) {
4307                         Error *local_err = NULL;
4308 
4309                         ret = qemu_ram_resize(block, length,
4310                                               &local_err);
4311                         if (local_err) {
4312                             error_report_err(local_err);
4313                         }
4314                     }
4315                     /* For postcopy we need to check hugepage sizes match */
4316                     if (postcopy_advised && migrate_postcopy_ram() &&
4317                         block->page_size != qemu_host_page_size) {
4318                         uint64_t remote_page_size = qemu_get_be64(f);
4319                         if (remote_page_size != block->page_size) {
4320                             error_report("Mismatched RAM page size %s "
4321                                          "(local) %zd != %" PRId64,
4322                                          id, block->page_size,
4323                                          remote_page_size);
4324                             ret = -EINVAL;
4325                         }
4326                     }
4327                     if (migrate_ignore_shared()) {
4328                         hwaddr addr = qemu_get_be64(f);
4329                         if (ramblock_is_ignored(block) &&
4330                             block->mr->addr != addr) {
4331                             error_report("Mismatched GPAs for block %s "
4332                                          "%" PRId64 "!= %" PRId64,
4333                                          id, (uint64_t)addr,
4334                                          (uint64_t)block->mr->addr);
4335                             ret = -EINVAL;
4336                         }
4337                     }
4338                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4339                                           block->idstr);
4340                 } else {
4341                     error_report("Unknown ramblock \"%s\", cannot "
4342                                  "accept migration", id);
4343                     ret = -EINVAL;
4344                 }
4345 
4346                 total_ram_bytes -= length;
4347             }
4348             break;
4349 
4350         case RAM_SAVE_FLAG_ZERO:
4351             ch = qemu_get_byte(f);
4352             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4353             break;
4354 
4355         case RAM_SAVE_FLAG_PAGE:
4356             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4357             break;
4358 
4359         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4360             len = qemu_get_be32(f);
4361             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4362                 error_report("Invalid compressed data length: %d", len);
4363                 ret = -EINVAL;
4364                 break;
4365             }
4366             decompress_data_with_multi_threads(f, host, len);
4367             break;
4368 
4369         case RAM_SAVE_FLAG_XBZRLE:
4370             if (load_xbzrle(f, addr, host) < 0) {
4371                 error_report("Failed to decompress XBZRLE page at "
4372                              RAM_ADDR_FMT, addr);
4373                 ret = -EINVAL;
4374                 break;
4375             }
4376             break;
4377         case RAM_SAVE_FLAG_EOS:
4378             /* normal exit */
4379             multifd_recv_sync_main();
4380             break;
4381         default:
4382             if (flags & RAM_SAVE_FLAG_HOOK) {
4383                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4384             } else {
4385                 error_report("Unknown combination of migration flags: 0x%x",
4386                              flags);
4387                 ret = -EINVAL;
4388             }
4389         }
4390         if (!ret) {
4391             ret = qemu_file_get_error(f);
4392         }
4393         if (!ret && host_bak) {
4394             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4395         }
4396     }
4397 
4398     ret |= wait_for_decompress_done();
4399     return ret;
4400 }
4401 
4402 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4403 {
4404     int ret = 0;
4405     static uint64_t seq_iter;
4406     /*
4407      * If system is running in postcopy mode, page inserts to host memory must
4408      * be atomic
4409      */
4410     bool postcopy_running = postcopy_is_running();
4411 
4412     seq_iter++;
4413 
4414     if (version_id != 4) {
4415         return -EINVAL;
4416     }
4417 
4418     /*
4419      * This RCU critical section can be very long running.
4420      * When RCU reclaims in the code start to become numerous,
4421      * it will be necessary to reduce the granularity of this
4422      * critical section.
4423      */
4424     WITH_RCU_READ_LOCK_GUARD() {
4425         if (postcopy_running) {
4426             /*
4427              * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
4428              * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4429              * service fast page faults.
4430              */
4431             ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4432         } else {
4433             ret = ram_load_precopy(f);
4434         }
4435     }
4436     trace_ram_load_complete(ret, seq_iter);
4437 
4438     return ret;
4439 }
4440 
4441 static bool ram_has_postcopy(void *opaque)
4442 {
4443     RAMBlock *rb;
4444     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4445         if (ramblock_is_pmem(rb)) {
4446             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4447                          "is not supported now!", rb->idstr, rb->host);
4448             return false;
4449         }
4450     }
4451 
4452     return migrate_postcopy_ram();
4453 }
4454 
4455 /* Sync all the dirty bitmap with destination VM.  */
4456 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4457 {
4458     RAMBlock *block;
4459     QEMUFile *file = s->to_dst_file;
4460     int ramblock_count = 0;
4461 
4462     trace_ram_dirty_bitmap_sync_start();
4463 
4464     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4465         qemu_savevm_send_recv_bitmap(file, block->idstr);
4466         trace_ram_dirty_bitmap_request(block->idstr);
4467         ramblock_count++;
4468     }
4469 
4470     trace_ram_dirty_bitmap_sync_wait();
4471 
4472     /* Wait until all the ramblocks' dirty bitmap synced */
4473     while (ramblock_count--) {
4474         qemu_sem_wait(&s->rp_state.rp_sem);
4475     }
4476 
4477     trace_ram_dirty_bitmap_sync_complete();
4478 
4479     return 0;
4480 }
4481 
4482 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4483 {
4484     qemu_sem_post(&s->rp_state.rp_sem);
4485 }
4486 
4487 /*
4488  * Read the received bitmap, revert it as the initial dirty bitmap.
4489  * This is only used when the postcopy migration is paused but wants
4490  * to resume from a middle point.
4491  */
4492 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4493 {
4494     int ret = -EINVAL;
4495     /* from_dst_file is always valid because we're within rp_thread */
4496     QEMUFile *file = s->rp_state.from_dst_file;
4497     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4498     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4499     uint64_t size, end_mark;
4500 
4501     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4502 
4503     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4504         error_report("%s: incorrect state %s", __func__,
4505                      MigrationStatus_str(s->state));
4506         return -EINVAL;
4507     }
4508 
4509     /*
4510      * Note: see comments in ramblock_recv_bitmap_send() on why we
4511      * need the endianness conversion, and the paddings.
4512      */
4513     local_size = ROUND_UP(local_size, 8);
4514 
4515     /* Add paddings */
4516     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4517 
4518     size = qemu_get_be64(file);
4519 
4520     /* The size of the bitmap should match with our ramblock */
4521     if (size != local_size) {
4522         error_report("%s: ramblock '%s' bitmap size mismatch "
4523                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4524                      block->idstr, size, local_size);
4525         ret = -EINVAL;
4526         goto out;
4527     }
4528 
4529     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4530     end_mark = qemu_get_be64(file);
4531 
4532     ret = qemu_file_get_error(file);
4533     if (ret || size != local_size) {
4534         error_report("%s: read bitmap failed for ramblock '%s': %d"
4535                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4536                      __func__, block->idstr, ret, local_size, size);
4537         ret = -EIO;
4538         goto out;
4539     }
4540 
4541     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4542         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4543                      __func__, block->idstr, end_mark);
4544         ret = -EINVAL;
4545         goto out;
4546     }
4547 
4548     /*
4549      * Endianness conversion. We are during postcopy (though paused).
4550      * The dirty bitmap won't change. We can directly modify it.
4551      */
4552     bitmap_from_le(block->bmap, le_bitmap, nbits);
4553 
4554     /*
4555      * What we received is "received bitmap". Revert it as the initial
4556      * dirty bitmap for this ramblock.
4557      */
4558     bitmap_complement(block->bmap, block->bmap, nbits);
4559 
4560     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4561     ramblock_dirty_bitmap_clear_discarded_pages(block);
4562 
4563     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4564     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4565 
4566     /*
4567      * We succeeded to sync bitmap for current ramblock. If this is
4568      * the last one to sync, we need to notify the main send thread.
4569      */
4570     ram_dirty_bitmap_reload_notify(s);
4571 
4572     ret = 0;
4573 out:
4574     g_free(le_bitmap);
4575     return ret;
4576 }
4577 
4578 static int ram_resume_prepare(MigrationState *s, void *opaque)
4579 {
4580     RAMState *rs = *(RAMState **)opaque;
4581     int ret;
4582 
4583     ret = ram_dirty_bitmap_sync_all(s, rs);
4584     if (ret) {
4585         return ret;
4586     }
4587 
4588     ram_state_resume_prepare(rs, s->to_dst_file);
4589 
4590     return 0;
4591 }
4592 
4593 void postcopy_preempt_shutdown_file(MigrationState *s)
4594 {
4595     qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4596     qemu_fflush(s->postcopy_qemufile_src);
4597 }
4598 
4599 static SaveVMHandlers savevm_ram_handlers = {
4600     .save_setup = ram_save_setup,
4601     .save_live_iterate = ram_save_iterate,
4602     .save_live_complete_postcopy = ram_save_complete,
4603     .save_live_complete_precopy = ram_save_complete,
4604     .has_postcopy = ram_has_postcopy,
4605     .save_live_pending = ram_save_pending,
4606     .load_state = ram_load,
4607     .save_cleanup = ram_save_cleanup,
4608     .load_setup = ram_load_setup,
4609     .load_cleanup = ram_load_cleanup,
4610     .resume_prepare = ram_resume_prepare,
4611 };
4612 
4613 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4614                                       size_t old_size, size_t new_size)
4615 {
4616     PostcopyState ps = postcopy_state_get();
4617     ram_addr_t offset;
4618     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4619     Error *err = NULL;
4620 
4621     if (ramblock_is_ignored(rb)) {
4622         return;
4623     }
4624 
4625     if (!migration_is_idle()) {
4626         /*
4627          * Precopy code on the source cannot deal with the size of RAM blocks
4628          * changing at random points in time - especially after sending the
4629          * RAM block sizes in the migration stream, they must no longer change.
4630          * Abort and indicate a proper reason.
4631          */
4632         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4633         migration_cancel(err);
4634         error_free(err);
4635     }
4636 
4637     switch (ps) {
4638     case POSTCOPY_INCOMING_ADVISE:
4639         /*
4640          * Update what ram_postcopy_incoming_init()->init_range() does at the
4641          * time postcopy was advised. Syncing RAM blocks with the source will
4642          * result in RAM resizes.
4643          */
4644         if (old_size < new_size) {
4645             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4646                 error_report("RAM block '%s' discard of resized RAM failed",
4647                              rb->idstr);
4648             }
4649         }
4650         rb->postcopy_length = new_size;
4651         break;
4652     case POSTCOPY_INCOMING_NONE:
4653     case POSTCOPY_INCOMING_RUNNING:
4654     case POSTCOPY_INCOMING_END:
4655         /*
4656          * Once our guest is running, postcopy does no longer care about
4657          * resizes. When growing, the new memory was not available on the
4658          * source, no handler needed.
4659          */
4660         break;
4661     default:
4662         error_report("RAM block '%s' resized during postcopy state: %d",
4663                      rb->idstr, ps);
4664         exit(-1);
4665     }
4666 }
4667 
/*
 * RAM block notifier for migration; only the resize callback is set.
 * Registered in ram_mig_init().
 */
static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};
4671 
/*
 * One-time setup of RAM migration: initialise the XBZRLE lock, register
 * the "ram" live savevm section and install the RAM block resize notifier.
 */
void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    /*
     * NOTE(review): the literals 0 and 4 are presumably the instance id and
     * the section version - confirm against register_savevm_live().
     */
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
    ram_block_notifier_add(&ram_mig_ram_notifier);
}
4678