1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/main-loop.h"
34 #include "xbzrle.h"
35 #include "ram.h"
36 #include "migration.h"
37 #include "migration/register.h"
38 #include "migration/misc.h"
39 #include "qemu-file.h"
40 #include "postcopy-ram.h"
41 #include "page_cache.h"
42 #include "qemu/error-report.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-types-migration.h"
45 #include "qapi/qapi-events-migration.h"
46 #include "qapi/qmp/qerror.h"
47 #include "trace.h"
48 #include "exec/ram_addr.h"
49 #include "exec/target_page.h"
50 #include "qemu/rcu_queue.h"
51 #include "migration/colo.h"
52 #include "block.h"
53 #include "sysemu/cpu-throttle.h"
54 #include "savevm.h"
55 #include "qemu/iov.h"
56 #include "multifd.h"
57 #include "sysemu/runstate.h"
58 
59 #if defined(__linux__)
60 #include "qemu/userfaultfd.h"
61 #endif /* defined(__linux__) */
62 
63 /***********************************************************/
64 /* ram save/restore */
65 
66 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
67  * worked for pages that were filled with the same char.  We switched
68  * it to only search for the zero value, and renamed it to avoid
69  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
70  */
71 
72 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
73 #define RAM_SAVE_FLAG_ZERO     0x02
74 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
75 #define RAM_SAVE_FLAG_PAGE     0x08
76 #define RAM_SAVE_FLAG_EOS      0x10
77 #define RAM_SAVE_FLAG_CONTINUE 0x20
78 #define RAM_SAVE_FLAG_XBZRLE   0x40
79 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
80 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
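
/*
 * Illustrative sketch of how these flags appear on the wire: every page
 * record starts with a be64 word that packs the target-page-aligned offset
 * together with the flags above, e.g. a zero page that continues the
 * current block would be sent as
 * (offset | RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_CONTINUE); see
 * save_page_header() below for the actual encoding.
 */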
81 
82 static inline bool is_zero_range(uint8_t *p, uint64_t size)
83 {
84     return buffer_is_zero(p, size);
85 }
86 
87 XBZRLECacheStats xbzrle_counters;
88 
89 /* struct contains XBZRLE cache and a static page
90    used by the compression */
91 static struct {
92     /* buffer used for XBZRLE encoding */
93     uint8_t *encoded_buf;
94     /* buffer for storing page content */
95     uint8_t *current_buf;
96     /* Cache for XBZRLE, Protected by lock. */
97     PageCache *cache;
98     QemuMutex lock;
99     /* it will store a page full of zeros */
100     uint8_t *zero_target_page;
101     /* buffer used for XBZRLE decoding */
102     uint8_t *decoded_buf;
103 } XBZRLE;
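
/*
 * Rough sketch of how the save side uses these buffers (see
 * save_xbzrle_page() below): the guest page is copied into current_buf,
 * xbzrle_encode_buffer() diffs it against the cached copy for that address
 * and writes the delta into encoded_buf, which is then sent under
 * RAM_SAVE_FLAG_XBZRLE; zero_target_page is only used here to refresh
 * cache entries for pages that were sent as zero pages.
 */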
104 
105 static void XBZRLE_cache_lock(void)
106 {
107     if (migrate_use_xbzrle()) {
108         qemu_mutex_lock(&XBZRLE.lock);
109     }
110 }
111 
112 static void XBZRLE_cache_unlock(void)
113 {
114     if (migrate_use_xbzrle()) {
115         qemu_mutex_unlock(&XBZRLE.lock);
116     }
117 }
118 
119 /**
120  * xbzrle_cache_resize: resize the xbzrle cache
121  *
122  * This function is called from migrate_params_apply in main
123  * thread, possibly while a migration is in progress.  A running
124  * migration may be using the cache and might finish during this call,
125  * hence changes to the cache are protected by XBZRLE.lock().
126  *
127  * Returns 0 for success or -1 for error
128  *
129  * @new_size: new cache size
130  * @errp: set *errp if the check failed, with reason
131  */
132 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
133 {
134     PageCache *new_cache;
135     int64_t ret = 0;
136 
137     /* Check for truncation */
138     if (new_size != (size_t)new_size) {
139         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
140                    "exceeding address space");
141         return -1;
142     }
143 
144     if (new_size == migrate_xbzrle_cache_size()) {
145         /* nothing to do */
146         return 0;
147     }
148 
149     XBZRLE_cache_lock();
150 
151     if (XBZRLE.cache != NULL) {
152         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
153         if (!new_cache) {
154             ret = -1;
155             goto out;
156         }
157 
158         cache_fini(XBZRLE.cache);
159         XBZRLE.cache = new_cache;
160     }
161 out:
162     XBZRLE_cache_unlock();
163     return ret;
164 }
165 
166 bool ramblock_is_ignored(RAMBlock *block)
167 {
168     return !qemu_ram_is_migratable(block) ||
169            (migrate_ignore_shared() && qemu_ram_is_shared(block));
170 }
171 
172 #undef RAMBLOCK_FOREACH
173 
174 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
175 {
176     RAMBlock *block;
177     int ret = 0;
178 
179     RCU_READ_LOCK_GUARD();
180 
181     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
182         ret = func(block, opaque);
183         if (ret) {
184             break;
185         }
186     }
187     return ret;
188 }
189 
190 static void ramblock_recv_map_init(void)
191 {
192     RAMBlock *rb;
193 
194     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
195         assert(!rb->receivedmap);
196         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
197     }
198 }
199 
200 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
201 {
202     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
203                     rb->receivedmap);
204 }
205 
206 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
207 {
208     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
209 }
210 
211 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
212 {
213     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
214 }
215 
216 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
217                                     size_t nr)
218 {
219     bitmap_set_atomic(rb->receivedmap,
220                       ramblock_recv_bitmap_offset(host_addr, rb),
221                       nr);
222 }
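
/*
 * Note on the helpers above: ramblock_recv_bitmap_offset() (defined
 * elsewhere) is assumed to map a host pointer to a page index inside the
 * block, roughly (host_addr - rb->host) >> TARGET_PAGE_BITS, so
 * receivedmap carries one bit per target page of the RAMBlock.
 */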
223 
224 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
225 
226 /*
227  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
228  *
229  * Returns the number of bytes sent (>0) on success, or <0 on error.
230  */
231 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
232                                   const char *block_name)
233 {
234     RAMBlock *block = qemu_ram_block_by_name(block_name);
235     unsigned long *le_bitmap, nbits;
236     uint64_t size;
237 
238     if (!block) {
239         error_report("%s: invalid block name: %s", __func__, block_name);
240         return -1;
241     }
242 
243     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
244 
245     /*
246      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
247      * machines we may need 4 more bytes for padding (see below
248      * comment). So extend it a bit beforehand.
249      */
250     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
251 
252     /*
253      * Always use little endian when sending the bitmap.  This is
254      * required so the bitmap is interpreted correctly even when source
255      * and destination VMs use different endianness (big endian won't work).
256      */
257     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
258 
259     /* Size of the bitmap, in bytes */
260     size = DIV_ROUND_UP(nbits, 8);
261 
262     /*
263      * size is always aligned to 8 bytes for 64bit machines, but it
264      * may not be true for 32bit machines. We need this padding to
265      * make sure the migration can survive even between 32bit and
266      * 64bit machines.
267      */
268     size = ROUND_UP(size, 8);
269 
270     qemu_put_be64(file, size);
271     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
272     /*
273      * Mark as an end, in case the middle part is screwed up due to
274      * some "mysterious" reason.
275      */
276     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
277     qemu_fflush(file);
278 
279     g_free(le_bitmap);
280 
281     if (qemu_file_get_error(file)) {
282         return qemu_file_get_error(file);
283     }
284 
285     return size + sizeof(size);
286 }
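
/*
 * Stream layout produced above, for reference:
 *
 *   be64  size                          bitmap length in bytes, rounded up to 8
 *   size  bytes                         receivedmap in little-endian bit order
 *   be64  RAMBLOCK_RECV_BITMAP_ENDING   sentinel marking the end
 */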
287 
288 /*
289  * An outstanding page request, on the source, having been received
290  * and queued
291  */
292 struct RAMSrcPageRequest {
293     RAMBlock *rb;
294     hwaddr    offset;
295     hwaddr    len;
296 
297     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
298 };
299 
300 /* State of RAM for migration */
301 struct RAMState {
302     /* QEMUFile used for this migration */
303     QEMUFile *f;
304     /* UFFD file descriptor, used in 'write-tracking' migration */
305     int uffdio_fd;
306     /* Last block that we have visited searching for dirty pages */
307     RAMBlock *last_seen_block;
308     /* Last block from where we have sent data */
309     RAMBlock *last_sent_block;
310     /* Last dirty target page we have sent */
311     ram_addr_t last_page;
312     /* last ram version we have seen */
313     uint32_t last_version;
314     /* How many times in a row we have found the dirty rate too high */
315     int dirty_rate_high_cnt;
316     /* these variables are used for bitmap sync */
317     /* last time we did a full bitmap_sync */
318     int64_t time_last_bitmap_sync;
319     /* bytes transferred at start_time */
320     uint64_t bytes_xfer_prev;
321     /* number of dirty pages since start_time */
322     uint64_t num_dirty_pages_period;
323     /* xbzrle misses since the beginning of the period */
324     uint64_t xbzrle_cache_miss_prev;
325     /* Amount of xbzrle pages since the beginning of the period */
326     uint64_t xbzrle_pages_prev;
327     /* Amount of xbzrle encoded bytes since the beginning of the period */
328     uint64_t xbzrle_bytes_prev;
329     /* Start using XBZRLE (e.g., after the first round). */
330     bool xbzrle_enabled;
331 
332     /* compression statistics since the beginning of the period */
333     /* number of times there was no free thread to compress data */
334     uint64_t compress_thread_busy_prev;
335     /* number of bytes after compression */
336     uint64_t compressed_size_prev;
337     /* number of compressed pages */
338     uint64_t compress_pages_prev;
339 
340     /* total handled target pages at the beginning of period */
341     uint64_t target_page_count_prev;
342     /* total handled target pages since start */
343     uint64_t target_page_count;
344     /* number of dirty bits in the bitmap */
345     uint64_t migration_dirty_pages;
346     /* Protects modification of the bitmap and migration dirty pages */
347     QemuMutex bitmap_mutex;
348     /* The RAMBlock used in the last src_page_requests */
349     RAMBlock *last_req_rb;
350     /* Queue of outstanding page requests from the destination */
351     QemuMutex src_page_req_mutex;
352     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
353 };
354 typedef struct RAMState RAMState;
355 
356 static RAMState *ram_state;
357 
358 static NotifierWithReturnList precopy_notifier_list;
359 
360 void precopy_infrastructure_init(void)
361 {
362     notifier_with_return_list_init(&precopy_notifier_list);
363 }
364 
365 void precopy_add_notifier(NotifierWithReturn *n)
366 {
367     notifier_with_return_list_add(&precopy_notifier_list, n);
368 }
369 
370 void precopy_remove_notifier(NotifierWithReturn *n)
371 {
372     notifier_with_return_remove(n);
373 }
374 
375 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
376 {
377     PrecopyNotifyData pnd;
378     pnd.reason = reason;
379     pnd.errp = errp;
380 
381     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
382 }
383 
384 uint64_t ram_bytes_remaining(void)
385 {
386     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
387                        0;
388 }
389 
390 MigrationStats ram_counters;
391 
392 /* used by the search for pages to send */
393 struct PageSearchStatus {
394     /* Current block being searched */
395     RAMBlock    *block;
396     /* Current page to search from */
397     unsigned long page;
398     /* Set once we wrap around */
399     bool         complete_round;
400 };
401 typedef struct PageSearchStatus PageSearchStatus;
402 
403 CompressionStats compression_counters;
404 
405 struct CompressParam {
406     bool done;
407     bool quit;
408     bool zero_page;
409     QEMUFile *file;
410     QemuMutex mutex;
411     QemuCond cond;
412     RAMBlock *block;
413     ram_addr_t offset;
414 
415     /* internally used fields */
416     z_stream stream;
417     uint8_t *originbuf;
418 };
419 typedef struct CompressParam CompressParam;
420 
421 struct DecompressParam {
422     bool done;
423     bool quit;
424     QemuMutex mutex;
425     QemuCond cond;
426     void *des;
427     uint8_t *compbuf;
428     int len;
429     z_stream stream;
430 };
431 typedef struct DecompressParam DecompressParam;
432 
433 static CompressParam *comp_param;
434 static QemuThread *compress_threads;
435 /* comp_done_cond is used to wake up the migration thread when
436  * one of the compression threads has finished compressing a page.
437  * comp_done_lock is used together with comp_done_cond.
438  */
439 static QemuMutex comp_done_lock;
440 static QemuCond comp_done_cond;
441 /* The empty QEMUFileOps will be used by file in CompressParam */
442 static const QEMUFileOps empty_ops = { };
443 
444 static QEMUFile *decomp_file;
445 static DecompressParam *decomp_param;
446 static QemuThread *decompress_threads;
447 static QemuMutex decomp_done_lock;
448 static QemuCond decomp_done_cond;
449 
450 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
451                                  ram_addr_t offset, uint8_t *source_buf);
452 
453 static void *do_data_compress(void *opaque)
454 {
455     CompressParam *param = opaque;
456     RAMBlock *block;
457     ram_addr_t offset;
458     bool zero_page;
459 
460     qemu_mutex_lock(&param->mutex);
461     while (!param->quit) {
462         if (param->block) {
463             block = param->block;
464             offset = param->offset;
465             param->block = NULL;
466             qemu_mutex_unlock(&param->mutex);
467 
468             zero_page = do_compress_ram_page(param->file, &param->stream,
469                                              block, offset, param->originbuf);
470 
471             qemu_mutex_lock(&comp_done_lock);
472             param->done = true;
473             param->zero_page = zero_page;
474             qemu_cond_signal(&comp_done_cond);
475             qemu_mutex_unlock(&comp_done_lock);
476 
477             qemu_mutex_lock(&param->mutex);
478         } else {
479             qemu_cond_wait(&param->cond, &param->mutex);
480         }
481     }
482     qemu_mutex_unlock(&param->mutex);
483 
484     return NULL;
485 }
486 
487 static void compress_threads_save_cleanup(void)
488 {
489     int i, thread_count;
490 
491     if (!migrate_use_compression() || !comp_param) {
492         return;
493     }
494 
495     thread_count = migrate_compress_threads();
496     for (i = 0; i < thread_count; i++) {
497         /*
498          * we use it as an indicator of whether the thread has been
499          * properly initialized
500          */
501         if (!comp_param[i].file) {
502             break;
503         }
504 
505         qemu_mutex_lock(&comp_param[i].mutex);
506         comp_param[i].quit = true;
507         qemu_cond_signal(&comp_param[i].cond);
508         qemu_mutex_unlock(&comp_param[i].mutex);
509 
510         qemu_thread_join(compress_threads + i);
511         qemu_mutex_destroy(&comp_param[i].mutex);
512         qemu_cond_destroy(&comp_param[i].cond);
513         deflateEnd(&comp_param[i].stream);
514         g_free(comp_param[i].originbuf);
515         qemu_fclose(comp_param[i].file);
516         comp_param[i].file = NULL;
517     }
518     qemu_mutex_destroy(&comp_done_lock);
519     qemu_cond_destroy(&comp_done_cond);
520     g_free(compress_threads);
521     g_free(comp_param);
522     compress_threads = NULL;
523     comp_param = NULL;
524 }
525 
526 static int compress_threads_save_setup(void)
527 {
528     int i, thread_count;
529 
530     if (!migrate_use_compression()) {
531         return 0;
532     }
533     thread_count = migrate_compress_threads();
534     compress_threads = g_new0(QemuThread, thread_count);
535     comp_param = g_new0(CompressParam, thread_count);
536     qemu_cond_init(&comp_done_cond);
537     qemu_mutex_init(&comp_done_lock);
538     for (i = 0; i < thread_count; i++) {
539         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
540         if (!comp_param[i].originbuf) {
541             goto exit;
542         }
543 
544         if (deflateInit(&comp_param[i].stream,
545                         migrate_compress_level()) != Z_OK) {
546             g_free(comp_param[i].originbuf);
547             goto exit;
548         }
549 
550         /* comp_param[i].file is just used as a dummy buffer to save data,
551          * set its ops to empty.
552          */
553         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
554         comp_param[i].done = true;
555         comp_param[i].quit = false;
556         qemu_mutex_init(&comp_param[i].mutex);
557         qemu_cond_init(&comp_param[i].cond);
558         qemu_thread_create(compress_threads + i, "compress",
559                            do_data_compress, comp_param + i,
560                            QEMU_THREAD_JOINABLE);
561     }
562     return 0;
563 
564 exit:
565     compress_threads_save_cleanup();
566     return -1;
567 }
568 
569 /**
570  * save_page_header: write page header to wire
571  *
572  * If this is the 1st block, it also writes the block identification
573  *
574  * Returns the number of bytes written
575  *
576  * @f: QEMUFile where to send the data
577  * @block: block that contains the page we want to send
578  * @offset: offset inside the block for the page
579  *          in the lower bits, it contains flags
580  */
581 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
582                                ram_addr_t offset)
583 {
584     size_t size, len;
585 
586     if (block == rs->last_sent_block) {
587         offset |= RAM_SAVE_FLAG_CONTINUE;
588     }
589     qemu_put_be64(f, offset);
590     size = 8;
591 
592     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
593         len = strlen(block->idstr);
594         qemu_put_byte(f, len);
595         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
596         size += 1 + len;
597         rs->last_sent_block = block;
598     }
599     return size;
600 }
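
/*
 * Example of the header emitted above: the first page sent from a block
 * consists of "be64 (offset | flags)", one byte of idstr length and the
 * idstr itself; later pages from the same block have RAM_SAVE_FLAG_CONTINUE
 * set and carry only the 8-byte be64 word.
 */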
601 
602 /**
603  * mig_throttle_guest_down: throttle down the guest
604  *
605  * Reduce amount of guest cpu execution to hopefully slow down memory
606  * writes. If guest dirty memory rate is reduced below the rate at
607  * which we can transfer pages to the destination then we should be
608  * able to complete migration. Some workloads dirty memory way too
609  * fast and will not effectively converge, even with auto-converge.
610  */
611 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
612                                     uint64_t bytes_dirty_threshold)
613 {
614     MigrationState *s = migrate_get_current();
615     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
616     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
617     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
618     int pct_max = s->parameters.max_cpu_throttle;
619 
620     uint64_t throttle_now = cpu_throttle_get_percentage();
621     uint64_t cpu_now, cpu_ideal, throttle_inc;
622 
623     /* We have not started throttling yet. Let's start it. */
624     if (!cpu_throttle_active()) {
625         cpu_throttle_set(pct_initial);
626     } else {
627         /* Throttling already on, just increase the rate */
628         if (!pct_tailslow) {
629             throttle_inc = pct_increment;
630         } else {
631             /* Compute the ideal CPU percentage for the guest, i.e. the
632              * one that would bring the dirty rate down to the threshold. */
633             cpu_now = 100 - throttle_now;
634             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
635                         bytes_dirty_period);
636             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
637         }
638         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
639     }
640 }
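
/*
 * Worked example of the tailslow branch above: with the throttle currently
 * at 40%, cpu_now is 60; if bytes_dirty_period is twice bytes_dirty_threshold
 * the ratio is 0.5, so cpu_ideal is 30 and the throttle is raised by
 * MIN(60 - 30, pct_increment).
 */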
641 
642 /**
643  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
644  *
645  * @rs: current RAM state
646  * @current_addr: address for the zero page
647  *
648  * Update the xbzrle cache to reflect a page that's been sent as all 0.
649  * The important thing is that a stale (not-yet-0'd) page be replaced
650  * by the new data.
651  * As a bonus, if the page wasn't in the cache it gets added so that
652  * when a small write is made into the 0'd page it gets XBZRLE sent.
653  */
654 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
655 {
656     if (!rs->xbzrle_enabled) {
657         return;
658     }
659 
660     /* We don't care if this fails to allocate a new cache page
661      * as long as it updates an old one */
662     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
663                  ram_counters.dirty_sync_count);
664 }
665 
666 #define ENCODING_FLAG_XBZRLE 0x1
667 
668 /**
669  * save_xbzrle_page: compress and send current page
670  *
671  * Returns: 1 means that we wrote the page
672  *          0 means that page is identical to the one already sent
673  *          -1 means that xbzrle would be longer than normal
674  *
675  * @rs: current RAM state
676  * @current_data: pointer to the address of the page contents
677  * @current_addr: addr of the page
678  * @block: block that contains the page we want to send
679  * @offset: offset inside the block for the page
680  * @last_stage: if we are at the completion stage
681  */
682 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
683                             ram_addr_t current_addr, RAMBlock *block,
684                             ram_addr_t offset, bool last_stage)
685 {
686     int encoded_len = 0, bytes_xbzrle;
687     uint8_t *prev_cached_page;
688 
689     if (!cache_is_cached(XBZRLE.cache, current_addr,
690                          ram_counters.dirty_sync_count)) {
691         xbzrle_counters.cache_miss++;
692         if (!last_stage) {
693             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
694                              ram_counters.dirty_sync_count) == -1) {
695                 return -1;
696             } else {
697                 /* update *current_data when the page has been
698                    inserted into cache */
699                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
700             }
701         }
702         return -1;
703     }
704 
705     /*
706      * Reaching here means the page has hit the xbzrle cache, no matter what
707      * encoding result it is (normal encoding, overflow or skipping the page),
708      * count the page as encoded. This is used to calculate the encoding rate.
709      *
710      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
711      * 2nd page turns out to be skipped (i.e. no new bytes written to the
712      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
713      * skipped page included. In this way, the encoding rate can tell if the
714      * guest page is good for xbzrle encoding.
715      */
716     xbzrle_counters.pages++;
717     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
718 
719     /* save current buffer into memory */
720     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
721 
722     /* XBZRLE encoding (if there is no overflow) */
723     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
724                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
725                                        TARGET_PAGE_SIZE);
726 
727     /*
728      * Update the cache contents, so that it corresponds to the data
729      * sent, in all cases except where we skip the page.
730      */
731     if (!last_stage && encoded_len != 0) {
732         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
733         /*
734          * In the case where we couldn't compress, ensure that the caller
735          * sends the data from the cache, since the guest might have
736          * changed the RAM since we copied it.
737          */
738         *current_data = prev_cached_page;
739     }
740 
741     if (encoded_len == 0) {
742         trace_save_xbzrle_page_skipping();
743         return 0;
744     } else if (encoded_len == -1) {
745         trace_save_xbzrle_page_overflow();
746         xbzrle_counters.overflow++;
747         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
748         return -1;
749     }
750 
751     /* Send XBZRLE based compressed page */
752     bytes_xbzrle = save_page_header(rs, rs->f, block,
753                                     offset | RAM_SAVE_FLAG_XBZRLE);
754     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
755     qemu_put_be16(rs->f, encoded_len);
756     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
757     bytes_xbzrle += encoded_len + 1 + 2;
758     /*
759      * Like compressed_size (please see update_compress_thread_counts),
760      * the xbzrle encoded bytes don't count the 8 byte header with
761      * RAM_SAVE_FLAG_CONTINUE.
762      */
763     xbzrle_counters.bytes += bytes_xbzrle - 8;
764     ram_counters.transferred += bytes_xbzrle;
765 
766     return 1;
767 }
768 
769 /**
770  * migration_bitmap_find_dirty: find the next dirty page from start
771  *
772  * Returns the page offset within memory region of the start of a dirty page
773  *
774  * @rs: current RAM state
775  * @rb: RAMBlock where to search for dirty pages
776  * @start: page where we start the search
777  */
778 static inline
779 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
780                                           unsigned long start)
781 {
782     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
783     unsigned long *bitmap = rb->bmap;
784 
785     if (ramblock_is_ignored(rb)) {
786         return size;
787     }
788 
789     return find_next_bit(bitmap, size, start);
790 }
791 
792 static void migration_clear_memory_region_dirty_bitmap(RAMState *rs,
793                                                        RAMBlock *rb,
794                                                        unsigned long page)
795 {
796     uint8_t shift;
797     hwaddr size, start;
798 
799     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
800         return;
801     }
802 
803     shift = rb->clear_bmap_shift;
804     /*
805      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this.  It makes
806      * things easier because the start address of each small chunk is
807      * then always aligned to 64 pages, so the bitmap stays aligned to
808      * an unsigned long.  We could probably even remove this
809      * restriction, but keep it for now.
811      */
812     assert(shift >= 6);
813 
814     size = 1ULL << (TARGET_PAGE_BITS + shift);
815     start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
816     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
817     memory_region_clear_dirty_bitmap(rb->mr, start, size);
818 }
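
/*
 * Worked example: with clear_bmap_shift = 18 and 4KiB target pages, each
 * clear_bmap bit covers 1ULL << (12 + 18) bytes = 1GiB, so the first time
 * any page inside such a chunk is about to be sent, the memory core's
 * dirty bitmap is cleared for the whole aligned 1GiB range at once.
 */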
819 
820 static void
821 migration_clear_memory_region_dirty_bitmap_range(RAMState *rs,
822                                                  RAMBlock *rb,
823                                                  unsigned long start,
824                                                  unsigned long npages)
825 {
826     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
827     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
828     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
829 
830     /*
831      * Clear pages from start to start + npages - 1, so the end boundary is
832      * exclusive.
833      */
834     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
835         migration_clear_memory_region_dirty_bitmap(rs, rb, i);
836     }
837 }
838 
839 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
840                                                 RAMBlock *rb,
841                                                 unsigned long page)
842 {
843     bool ret;
844 
845     /*
846      * Clear dirty bitmap if needed.  This _must_ be called before we
847      * send any of the pages in the chunk, because we need to make sure
848      * we can capture further page content changes when we sync the
849      * dirty log next time.  So as long as we are going to send any page
850      * in the chunk, we clear the remote dirty bitmap for the whole chunk.
851      * Clearing it earlier won't be a problem, but too late will.
852      */
853     migration_clear_memory_region_dirty_bitmap(rs, rb, page);
854 
855     ret = test_and_clear_bit(page, rb->bmap);
856     if (ret) {
857         rs->migration_dirty_pages--;
858     }
859 
860     return ret;
861 }
862 
863 /* Called with RCU critical section */
864 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
865 {
866     uint64_t new_dirty_pages =
867         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
868 
869     rs->migration_dirty_pages += new_dirty_pages;
870     rs->num_dirty_pages_period += new_dirty_pages;
871 }
872 
873 /**
874  * ram_pagesize_summary: calculate all the pagesizes of a VM
875  *
876  * Returns a summary bitmap of the page sizes of all RAMBlocks
877  *
878  * For VMs with just normal pages this is equivalent to the host page
879  * size. If it's got some huge pages then it's the OR of all the
880  * different page sizes.
881  */
882 uint64_t ram_pagesize_summary(void)
883 {
884     RAMBlock *block;
885     uint64_t summary = 0;
886 
887     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
888         summary |= block->page_size;
889     }
890 
891     return summary;
892 }
893 
894 uint64_t ram_get_total_transferred_pages(void)
895 {
896     return  ram_counters.normal + ram_counters.duplicate +
897                 compression_counters.pages + xbzrle_counters.pages;
898 }
899 
900 static void migration_update_rates(RAMState *rs, int64_t end_time)
901 {
902     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
903     double compressed_size;
904 
905     /* calculate period counters */
906     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
907                 / (end_time - rs->time_last_bitmap_sync);
908 
909     if (!page_count) {
910         return;
911     }
912 
913     if (migrate_use_xbzrle()) {
914         double encoded_size, unencoded_size;
915 
916         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
917             rs->xbzrle_cache_miss_prev) / page_count;
918         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
919         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
920                          TARGET_PAGE_SIZE;
921         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
922         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
923             xbzrle_counters.encoding_rate = 0;
924         } else {
925             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
926         }
927         rs->xbzrle_pages_prev = xbzrle_counters.pages;
928         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
929     }
930 
931     if (migrate_use_compression()) {
932         compression_counters.busy_rate = (double)(compression_counters.busy -
933             rs->compress_thread_busy_prev) / page_count;
934         rs->compress_thread_busy_prev = compression_counters.busy;
935 
936         compressed_size = compression_counters.compressed_size -
937                           rs->compressed_size_prev;
938         if (compressed_size) {
939             double uncompressed_size = (compression_counters.pages -
940                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
941 
942             /* Compression-Ratio = Uncompressed-size / Compressed-size */
943             compression_counters.compression_rate =
944                                         uncompressed_size / compressed_size;
945 
946             rs->compress_pages_prev = compression_counters.pages;
947             rs->compressed_size_prev = compression_counters.compressed_size;
948         }
949     }
950 }
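
/*
 * Example of the period rates computed above: if 1024 xbzrle pages (4MiB
 * with 4KiB target pages) were encoded into 1MiB during the period, the
 * encoding_rate is 4; compression_rate is likewise the ratio of
 * uncompressed to compressed bytes for the pages handled in the period.
 */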
951 
952 static void migration_trigger_throttle(RAMState *rs)
953 {
954     MigrationState *s = migrate_get_current();
955     uint64_t threshold = s->parameters.throttle_trigger_threshold;
956 
957     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
958     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
959     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
960 
961     /* During block migration the auto-converge logic incorrectly detects
962      * that ram migration makes no progress. Avoid this by disabling the
963      * throttling logic during the bulk phase of block migration. */
964     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
965         /* The following detection logic can be refined later. For now:
966            Check to see if the ratio between dirtied bytes and the approx.
967            amount of bytes that just got transferred since the last time
968            we were in this routine reaches the threshold. If that happens
969            twice, start or increase throttling. */
970 
971         if ((bytes_dirty_period > bytes_dirty_threshold) &&
972             (++rs->dirty_rate_high_cnt >= 2)) {
973             trace_migration_throttle();
974             rs->dirty_rate_high_cnt = 0;
975             mig_throttle_guest_down(bytes_dirty_period,
976                                     bytes_dirty_threshold);
977         }
978     }
979 }
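
/*
 * Worked example of the trigger above: with throttle_trigger_threshold = 50
 * and 100MB transferred since the last sync, bytes_dirty_threshold is 50MB;
 * if the guest dirtied more than that during the period for two consecutive
 * sync rounds, throttling starts or is increased.
 */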
980 
981 static void migration_bitmap_sync(RAMState *rs)
982 {
983     RAMBlock *block;
984     int64_t end_time;
985 
986     ram_counters.dirty_sync_count++;
987 
988     if (!rs->time_last_bitmap_sync) {
989         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
990     }
991 
992     trace_migration_bitmap_sync_start();
993     memory_global_dirty_log_sync();
994 
995     qemu_mutex_lock(&rs->bitmap_mutex);
996     WITH_RCU_READ_LOCK_GUARD() {
997         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
998             ramblock_sync_dirty_bitmap(rs, block);
999         }
1000         ram_counters.remaining = ram_bytes_remaining();
1001     }
1002     qemu_mutex_unlock(&rs->bitmap_mutex);
1003 
1004     memory_global_after_dirty_log_sync();
1005     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1006 
1007     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1008 
1009     /* more than 1 second = 1000 milliseconds */
1010     if (end_time > rs->time_last_bitmap_sync + 1000) {
1011         migration_trigger_throttle(rs);
1012 
1013         migration_update_rates(rs, end_time);
1014 
1015         rs->target_page_count_prev = rs->target_page_count;
1016 
1017         /* reset period counters */
1018         rs->time_last_bitmap_sync = end_time;
1019         rs->num_dirty_pages_period = 0;
1020         rs->bytes_xfer_prev = ram_counters.transferred;
1021     }
1022     if (migrate_use_events()) {
1023         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1024     }
1025 }
1026 
1027 static void migration_bitmap_sync_precopy(RAMState *rs)
1028 {
1029     Error *local_err = NULL;
1030 
1031     /*
1032      * The current notifier usage is just an optimization to migration, so we
1033      * don't stop the normal migration process in the error case.
1034      */
1035     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1036         error_report_err(local_err);
1037         local_err = NULL;
1038     }
1039 
1040     migration_bitmap_sync(rs);
1041 
1042     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1043         error_report_err(local_err);
1044     }
1045 }
1046 
1047 /**
1048  * save_zero_page_to_file: send the zero page to the file
1049  *
1050  * Returns the size of data written to the file, 0 means the page is not
1051  * a zero page
1052  *
1053  * @rs: current RAM state
1054  * @file: the file where the data is saved
1055  * @block: block that contains the page we want to send
1056  * @offset: offset inside the block for the page
1057  */
1058 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1059                                   RAMBlock *block, ram_addr_t offset)
1060 {
1061     uint8_t *p = block->host + offset;
1062     int len = 0;
1063 
1064     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1065         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1066         qemu_put_byte(file, 0);
1067         len += 1;
1068     }
1069     return len;
1070 }
1071 
1072 /**
1073  * save_zero_page: send the zero page to the stream
1074  *
1075  * Returns the number of pages written.
1076  *
1077  * @rs: current RAM state
1078  * @block: block that contains the page we want to send
1079  * @offset: offset inside the block for the page
1080  */
1081 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1082 {
1083     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1084 
1085     if (len) {
1086         ram_counters.duplicate++;
1087         ram_counters.transferred += len;
1088         return 1;
1089     }
1090     return -1;
1091 }
1092 
1093 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1094 {
1095     if (!migrate_release_ram() || !migration_in_postcopy()) {
1096         return;
1097     }
1098 
1099     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1100 }
1101 
1102 /*
1103  * @pages: the number of pages written by the control path,
1104  *        < 0 - error
1105  *        > 0 - number of pages written
1106  *
1107  * Return true if the page has been saved, otherwise false is returned.
1108  */
1109 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1110                               int *pages)
1111 {
1112     uint64_t bytes_xmit = 0;
1113     int ret;
1114 
1115     *pages = -1;
1116     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1117                                 &bytes_xmit);
1118     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1119         return false;
1120     }
1121 
1122     if (bytes_xmit) {
1123         ram_counters.transferred += bytes_xmit;
1124         *pages = 1;
1125     }
1126 
1127     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1128         return true;
1129     }
1130 
1131     if (bytes_xmit > 0) {
1132         ram_counters.normal++;
1133     } else if (bytes_xmit == 0) {
1134         ram_counters.duplicate++;
1135     }
1136 
1137     return true;
1138 }
1139 
1140 /*
1141  * directly send the page to the stream
1142  *
1143  * Returns the number of pages written.
1144  *
1145  * @rs: current RAM state
1146  * @block: block that contains the page we want to send
1147  * @offset: offset inside the block for the page
1148  * @buf: the page to be sent
1149  * @async: send the page asynchronously
1150  */
1151 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1152                             uint8_t *buf, bool async)
1153 {
1154     ram_counters.transferred += save_page_header(rs, rs->f, block,
1155                                                  offset | RAM_SAVE_FLAG_PAGE);
1156     if (async) {
1157         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1158                               migrate_release_ram() &
1159                               migration_in_postcopy());
1160     } else {
1161         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1162     }
1163     ram_counters.transferred += TARGET_PAGE_SIZE;
1164     ram_counters.normal++;
1165     return 1;
1166 }
1167 
1168 /**
1169  * ram_save_page: send the given page to the stream
1170  *
1171  * Returns the number of pages written.
1172  *          < 0 - error
1173  *          >=0 - Number of pages written - this might legally be 0
1174  *                if xbzrle noticed the page was the same.
1175  *
1176  * @rs: current RAM state
1177  * @block: block that contains the page we want to send
1178  * @offset: offset inside the block for the page
1179  * @last_stage: if we are at the completion stage
1180  */
1181 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1182 {
1183     int pages = -1;
1184     uint8_t *p;
1185     bool send_async = true;
1186     RAMBlock *block = pss->block;
1187     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1188     ram_addr_t current_addr = block->offset + offset;
1189 
1190     p = block->host + offset;
1191     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1192 
1193     XBZRLE_cache_lock();
1194     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1195         pages = save_xbzrle_page(rs, &p, current_addr, block,
1196                                  offset, last_stage);
1197         if (!last_stage) {
1198             /* Can't send this cached data async, since the cache page
1199              * might get updated before it gets to the wire
1200              */
1201             send_async = false;
1202         }
1203     }
1204 
1205     /* XBZRLE overflow or normal page */
1206     if (pages == -1) {
1207         pages = save_normal_page(rs, block, offset, p, send_async);
1208     }
1209 
1210     XBZRLE_cache_unlock();
1211 
1212     return pages;
1213 }
1214 
1215 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1216                                  ram_addr_t offset)
1217 {
1218     if (multifd_queue_page(rs->f, block, offset) < 0) {
1219         return -1;
1220     }
1221     ram_counters.normal++;
1222 
1223     return 1;
1224 }
1225 
1226 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1227                                  ram_addr_t offset, uint8_t *source_buf)
1228 {
1229     RAMState *rs = ram_state;
1230     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1231     bool zero_page = false;
1232     int ret;
1233 
1234     if (save_zero_page_to_file(rs, f, block, offset)) {
1235         zero_page = true;
1236         goto exit;
1237     }
1238 
1239     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1240 
1241     /*
1242      * copy it to an internal buffer to avoid it being modified by the
1243      * VM, so that we can reliably catch errors during compression and
1244      * decompression
1245      */
1246     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1247     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1248     if (ret < 0) {
1249         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1250         error_report("compressed data failed!");
1251         return false;
1252     }
1253 
1254 exit:
1255     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1256     return zero_page;
1257 }
1258 
1259 static void
1260 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1261 {
1262     ram_counters.transferred += bytes_xmit;
1263 
1264     if (param->zero_page) {
1265         ram_counters.duplicate++;
1266         return;
1267     }
1268 
1269     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1270     compression_counters.compressed_size += bytes_xmit - 8;
1271     compression_counters.pages++;
1272 }
1273 
1274 static bool save_page_use_compression(RAMState *rs);
1275 
1276 static void flush_compressed_data(RAMState *rs)
1277 {
1278     int idx, len, thread_count;
1279 
1280     if (!save_page_use_compression(rs)) {
1281         return;
1282     }
1283     thread_count = migrate_compress_threads();
1284 
1285     qemu_mutex_lock(&comp_done_lock);
1286     for (idx = 0; idx < thread_count; idx++) {
1287         while (!comp_param[idx].done) {
1288             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1289         }
1290     }
1291     qemu_mutex_unlock(&comp_done_lock);
1292 
1293     for (idx = 0; idx < thread_count; idx++) {
1294         qemu_mutex_lock(&comp_param[idx].mutex);
1295         if (!comp_param[idx].quit) {
1296             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1297             /*
1298              * it's safe to fetch zero_page without holding comp_done_lock
1299              * as there is no further request submitted to the thread,
1300              * i.e., the thread should be waiting for a request at this point.
1301              */
1302             update_compress_thread_counts(&comp_param[idx], len);
1303         }
1304         qemu_mutex_unlock(&comp_param[idx].mutex);
1305     }
1306 }
1307 
1308 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1309                                        ram_addr_t offset)
1310 {
1311     param->block = block;
1312     param->offset = offset;
1313 }
1314 
1315 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1316                                            ram_addr_t offset)
1317 {
1318     int idx, thread_count, bytes_xmit = -1, pages = -1;
1319     bool wait = migrate_compress_wait_thread();
1320 
1321     thread_count = migrate_compress_threads();
1322     qemu_mutex_lock(&comp_done_lock);
1323 retry:
1324     for (idx = 0; idx < thread_count; idx++) {
1325         if (comp_param[idx].done) {
1326             comp_param[idx].done = false;
1327             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1328             qemu_mutex_lock(&comp_param[idx].mutex);
1329             set_compress_params(&comp_param[idx], block, offset);
1330             qemu_cond_signal(&comp_param[idx].cond);
1331             qemu_mutex_unlock(&comp_param[idx].mutex);
1332             pages = 1;
1333             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1334             break;
1335         }
1336     }
1337 
1338     /*
1339      * wait for a free thread if the user specifies 'compress-wait-thread',
1340      * otherwise we post the page out from the main thread as a normal page.
1341      */
1342     if (pages < 0 && wait) {
1343         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1344         goto retry;
1345     }
1346     qemu_mutex_unlock(&comp_done_lock);
1347 
1348     return pages;
1349 }
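
/*
 * Hand-off with the worker threads, in short: the main thread claims an
 * idle worker (done == true), drains whatever the worker's private QEMUFile
 * still holds from its previous page, then fills in block/offset under
 * param->mutex and signals param->cond; do_data_compress() compresses into
 * that private file and flips done back under comp_done_lock.
 */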
1350 
1351 /**
1352  * find_dirty_block: find the next dirty page and update any state
1353  * associated with the search process.
1354  *
1355  * Returns true if a page is found
1356  *
1357  * @rs: current RAM state
1358  * @pss: data about the state of the current dirty page scan
1359  * @again: set to false if the search has scanned the whole of RAM
1360  */
1361 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1362 {
1363     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1364     if (pss->complete_round && pss->block == rs->last_seen_block &&
1365         pss->page >= rs->last_page) {
1366         /*
1367          * We've been once around the RAM and haven't found anything.
1368          * Give up.
1369          */
1370         *again = false;
1371         return false;
1372     }
1373     if (!offset_in_ramblock(pss->block,
1374                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1375         /* Didn't find anything in this RAM Block */
1376         pss->page = 0;
1377         pss->block = QLIST_NEXT_RCU(pss->block, next);
1378         if (!pss->block) {
1379             /*
1380              * If memory migration starts over, we will meet a dirtied page
1381              * which may still exist in the compression threads' ring, so we
1382              * should flush the compressed data to make sure the new page
1383              * is not overwritten by the old one in the destination.
1384              *
1385              * Also, if xbzrle is on, stop using the data compression at this
1386              * point. In theory, xbzrle can do better than compression.
1387              */
1388             flush_compressed_data(rs);
1389 
1390             /* Hit the end of the list */
1391             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1392             /* Flag that we've looped */
1393             pss->complete_round = true;
1394             /* After the first round, enable XBZRLE. */
1395             if (migrate_use_xbzrle()) {
1396                 rs->xbzrle_enabled = true;
1397             }
1398         }
1399         /* Didn't find anything this time, but try again on the new block */
1400         *again = true;
1401         return false;
1402     } else {
1403         /* Can go around again, but... */
1404         *again = true;
1405         /* We've found something so probably don't need to */
1406         return true;
1407     }
1408 }
1409 
1410 /**
1411  * unqueue_page: gets a page off the queue
1412  *
1413  * Helper for 'get_queued_page' - gets a page off the queue
1414  *
1415  * Returns the block of the page (or NULL if none available)
1416  *
1417  * @rs: current RAM state
1418  * @offset: used to return the offset within the RAMBlock
1419  */
1420 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1421 {
1422     RAMBlock *block = NULL;
1423 
1424     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1425         return NULL;
1426     }
1427 
1428     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1429     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1430         struct RAMSrcPageRequest *entry =
1431                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1432         block = entry->rb;
1433         *offset = entry->offset;
1434 
1435         if (entry->len > TARGET_PAGE_SIZE) {
1436             entry->len -= TARGET_PAGE_SIZE;
1437             entry->offset += TARGET_PAGE_SIZE;
1438         } else {
1439             memory_region_unref(block->mr);
1440             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1441             g_free(entry);
1442             migration_consume_urgent_request();
1443         }
1444     }
1445 
1446     return block;
1447 }
1448 
1449 #if defined(__linux__)
1450 /**
1451  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1452  *   is found, return RAM block pointer and page offset
1453  *
1454  * Returns pointer to the RAMBlock containing faulting page,
1455  *   NULL if no write faults are pending
1456  *
1457  * @rs: current RAM state
1458  * @offset: page offset from the beginning of the block
1459  */
1460 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1461 {
1462     struct uffd_msg uffd_msg;
1463     void *page_address;
1464     RAMBlock *block;
1465     int res;
1466 
1467     if (!migrate_background_snapshot()) {
1468         return NULL;
1469     }
1470 
1471     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1472     if (res <= 0) {
1473         return NULL;
1474     }
1475 
1476     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1477     block = qemu_ram_block_from_host(page_address, false, offset);
1478     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1479     return block;
1480 }
1481 
1482 /**
1483  * ram_save_release_protection: release UFFD write protection after
1484  *   a range of pages has been saved
1485  *
1486  * @rs: current RAM state
1487  * @pss: page-search-status structure
1488  * @start_page: index of the first page in the range relative to pss->block
1489  *
1490  * Returns 0 on success, negative value in case of an error
1491 */
1492 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1493         unsigned long start_page)
1494 {
1495     int res = 0;
1496 
1497     /* Check if page is from UFFD-managed region. */
1498     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1499         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1500         uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1501 
1502         /* Flush async buffers before un-protect. */
1503         qemu_fflush(rs->f);
1504         /* Un-protect memory range. */
1505         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1506                 false, false);
1507     }
1508 
1509     return res;
1510 }
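
/*
 * Example of the arithmetic above (assuming 4 KiB target pages): with
 * start_page == 4 and pss->page == 7, run_length is (7 - 4 + 1) << 12 ==
 * 16 KiB, i.e. write protection is dropped for exactly the four target
 * pages that were just saved, starting at pss->block->host + (4 << 12).
 */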
1511 
1512 /* ram_write_tracking_available: check if kernel supports required UFFD features
1513  *
1514  * Returns true if supported, false otherwise
1515  */
1516 bool ram_write_tracking_available(void)
1517 {
1518     uint64_t uffd_features;
1519     int res;
1520 
1521     res = uffd_query_features(&uffd_features);
1522     return (res == 0 &&
1523             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1524 }
1525 
1526 /* ram_write_tracking_compatible: check if guest configuration is
1527  *   compatible with 'write-tracking'
1528  *
1529  * Returns true if compatible, false otherwise
1530  */
1531 bool ram_write_tracking_compatible(void)
1532 {
1533     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1534     int uffd_fd;
1535     RAMBlock *block;
1536     bool ret = false;
1537 
1538     /* Open UFFD file descriptor */
1539     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1540     if (uffd_fd < 0) {
1541         return false;
1542     }
1543 
1544     RCU_READ_LOCK_GUARD();
1545 
1546     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1547         uint64_t uffd_ioctls;
1548 
1549         /* Nothing to do with read-only and MMIO-writable regions */
1550         if (block->mr->readonly || block->mr->rom_device) {
1551             continue;
1552         }
1553         /* Try to register block memory via UFFD-IO to track writes */
1554         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1555                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1556             goto out;
1557         }
1558         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1559             goto out;
1560         }
1561     }
1562     ret = true;
1563 
1564 out:
1565     uffd_close_fd(uffd_fd);
1566     return ret;
1567 }
1568 
1569 /*
1570  * ram_block_populate_pages: populate memory in the RAM block by reading
1571  *   a byte from the beginning of each page.
1572  *
1573  * Since it's solely used for userfault_fd WP feature, here we just
1574  *   hardcode page size to qemu_real_host_page_size.
1575  *
1576  * @block: RAM block to populate
1577  */
1578 static void ram_block_populate_pages(RAMBlock *block)
1579 {
1580     char *ptr = (char *) block->host;
1581 
1582     for (ram_addr_t offset = 0; offset < block->used_length;
1583             offset += qemu_real_host_page_size) {
1584         char tmp = *(ptr + offset);
1585 
1586         /* Don't optimize the read out */
1587         asm volatile("" : "+r" (tmp));
1588     }
1589 }
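
/*
 * The empty asm above with a "+r" operand tells the compiler that "tmp"
 * is both read and written, so the preceding load (which faults the page
 * in) cannot be optimized away.  A rough equivalent, at the cost of a
 * volatile access, would be:
 *
 *     volatile char *p = ptr + offset;
 *     (void)*p;
 */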
1590 
1591 /*
1592  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1593  */
1594 void ram_write_tracking_prepare(void)
1595 {
1596     RAMBlock *block;
1597 
1598     RCU_READ_LOCK_GUARD();
1599 
1600     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1601         /* Nothing to do with read-only and MMIO-writable regions */
1602         if (block->mr->readonly || block->mr->rom_device) {
1603             continue;
1604         }
1605 
1606         /*
1607          * Populate pages of the RAM block before enabling userfault_fd
1608          * write protection.
1609          *
1610          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1611          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1612          * pages with pte_none() entries in page table.
1613          */
1614         ram_block_populate_pages(block);
1615     }
1616 }
1617 
1618 /*
1619  * ram_write_tracking_start: start UFFD-WP memory tracking
1620  *
1621  * Returns 0 for success or negative value in case of error
1622  */
1623 int ram_write_tracking_start(void)
1624 {
1625     int uffd_fd;
1626     RAMState *rs = ram_state;
1627     RAMBlock *block;
1628 
1629     /* Open UFFD file descriptor */
1630     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1631     if (uffd_fd < 0) {
1632         return uffd_fd;
1633     }
1634     rs->uffdio_fd = uffd_fd;
1635 
1636     RCU_READ_LOCK_GUARD();
1637 
1638     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1639         /* Nothing to do with read-only and MMIO-writable regions */
1640         if (block->mr->readonly || block->mr->rom_device) {
1641             continue;
1642         }
1643 
1644         /* Register block memory with UFFD to track writes */
1645         if (uffd_register_memory(rs->uffdio_fd, block->host,
1646                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1647             goto fail;
1648         }
1649         /* Apply UFFD write protection to the block memory range */
1650         if (uffd_change_protection(rs->uffdio_fd, block->host,
1651                 block->max_length, true, false)) {
1652             goto fail;
1653         }
1654         block->flags |= RAM_UF_WRITEPROTECT;
1655         memory_region_ref(block->mr);
1656 
1657         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1658                 block->host, block->max_length);
1659     }
1660 
1661     return 0;
1662 
1663 fail:
1664     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1665 
1666     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1667         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1668             continue;
1669         }
1670         /*
1671          * In case some memory block failed to be write-protected,
1672          * remove protection and unregister all RAM blocks that succeeded
1673          */
1674         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1675                 false, false);
1676         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1677         /* Cleanup flags and remove reference */
1678         block->flags &= ~RAM_UF_WRITEPROTECT;
1679         memory_region_unref(block->mr);
1680     }
1681 
1682     uffd_close_fd(uffd_fd);
1683     rs->uffdio_fd = -1;
1684     return -1;
1685 }
1686 
1687 /**
1688  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1689  */
1690 void ram_write_tracking_stop(void)
1691 {
1692     RAMState *rs = ram_state;
1693     RAMBlock *block;
1694 
1695     RCU_READ_LOCK_GUARD();
1696 
1697     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1698         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1699             continue;
1700         }
1701         /* Remove protection and unregister all affected RAM blocks */
1702         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1703                 false, false);
1704         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1705 
1706         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1707                 block->host, block->max_length);
1708 
1709         /* Cleanup flags and remove reference */
1710         block->flags &= ~RAM_UF_WRITEPROTECT;
1711         memory_region_unref(block->mr);
1712     }
1713 
1714     /* Finally close UFFD file descriptor */
1715     uffd_close_fd(rs->uffdio_fd);
1716     rs->uffdio_fd = -1;
1717 }
1718 
1719 #else
1720 /* No target OS support, stubs just fail or ignore */
1721 
1722 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1723 {
1724     (void) rs;
1725     (void) offset;
1726 
1727     return NULL;
1728 }
1729 
1730 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1731         unsigned long start_page)
1732 {
1733     (void) rs;
1734     (void) pss;
1735     (void) start_page;
1736 
1737     return 0;
1738 }
1739 
1740 bool ram_write_tracking_available(void)
1741 {
1742     return false;
1743 }
1744 
1745 bool ram_write_tracking_compatible(void)
1746 {
1747     assert(0);
1748     return false;
1749 }
1750 
1751 int ram_write_tracking_start(void)
1752 {
1753     assert(0);
1754     return -1;
1755 }
1756 
1757 void ram_write_tracking_stop(void)
1758 {
1759     assert(0);
1760 }
1761 #endif /* defined(__linux__) */
1762 
1763 /**
1764  * get_queued_page: unqueue a page from the postcopy requests
1765  *
1766  * Skips pages that are already sent (!dirty)
1767  *
1768  * Returns true if a queued page is found
1769  *
1770  * @rs: current RAM state
1771  * @pss: data about the state of the current dirty page scan
1772  */
1773 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1774 {
1775     RAMBlock  *block;
1776     ram_addr_t offset;
1777     bool dirty;
1778 
1779     do {
1780         block = unqueue_page(rs, &offset);
1781         /*
1782          * We're sending this page, and since it's postcopy nothing else
1783          * will dirty it, and we must make sure it doesn't get sent again
1784          * even if this queue request was received after the background
1785          * search already sent it.
1786          */
1787         if (block) {
1788             unsigned long page;
1789 
1790             page = offset >> TARGET_PAGE_BITS;
1791             dirty = test_bit(page, block->bmap);
1792             if (!dirty) {
1793                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1794                                                 page);
1795             } else {
1796                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1797             }
1798         }
1799 
1800     } while (block && !dirty);
1801 
1802     if (!block) {
1803         /*
1804          * Poll write faults too if background snapshot is enabled; that's
1805          * when vCPUs get blocked by write-protected pages.
1806          */
1807         block = poll_fault_page(rs, &offset);
1808     }
1809 
1810     if (block) {
1811         /*
1812          * We want the background search to continue from the queued page
1813          * since the guest is likely to want other pages near to the page
1814          * it just requested.
1815          */
1816         pss->block = block;
1817         pss->page = offset >> TARGET_PAGE_BITS;
1818 
1819         /*
1820          * This unqueued page would break the "one round" check, even is
1821          * This unqueued page would break the "one round" check, even if
1822          * that is really rare.
1823         pss->complete_round = false;
1824     }
1825 
1826     return !!block;
1827 }
1828 
1829 /**
1830  * migration_page_queue_free: drop any remaining pages in the ram
1831  * request queue
1832  *
1833  * It should be empty at the end anyway, but in error cases there may
1834  * be some left.  If any page is left, we drop it.
1835  *
1836  */
1837 static void migration_page_queue_free(RAMState *rs)
1838 {
1839     struct RAMSrcPageRequest *mspr, *next_mspr;
1840     /* This queue should generally be empty - but a failed
1841      * migration might leave some entries behind.
1842      */
1843     RCU_READ_LOCK_GUARD();
1844     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1845         memory_region_unref(mspr->rb->mr);
1846         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1847         g_free(mspr);
1848     }
1849 }
1850 
1851 /**
1852  * ram_save_queue_pages: queue the page for transmission
1853  *
1854  * A request from postcopy destination for example.
1855  *
1856  * Returns zero on success or negative on error
1857  *
1858  * @rbname: Name of the RAMBlock of the request. NULL means the
1859  *          same as the last one.
1860  * @start: starting address from the start of the RAMBlock
1861  * @len: length (in bytes) to send
1862  */
1863 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1864 {
1865     RAMBlock *ramblock;
1866     RAMState *rs = ram_state;
1867 
1868     ram_counters.postcopy_requests++;
1869     RCU_READ_LOCK_GUARD();
1870 
1871     if (!rbname) {
1872         /* Reuse last RAMBlock */
1873         ramblock = rs->last_req_rb;
1874 
1875         if (!ramblock) {
1876             /*
1877              * Shouldn't happen, we can't reuse the last RAMBlock if
1878              * it's the 1st request.
1879              */
1880             error_report("ram_save_queue_pages no previous block");
1881             return -1;
1882         }
1883     } else {
1884         ramblock = qemu_ram_block_by_name(rbname);
1885 
1886         if (!ramblock) {
1887             /* We shouldn't be asked for a non-existent RAMBlock */
1888             error_report("ram_save_queue_pages no block '%s'", rbname);
1889             return -1;
1890         }
1891         rs->last_req_rb = ramblock;
1892     }
1893     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1894     if (!offset_in_ramblock(ramblock, start + len - 1)) {
1895         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1896                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1897                      __func__, start, len, ramblock->used_length);
1898         return -1;
1899     }
1900 
1901     struct RAMSrcPageRequest *new_entry =
1902         g_malloc0(sizeof(struct RAMSrcPageRequest));
1903     new_entry->rb = ramblock;
1904     new_entry->offset = start;
1905     new_entry->len = len;
1906 
1907     memory_region_ref(ramblock->mr);
1908     qemu_mutex_lock(&rs->src_page_req_mutex);
1909     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1910     migration_make_urgent_request();
1911     qemu_mutex_unlock(&rs->src_page_req_mutex);
1912 
1913     return 0;
1914 }
1915 
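/*
 * save_page_use_compression: check whether the compression path should be
 *   used for the next page
 *
 * Returns true only if compression is enabled and xbzrle has not kicked
 * in yet; once rs->xbzrle_enabled is set, xbzrle is preferred and the
 * compression threads are bypassed.
 */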
1916 static bool save_page_use_compression(RAMState *rs)
1917 {
1918     if (!migrate_use_compression()) {
1919         return false;
1920     }
1921 
1922     /*
1923      * If xbzrle is enabled (e.g., after first round of migration), stop
1924      * using the data compression. In theory, xbzrle can do better than
1925      * compression.
1926      */
1927     if (rs->xbzrle_enabled) {
1928         return false;
1929     }
1930 
1931     return true;
1932 }
1933 
1934 /*
1935  * try to compress the page before posting it out, return true if the page
1936  * has been properly handled by compression, otherwise it needs other
1937  * paths to handle it
1938  */
1939 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1940 {
1941     if (!save_page_use_compression(rs)) {
1942         return false;
1943     }
1944 
1945     /*
1946      * When starting to process a new block, the first page of the
1947      * block should be sent out before other pages in the same block,
1948      * and all pages in the previous block should have been sent out.
1949      * Keeping this order is important because the 'cont' flag
1950      * is used to avoid resending the block name.
1951      *
1952      * We post the first page as a normal page since compression
1953      * would take a lot of CPU resources.
1954      */
1955     if (block != rs->last_sent_block) {
1956         flush_compressed_data(rs);
1957         return false;
1958     }
1959 
1960     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1961         return true;
1962     }
1963 
1964     compression_counters.busy++;
1965     return false;
1966 }
1967 
1968 /**
1969  * ram_save_target_page: save one target page
1970  *
1971  * Returns the number of pages written
1972  *
1973  * @rs: current RAM state
1974  * @pss: data about the page we want to send
1975  * @last_stage: if we are at the completion stage
1976  */
1977 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1978                                 bool last_stage)
1979 {
1980     RAMBlock *block = pss->block;
1981     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1982     int res;
1983 
1984     if (control_save_page(rs, block, offset, &res)) {
1985         return res;
1986     }
1987 
1988     if (save_compress_page(rs, block, offset)) {
1989         return 1;
1990     }
1991 
1992     res = save_zero_page(rs, block, offset);
1993     if (res > 0) {
1994         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1995          * page would be stale
1996          */
1997         if (!save_page_use_compression(rs)) {
1998             XBZRLE_cache_lock();
1999             xbzrle_cache_zero_page(rs, block->offset + offset);
2000             XBZRLE_cache_unlock();
2001         }
2002         ram_release_pages(block->idstr, offset, res);
2003         return res;
2004     }
2005 
2006     /*
2007      * Do not use multifd for:
2008      * 1. Compression as the first page in the new block should be posted out
2009      *    before sending the compressed page
2010      * 2. In postcopy as one whole host page should be placed
2011      */
2012     if (!save_page_use_compression(rs) && migrate_use_multifd()
2013         && !migration_in_postcopy()) {
2014         return ram_save_multifd_page(rs, block, offset);
2015     }
2016 
2017     return ram_save_page(rs, pss, last_stage);
2018 }
2019 
2020 /**
2021  * ram_save_host_page: save a whole host page
2022  *
2023  * Starting at *offset send pages up to the end of the current host
2024  * page. It's valid for the initial offset to point into the middle of
2025  * a host page in which case the remainder of the hostpage is sent.
2026  * Only dirty target pages are sent. Note that the host page size may
2027  * be a huge page for this block.
2028  * The saving stops at the boundary of the used_length of the block
2029  * if the RAMBlock isn't a multiple of the host page size.
2030  *
2031  * Returns the number of pages written or negative on error
2032  *
2033  * @rs: current RAM state
2035  * @pss: data about the page we want to send
2036  * @last_stage: if we are at the completion stage
2037  */
2038 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2039                               bool last_stage)
2040 {
2041     int tmppages, pages = 0;
2042     size_t pagesize_bits =
2043         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2044     unsigned long hostpage_boundary =
2045         QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2046     unsigned long start_page = pss->page;
2047     int res;
2048 
2049     if (ramblock_is_ignored(pss->block)) {
2050         error_report("block %s should not be migrated !", pss->block->idstr);
2051         return 0;
2052     }
2053 
2054     do {
2055         /* Check if the page is dirty and if so, send it */
2056         if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2057             tmppages = ram_save_target_page(rs, pss, last_stage);
2058             if (tmppages < 0) {
2059                 return tmppages;
2060             }
2061 
2062             pages += tmppages;
2063             /*
2064              * Allow rate limiting to happen in the middle of huge pages if
2065              * something is sent in the current iteration.
2066              */
2067             if (pagesize_bits > 1 && tmppages > 0) {
2068                 migration_rate_limit();
2069             }
2070         }
2071         pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2072     } while ((pss->page < hostpage_boundary) &&
2073              offset_in_ramblock(pss->block,
2074                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2075     /* The offset we leave with is the min boundary of host page and block */
2076     pss->page = MIN(pss->page, hostpage_boundary) - 1;
2077 
2078     res = ram_save_release_protection(rs, pss, start_page);
2079     return (res < 0 ? res : pages);
2080 }
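
/*
 * Example of the boundary arithmetic above: for a 2 MiB hugepage-backed
 * block with 4 KiB target pages, pagesize_bits == 512.  Starting from
 * pss->page == 700, hostpage_boundary == QEMU_ALIGN_UP(701, 512) == 1024,
 * so dirty target pages keep being sent until the scan crosses index 1024
 * (or runs past used_length); pss->page is then rewound by one so the
 * search continues from the right place on the next call.
 */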
2081 
2082 /**
2083  * ram_find_and_save_block: finds a dirty page and sends it to f
2084  *
2085  * Called within an RCU critical section.
2086  *
2087  * Returns the number of pages written where zero means no dirty pages,
2088  * or negative on error
2089  *
2090  * @rs: current RAM state
2091  * @last_stage: if we are at the completion stage
2092  *
2093  * On systems where host-page-size > target-page-size it will send all the
2094  * pages in a host page that are dirty.
2095  */
2096 
2097 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2098 {
2099     PageSearchStatus pss;
2100     int pages = 0;
2101     bool again, found;
2102 
2103     /* No dirty page as there is zero RAM */
2104     if (!ram_bytes_total()) {
2105         return pages;
2106     }
2107 
2108     pss.block = rs->last_seen_block;
2109     pss.page = rs->last_page;
2110     pss.complete_round = false;
2111 
2112     if (!pss.block) {
2113         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2114     }
2115 
2116     do {
2117         again = true;
2118         found = get_queued_page(rs, &pss);
2119 
2120         if (!found) {
2121             /* priority queue empty, so just search for something dirty */
2122             found = find_dirty_block(rs, &pss, &again);
2123         }
2124 
2125         if (found) {
2126             pages = ram_save_host_page(rs, &pss, last_stage);
2127         }
2128     } while (!pages && again);
2129 
2130     rs->last_seen_block = pss.block;
2131     rs->last_page = pss.page;
2132 
2133     return pages;
2134 }
2135 
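/*
 * acct_update_position: update the RAM migration counters for data that
 *   was placed into the stream outside of the usual page-saving path;
 *   zero pages only bump the duplicate counter, while normal data also
 *   updates the transferred byte count and the QEMUFile position.
 */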
2136 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2137 {
2138     uint64_t pages = size / TARGET_PAGE_SIZE;
2139 
2140     if (zero) {
2141         ram_counters.duplicate += pages;
2142     } else {
2143         ram_counters.normal += pages;
2144         ram_counters.transferred += size;
2145         qemu_update_position(f, size);
2146     }
2147 }
2148 
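/*
 * ram_bytes_total_common: total used size of all migratable RAM blocks
 *
 * @count_ignored: if true, blocks that are otherwise ignored for
 *   migration (e.g. shared blocks with ignore-shared) are counted too;
 *   otherwise only the blocks whose contents will actually be sent.
 */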
2149 static uint64_t ram_bytes_total_common(bool count_ignored)
2150 {
2151     RAMBlock *block;
2152     uint64_t total = 0;
2153 
2154     RCU_READ_LOCK_GUARD();
2155 
2156     if (count_ignored) {
2157         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2158             total += block->used_length;
2159         }
2160     } else {
2161         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2162             total += block->used_length;
2163         }
2164     }
2165     return total;
2166 }
2167 
2168 uint64_t ram_bytes_total(void)
2169 {
2170     return ram_bytes_total_common(false);
2171 }
2172 
2173 static void xbzrle_load_setup(void)
2174 {
2175     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2176 }
2177 
2178 static void xbzrle_load_cleanup(void)
2179 {
2180     g_free(XBZRLE.decoded_buf);
2181     XBZRLE.decoded_buf = NULL;
2182 }
2183 
2184 static void ram_state_cleanup(RAMState **rsp)
2185 {
2186     if (*rsp) {
2187         migration_page_queue_free(*rsp);
2188         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2189         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2190         g_free(*rsp);
2191         *rsp = NULL;
2192     }
2193 }
2194 
2195 static void xbzrle_cleanup(void)
2196 {
2197     XBZRLE_cache_lock();
2198     if (XBZRLE.cache) {
2199         cache_fini(XBZRLE.cache);
2200         g_free(XBZRLE.encoded_buf);
2201         g_free(XBZRLE.current_buf);
2202         g_free(XBZRLE.zero_target_page);
2203         XBZRLE.cache = NULL;
2204         XBZRLE.encoded_buf = NULL;
2205         XBZRLE.current_buf = NULL;
2206         XBZRLE.zero_target_page = NULL;
2207     }
2208     XBZRLE_cache_unlock();
2209 }
2210 
2211 static void ram_save_cleanup(void *opaque)
2212 {
2213     RAMState **rsp = opaque;
2214     RAMBlock *block;
2215 
2216     /* We don't use dirty log with background snapshots */
2217     if (!migrate_background_snapshot()) {
2218         /* The caller must hold the iothread lock or be in a BH, so there is
2219          * no writing race against the migration bitmap
2220          */
2221         memory_global_dirty_log_stop();
2222     }
2223 
2224     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2225         g_free(block->clear_bmap);
2226         block->clear_bmap = NULL;
2227         g_free(block->bmap);
2228         block->bmap = NULL;
2229     }
2230 
2231     xbzrle_cleanup();
2232     compress_threads_save_cleanup();
2233     ram_state_cleanup(rsp);
2234 }
2235 
2236 static void ram_state_reset(RAMState *rs)
2237 {
2238     rs->last_seen_block = NULL;
2239     rs->last_sent_block = NULL;
2240     rs->last_page = 0;
2241     rs->last_version = ram_list.version;
2242     rs->xbzrle_enabled = false;
2243 }
2244 
2245 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2246 
2247 /*
2248  * 'expected' is the value you expect the bitmap mostly to be full
2249  * of; it won't bother printing lines that are all this value.
2250  * If 'todump' is null the migration bitmap is dumped.
2251  */
2252 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2253                            unsigned long pages)
2254 {
2255     int64_t cur;
2256     int64_t linelen = 128;
2257     char linebuf[129];
2258 
2259     for (cur = 0; cur < pages; cur += linelen) {
2260         int64_t curb;
2261         bool found = false;
2262         /*
2263          * Last line; catch the case where the line length
2264          * is longer than remaining ram
2265          */
2266         if (cur + linelen > pages) {
2267             linelen = pages - cur;
2268         }
2269         for (curb = 0; curb < linelen; curb++) {
2270             bool thisbit = test_bit(cur + curb, todump);
2271             linebuf[curb] = thisbit ? '1' : '.';
2272             found = found || (thisbit != expected);
2273         }
2274         if (found) {
2275             linebuf[curb] = '\0';
2276             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
2277         }
2278     }
2279 }
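
/*
 * A dumped line looks roughly like:
 *
 *   0x00000080 : ....11111111........1...
 *
 * i.e. the page index of the first column in hex, then one character per
 * page ('1' set, '.' clear); lines that contain only the 'expected' value
 * are skipped entirely.
 */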
2280 
2281 /* **** functions for postcopy ***** */
2282 
2283 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2284 {
2285     struct RAMBlock *block;
2286 
2287     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2288         unsigned long *bitmap = block->bmap;
2289         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2290         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2291 
2292         while (run_start < range) {
2293             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2294             ram_discard_range(block->idstr,
2295                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2296                               ((ram_addr_t)(run_end - run_start))
2297                                 << TARGET_PAGE_BITS);
2298             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2299         }
2300     }
2301 }
2302 
2303 /**
2304  * postcopy_send_discard_bm_ram: discard a RAMBlock
2305  *
2306  * Returns zero on success
2307  *
2308  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2309  *
2310  * @ms: current migration state
2311  * @block: RAMBlock to discard
2312  */
2313 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2314 {
2315     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2316     unsigned long current;
2317     unsigned long *bitmap = block->bmap;
2318 
2319     for (current = 0; current < end; ) {
2320         unsigned long one = find_next_bit(bitmap, end, current);
2321         unsigned long zero, discard_length;
2322 
2323         if (one >= end) {
2324             break;
2325         }
2326 
2327         zero = find_next_zero_bit(bitmap, end, one + 1);
2328 
2329         if (zero >= end) {
2330             discard_length = end - one;
2331         } else {
2332             discard_length = zero - one;
2333         }
2334         postcopy_discard_send_range(ms, one, discard_length);
2335         current = one + discard_length;
2336     }
2337 
2338     return 0;
2339 }
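
/*
 * For example, if pages 2-4 and page 6 of a block are dirty, the loop
 * above emits two discard ranges: start 2 / length 3 and start 6 /
 * length 1.  Ranges are expressed in target-page indexes, not bytes.
 */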
2340 
2341 /**
2342  * postcopy_each_ram_send_discard: discard all RAMBlocks
2343  *
2344  * Returns 0 for success or negative for error
2345  *
2346  * Utility for the outgoing postcopy code.
2347  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2348  *   passing it bitmap indexes and name.
2349  * (qemu_ram_foreach_block ends up passing unscaled lengths
2350  *  which would mean postcopy code would have to deal with target page)
2351  *
2352  * @ms: current migration state
2353  */
2354 static int postcopy_each_ram_send_discard(MigrationState *ms)
2355 {
2356     struct RAMBlock *block;
2357     int ret;
2358 
2359     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2360         postcopy_discard_send_init(ms, block->idstr);
2361 
2362         /*
2363          * Postcopy sends chunks of bitmap over the wire, but it
2364          * just needs indexes at this point, which avoids it having
2365          * target page specific code.
2366          */
2367         ret = postcopy_send_discard_bm_ram(ms, block);
2368         postcopy_discard_send_finish(ms);
2369         if (ret) {
2370             return ret;
2371         }
2372     }
2373 
2374     return 0;
2375 }
2376 
2377 /**
2378  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2379  *
2380  * Helper for postcopy_chunk_hostpages; it's called twice to
2381  * canonicalize the two bitmaps, that are similar, but one is
2382  * inverted.
2383  *
2384  * Postcopy requires that all target pages in a hostpage are dirty or
2385  * clean, not a mix.  This function canonicalizes the bitmaps.
2386  *
2387  * @ms: current migration state
2388  * @block: block that contains the page we want to canonicalize
2389  */
2390 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2391 {
2392     RAMState *rs = ram_state;
2393     unsigned long *bitmap = block->bmap;
2394     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2395     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2396     unsigned long run_start;
2397 
2398     if (block->page_size == TARGET_PAGE_SIZE) {
2399         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2400         return;
2401     }
2402 
2403     /* Find a dirty page */
2404     run_start = find_next_bit(bitmap, pages, 0);
2405 
2406     while (run_start < pages) {
2407 
2408         /*
2409          * If the start of this run of pages is in the middle of a host
2410          * page, then we need to fixup this host page.
2411          */
2412         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2413             /* Find the end of this run */
2414             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2415             /*
2416              * If the end isn't at the start of a host page, then the
2417              * run doesn't finish at the end of a host page
2418              * and we need to discard.
2419              */
2420         }
2421 
2422         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2423             unsigned long page;
2424             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2425                                                              host_ratio);
2426             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2427 
2428             /* Clean up the bitmap */
2429             for (page = fixup_start_addr;
2430                  page < fixup_start_addr + host_ratio; page++) {
2431                 /*
2432                  * Remark them as dirty, updating the count for any pages
2433                  * that weren't previously dirty.
2434                  */
2435                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2436             }
2437         }
2438 
2439         /* Find the next dirty page for the next iteration */
2440         run_start = find_next_bit(bitmap, pages, run_start);
2441     }
2442 }
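
/*
 * Example with 2 MiB host pages and 4 KiB target pages (host_ratio 512):
 * a dirty run starting at target page 700 does not begin on a host page
 * boundary, so target pages 512-1023 are all marked dirty again (only
 * previously-clean ones increase migration_dirty_pages) and the scan
 * resumes at page 1024.
 */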
2443 
2444 /**
2445  * postcopy_chunk_hostpages: discard any partially sent host page
2446  *
2447  * Utility for the outgoing postcopy code.
2448  *
2449  * Discard any partially sent host-page size chunks, mark any partially
2450  * dirty host-page size chunks as all dirty.  In this case the host-page
2451  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2452  *
2453  * Returns zero on success
2454  *
2455  * @ms: current migration state
2456  * @block: block we want to work with
2457  */
2458 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2459 {
2460     postcopy_discard_send_init(ms, block->idstr);
2461 
2462     /*
2463      * Ensure that all partially dirty host pages are made fully dirty.
2464      */
2465     postcopy_chunk_hostpages_pass(ms, block);
2466 
2467     postcopy_discard_send_finish(ms);
2468     return 0;
2469 }
2470 
2471 /**
2472  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2473  *
2474  * Returns zero on success
2475  *
2476  * Transmit the set of pages to be discarded after precopy to the target;
2477  * these are pages that:
2478  *     a) Have been previously transmitted but are now dirty again
2479  *     b) Pages that have never been transmitted, this ensures that
2480  *        any pages on the destination that have been mapped by background
2481  *        tasks get discarded (transparent huge pages is the specific concern)
2482  * Hopefully this is pretty sparse
2483  *
2484  * @ms: current migration state
2485  */
2486 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2487 {
2488     RAMState *rs = ram_state;
2489     RAMBlock *block;
2490     int ret;
2491 
2492     RCU_READ_LOCK_GUARD();
2493 
2494     /* This should be our last sync, the src is now paused */
2495     migration_bitmap_sync(rs);
2496 
2497     /* Easiest way to make sure we don't resume in the middle of a host-page */
2498     rs->last_seen_block = NULL;
2499     rs->last_sent_block = NULL;
2500     rs->last_page = 0;
2501 
2502     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2503         /* Deal with TPS != HPS and huge pages */
2504         ret = postcopy_chunk_hostpages(ms, block);
2505         if (ret) {
2506             return ret;
2507         }
2508 
2509 #ifdef DEBUG_POSTCOPY
2510         ram_debug_dump_bitmap(block->bmap, true,
2511                               block->used_length >> TARGET_PAGE_BITS);
2512 #endif
2513     }
2514     trace_ram_postcopy_send_discard_bitmap();
2515 
2516     return postcopy_each_ram_send_discard(ms);
2517 }
2518 
2519 /**
2520  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2521  *
2522  * Returns zero on success
2523  *
2524  * @rbname: name of the RAMBlock of the request. NULL means the
2525  *          same as the last one.
2526  * @start: RAMBlock starting page
2527  * @length: RAMBlock size
2528  */
2529 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2530 {
2531     trace_ram_discard_range(rbname, start, length);
2532 
2533     RCU_READ_LOCK_GUARD();
2534     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2535 
2536     if (!rb) {
2537         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2538         return -1;
2539     }
2540 
2541     /*
2542      * On source VM, we don't need to update the received bitmap since
2543      * we don't even have one.
2544      */
2545     if (rb->receivedmap) {
2546         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2547                      length >> qemu_target_page_bits());
2548     }
2549 
2550     return ram_block_discard_range(rb, start, length);
2551 }
2552 
2553 /*
2554  * For every allocation, we will try not to crash the VM if the
2555  * allocation fails.
2556  */
2557 static int xbzrle_init(void)
2558 {
2559     Error *local_err = NULL;
2560 
2561     if (!migrate_use_xbzrle()) {
2562         return 0;
2563     }
2564 
2565     XBZRLE_cache_lock();
2566 
2567     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2568     if (!XBZRLE.zero_target_page) {
2569         error_report("%s: Error allocating zero page", __func__);
2570         goto err_out;
2571     }
2572 
2573     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2574                               TARGET_PAGE_SIZE, &local_err);
2575     if (!XBZRLE.cache) {
2576         error_report_err(local_err);
2577         goto free_zero_page;
2578     }
2579 
2580     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2581     if (!XBZRLE.encoded_buf) {
2582         error_report("%s: Error allocating encoded_buf", __func__);
2583         goto free_cache;
2584     }
2585 
2586     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2587     if (!XBZRLE.current_buf) {
2588         error_report("%s: Error allocating current_buf", __func__);
2589         goto free_encoded_buf;
2590     }
2591 
2592     /* We are all good */
2593     XBZRLE_cache_unlock();
2594     return 0;
2595 
2596 free_encoded_buf:
2597     g_free(XBZRLE.encoded_buf);
2598     XBZRLE.encoded_buf = NULL;
2599 free_cache:
2600     cache_fini(XBZRLE.cache);
2601     XBZRLE.cache = NULL;
2602 free_zero_page:
2603     g_free(XBZRLE.zero_target_page);
2604     XBZRLE.zero_target_page = NULL;
2605 err_out:
2606     XBZRLE_cache_unlock();
2607     return -ENOMEM;
2608 }
2609 
2610 static int ram_state_init(RAMState **rsp)
2611 {
2612     *rsp = g_try_new0(RAMState, 1);
2613 
2614     if (!*rsp) {
2615         error_report("%s: Init ramstate fail", __func__);
2616         return -1;
2617     }
2618 
2619     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2620     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2621     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2622 
2623     /*
2624      * Count the total number of pages used by ram blocks not including any
2625      * gaps due to alignment or unplugs.
2626      * This must match with the initial values of dirty bitmap.
2627      */
2628     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2629     ram_state_reset(*rsp);
2630 
2631     return 0;
2632 }
2633 
2634 static void ram_list_init_bitmaps(void)
2635 {
2636     MigrationState *ms = migrate_get_current();
2637     RAMBlock *block;
2638     unsigned long pages;
2639     uint8_t shift;
2640 
2641     /* Skip setting bitmap if there is no RAM */
2642     if (ram_bytes_total()) {
2643         shift = ms->clear_bitmap_shift;
2644         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2645             error_report("clear_bitmap_shift (%u) too big, using "
2646                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2647             shift = CLEAR_BITMAP_SHIFT_MAX;
2648         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2649             error_report("clear_bitmap_shift (%u) too small, using "
2650                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2651             shift = CLEAR_BITMAP_SHIFT_MIN;
2652         }
2653 
2654         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2655             pages = block->max_length >> TARGET_PAGE_BITS;
2656             /*
2657              * The initial dirty bitmap for migration must be set with all
2658              * ones to make sure we'll migrate every guest RAM page to
2659              * destination.
2660              * Here we set RAMBlock.bmap all to 1 because when restarting a
2661              * new migration after a failed one, ram_list.
2662              * dirty_memory[DIRTY_MEMORY_MIGRATION] may not cover the whole
2663              * guest memory.
2664              */
2665             block->bmap = bitmap_new(pages);
2666             bitmap_set(block->bmap, 0, pages);
2667             block->clear_bmap_shift = shift;
2668             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2669         }
2670     }
2671 }
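
/*
 * Each clear_bmap bit covers 1 << clear_bmap_shift target pages; with the
 * usual default shift of 18 and 4 KiB target pages that is 1 GiB of guest
 * RAM per bit, so the clear bitmap stays tiny even for large guests (see
 * clear_bmap_size() for the exact rounding).
 */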
2672 
2673 static void ram_init_bitmaps(RAMState *rs)
2674 {
2675     /* For memory_global_dirty_log_start below.  */
2676     qemu_mutex_lock_iothread();
2677     qemu_mutex_lock_ramlist();
2678 
2679     WITH_RCU_READ_LOCK_GUARD() {
2680         ram_list_init_bitmaps();
2681         /* We don't use dirty log with background snapshots */
2682         if (!migrate_background_snapshot()) {
2683             memory_global_dirty_log_start();
2684             migration_bitmap_sync_precopy(rs);
2685         }
2686     }
2687     qemu_mutex_unlock_ramlist();
2688     qemu_mutex_unlock_iothread();
2689 }
2690 
2691 static int ram_init_all(RAMState **rsp)
2692 {
2693     if (ram_state_init(rsp)) {
2694         return -1;
2695     }
2696 
2697     if (xbzrle_init()) {
2698         ram_state_cleanup(rsp);
2699         return -1;
2700     }
2701 
2702     ram_init_bitmaps(*rsp);
2703 
2704     return 0;
2705 }
2706 
2707 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2708 {
2709     RAMBlock *block;
2710     uint64_t pages = 0;
2711 
2712     /*
2713      * Postcopy is not using xbzrle/compression, so no need for that.
2714      * Also, since the source is already halted, we don't need to care
2715      * about dirty page logging either.
2716      */
2717 
2718     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2719         pages += bitmap_count_one(block->bmap,
2720                                   block->used_length >> TARGET_PAGE_BITS);
2721     }
2722 
2723     /* This may not be aligned with current bitmaps. Recalculate. */
2724     rs->migration_dirty_pages = pages;
2725 
2726     ram_state_reset(rs);
2727 
2728     /* Update RAMState cache of output QEMUFile */
2729     rs->f = out;
2730 
2731     trace_ram_state_resume_prepare(pages);
2732 }
2733 
2734 /*
2735  * This function clears bits of the free pages reported by the caller from the
2736  * migration dirty bitmap. @addr is the host address corresponding to the
2737  * start of the continuous guest free pages, and @len is the total bytes of
2738  * those pages.
2739  */
2740 void qemu_guest_free_page_hint(void *addr, size_t len)
2741 {
2742     RAMBlock *block;
2743     ram_addr_t offset;
2744     size_t used_len, start, npages;
2745     MigrationState *s = migrate_get_current();
2746 
2747     /* This function is currently expected to be used during live migration */
2748     if (!migration_is_setup_or_active(s->state)) {
2749         return;
2750     }
2751 
2752     for (; len > 0; len -= used_len, addr += used_len) {
2753         block = qemu_ram_block_from_host(addr, false, &offset);
2754         if (unlikely(!block || offset >= block->used_length)) {
2755             /*
2756              * The implementation might not support RAMBlock resize during
2757              * live migration, but it could happen in theory with future
2758              * updates. So we add a check here to capture that case.
2759              */
2760             error_report_once("%s unexpected error", __func__);
2761             return;
2762         }
2763 
2764         if (len <= block->used_length - offset) {
2765             used_len = len;
2766         } else {
2767             used_len = block->used_length - offset;
2768         }
2769 
2770         start = offset >> TARGET_PAGE_BITS;
2771         npages = used_len >> TARGET_PAGE_BITS;
2772 
2773         qemu_mutex_lock(&ram_state->bitmap_mutex);
2774         /*
2775          * The skipped free pages are equivalent to ones sent from clear_bmap's
2776          * perspective, so clear the bits from the memory region bitmap which
2777          * are initially set. Otherwise those skipped pages will be sent in
2778          * the next round after syncing from the memory region bitmap.
2779          */
2780         migration_clear_memory_region_dirty_bitmap_range(ram_state, block,
2781                                                          start, npages);
2782         ram_state->migration_dirty_pages -=
2783                       bitmap_count_one_with_offset(block->bmap, start, npages);
2784         bitmap_clear(block->bmap, start, npages);
2785         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2786     }
2787 }
2788 
2789 /*
2790  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2791  * a long-running RCU critical section.  When rcu-reclaims in the code
2792  * start to become numerous it will be necessary to reduce the
2793  * granularity of these critical sections.
2794  */
2795 
2796 /**
2797  * ram_save_setup: Setup RAM for migration
2798  *
2799  * Returns zero to indicate success and negative for error
2800  *
2801  * @f: QEMUFile where to send the data
2802  * @opaque: RAMState pointer
2803  */
2804 static int ram_save_setup(QEMUFile *f, void *opaque)
2805 {
2806     RAMState **rsp = opaque;
2807     RAMBlock *block;
2808 
2809     if (compress_threads_save_setup()) {
2810         return -1;
2811     }
2812 
2813     /* migration has already setup the bitmap, reuse it. */
2814     if (!migration_in_colo_state()) {
2815         if (ram_init_all(rsp) != 0) {
2816             compress_threads_save_cleanup();
2817             return -1;
2818         }
2819     }
2820     (*rsp)->f = f;
2821 
2822     WITH_RCU_READ_LOCK_GUARD() {
2823         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2824 
2825         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2826             qemu_put_byte(f, strlen(block->idstr));
2827             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2828             qemu_put_be64(f, block->used_length);
2829             if (migrate_postcopy_ram() && block->page_size !=
2830                                           qemu_host_page_size) {
2831                 qemu_put_be64(f, block->page_size);
2832             }
2833             if (migrate_ignore_shared()) {
2834                 qemu_put_be64(f, block->mr->addr);
2835             }
2836         }
2837     }
2838 
2839     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2840     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2841 
2842     multifd_send_sync_main(f);
2843     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2844     qemu_fflush(f);
2845 
2846     return 0;
2847 }
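
/*
 * Sketch of what ram_save_setup() emits, as follows from the code above
 * (multi-byte integers are big-endian):
 *
 *   be64: total RAM size | RAM_SAVE_FLAG_MEM_SIZE
 *   for each migratable block:
 *       u8:    strlen(idstr)
 *       bytes: idstr (not NUL-terminated)
 *       be64:  used_length
 *       be64:  page_size   (only with postcopy-ram and non-host page size)
 *       be64:  mr->addr    (only with ignore-shared)
 *   be64: RAM_SAVE_FLAG_EOS
 *
 * plus whatever the ram_control_*_iterate() hooks add for RDMA.
 */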
2848 
2849 /**
2850  * ram_save_iterate: iterative stage for migration
2851  *
2852  * Returns zero to indicate success and negative for error
2853  *
2854  * @f: QEMUFile where to send the data
2855  * @opaque: RAMState pointer
2856  */
2857 static int ram_save_iterate(QEMUFile *f, void *opaque)
2858 {
2859     RAMState **temp = opaque;
2860     RAMState *rs = *temp;
2861     int ret = 0;
2862     int i;
2863     int64_t t0;
2864     int done = 0;
2865 
2866     if (blk_mig_bulk_active()) {
2867         /* Avoid transferring ram during bulk phase of block migration as
2868          * the bulk phase will usually take a long time and transferring
2869          * ram updates during that time is pointless. */
2870         goto out;
2871     }
2872 
2873     /*
2874      * We'll take this lock a little bit long, but it's okay for two reasons.
2875      * Firstly, the only other thread that could possibly take it is the one
2876      * that calls qemu_guest_free_page_hint(), which should be rare; secondly,
2877      * see MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below,
2878      * which guarantees that we'll release it at least on a regular basis.
2879      */
2880     qemu_mutex_lock(&rs->bitmap_mutex);
2881     WITH_RCU_READ_LOCK_GUARD() {
2882         if (ram_list.version != rs->last_version) {
2883             ram_state_reset(rs);
2884         }
2885 
2886         /* Read version before ram_list.blocks */
2887         smp_rmb();
2888 
2889         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2890 
2891         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2892         i = 0;
2893         while ((ret = qemu_file_rate_limit(f)) == 0 ||
2894                 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2895             int pages;
2896 
2897             if (qemu_file_get_error(f)) {
2898                 break;
2899             }
2900 
2901             pages = ram_find_and_save_block(rs, false);
2902             /* no more pages to send */
2903             if (pages == 0) {
2904                 done = 1;
2905                 break;
2906             }
2907 
2908             if (pages < 0) {
2909                 qemu_file_set_error(f, pages);
2910                 break;
2911             }
2912 
2913             rs->target_page_count += pages;
2914 
2915             /*
2916              * During postcopy, it is necessary to make sure one whole host
2917              * page is sent in one chunk.
2918              */
2919             if (migrate_postcopy_ram()) {
2920                 flush_compressed_data(rs);
2921             }
2922 
2923             /*
2924              * We want to check in the 1st loop, just in case it was the 1st
2925              * time and we had to sync the dirty bitmap.
2926              * qemu_clock_get_ns() is a bit expensive, so we only check once
2927              * every few iterations.
2928              */
2929             if ((i & 63) == 0) {
2930                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2931                               1000000;
2932                 if (t1 > MAX_WAIT) {
2933                     trace_ram_save_iterate_big_wait(t1, i);
2934                     break;
2935                 }
2936             }
2937             i++;
2938         }
2939     }
2940     qemu_mutex_unlock(&rs->bitmap_mutex);
2941 
2942     /*
2943      * Must occur before EOS (or any QEMUFile operation)
2944      * because of RDMA protocol.
2945      */
2946     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2947 
2948 out:
2949     if (ret >= 0
2950         && migration_is_setup_or_active(migrate_get_current()->state)) {
2951         multifd_send_sync_main(rs->f);
2952         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2953         qemu_fflush(f);
2954         ram_counters.transferred += 8;
2955 
2956         ret = qemu_file_get_error(f);
2957     }
2958     if (ret < 0) {
2959         return ret;
2960     }
2961 
2962     return done;
2963 }
2964 
2965 /**
2966  * ram_save_complete: function called to send the remaining amount of ram
2967  *
2968  * Returns zero to indicate success or negative on error
2969  *
2970  * Called with iothread lock
2971  *
2972  * @f: QEMUFile where to send the data
2973  * @opaque: RAMState pointer
2974  */
2975 static int ram_save_complete(QEMUFile *f, void *opaque)
2976 {
2977     RAMState **temp = opaque;
2978     RAMState *rs = *temp;
2979     int ret = 0;
2980 
2981     WITH_RCU_READ_LOCK_GUARD() {
2982         if (!migration_in_postcopy()) {
2983             migration_bitmap_sync_precopy(rs);
2984         }
2985 
2986         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2987 
2988         /* try transferring iterative blocks of memory */
2989 
2990         /* flush all remaining blocks regardless of rate limiting */
2991         while (true) {
2992             int pages;
2993 
2994             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2995             /* no more blocks to send */
2996             if (pages == 0) {
2997                 break;
2998             }
2999             if (pages < 0) {
3000                 ret = pages;
3001                 break;
3002             }
3003         }
3004 
3005         flush_compressed_data(rs);
3006         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3007     }
3008 
3009     if (ret >= 0) {
3010         multifd_send_sync_main(rs->f);
3011         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3012         qemu_fflush(f);
3013     }
3014 
3015     return ret;
3016 }
3017 
3018 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3019                              uint64_t *res_precopy_only,
3020                              uint64_t *res_compatible,
3021                              uint64_t *res_postcopy_only)
3022 {
3023     RAMState **temp = opaque;
3024     RAMState *rs = *temp;
3025     uint64_t remaining_size;
3026 
3027     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3028 
3029     if (!migration_in_postcopy() &&
3030         remaining_size < max_size) {
3031         qemu_mutex_lock_iothread();
3032         WITH_RCU_READ_LOCK_GUARD() {
3033             migration_bitmap_sync_precopy(rs);
3034         }
3035         qemu_mutex_unlock_iothread();
3036         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3037     }
3038 
3039     if (migrate_postcopy_ram()) {
3040         /* We can do postcopy, and all the data is postcopiable */
3041         *res_compatible += remaining_size;
3042     } else {
3043         *res_precopy_only += remaining_size;
3044     }
3045 }
3046 
3047 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3048 {
3049     unsigned int xh_len;
3050     int xh_flags;
3051     uint8_t *loaded_data;
3052 
3053     /* extract RLE header */
3054     xh_flags = qemu_get_byte(f);
3055     xh_len = qemu_get_be16(f);
3056 
3057     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3058         error_report("Failed to load XBZRLE page - wrong compression!");
3059         return -1;
3060     }
3061 
3062     if (xh_len > TARGET_PAGE_SIZE) {
3063         error_report("Failed to load XBZRLE page - len overflow!");
3064         return -1;
3065     }
3066     loaded_data = XBZRLE.decoded_buf;
3067     /* load data and decode */
3068     /* it can change loaded_data to point to an internal buffer */
3069     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3070 
3071     /* decode RLE */
3072     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3073                              TARGET_PAGE_SIZE) == -1) {
3074         error_report("Failed to load XBZRLE page - decode error!");
3075         return -1;
3076     }
3077 
3078     return 0;
3079 }
3080 
3081 /**
3082  * ram_block_from_stream: read a RAMBlock id from the migration stream
3083  *
3084  * Must be called from within a rcu critical section.
3085  *
3086  * Returns a pointer from within the RCU-protected ram_list.
3087  *
3088  * @f: QEMUFile where to read the data from
3089  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3090  */
3091 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3092 {
3093     static RAMBlock *block;
3094     char id[256];
3095     uint8_t len;
3096 
3097     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3098         if (!block) {
3099             error_report("Ack, bad migration stream!");
3100             return NULL;
3101         }
3102         return block;
3103     }
3104 
3105     len = qemu_get_byte(f);
3106     qemu_get_buffer(f, (uint8_t *)id, len);
3107     id[len] = 0;
3108 
3109     block = qemu_ram_block_by_name(id);
3110     if (!block) {
3111         error_report("Can't find block %s", id);
3112         return NULL;
3113     }
3114 
3115     if (ramblock_is_ignored(block)) {
3116         error_report("block %s should not be migrated !", id);
3117         return NULL;
3118     }
3119 
3120     return block;
3121 }
3122 
3123 static inline void *host_from_ram_block_offset(RAMBlock *block,
3124                                                ram_addr_t offset)
3125 {
3126     if (!offset_in_ramblock(block, offset)) {
3127         return NULL;
3128     }
3129 
3130     return block->host + offset;
3131 }
3132 
3133 static void *host_page_from_ram_block_offset(RAMBlock *block,
3134                                              ram_addr_t offset)
3135 {
3136     /* Note: Explicitly no check against offset_in_ramblock(). */
3137     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3138                                    block->page_size);
3139 }
3140 
3141 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3142                                                          ram_addr_t offset)
3143 {
3144     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3145 }
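/*
 * Worked example for the two helpers above (illustrative numbers): with a
 * 2 MiB host page size and block->host at 0x7f0000000000, offset 0x3456000
 * gives
 *   host_page_from_ram_block_offset()        -> 0x7f0003400000
 *   host_page_offset_from_ram_block_offset() -> 0x56000
 * i.e. the address aligned down to the host-page boundary, and the byte
 * offset of the target page within that host page.
 */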
3146 
3147 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3148                              ram_addr_t offset, bool record_bitmap)
3149 {
3150     if (!offset_in_ramblock(block, offset)) {
3151         return NULL;
3152     }
3153     if (!block->colo_cache) {
3154         error_report("%s: colo_cache is NULL in block: %s",
3155                      __func__, block->idstr);
3156         return NULL;
3157     }
3158 
3159     /*
3160      * During a COLO checkpoint, we need a bitmap of the migrated pages.
3161      * It helps us decide which pages in the RAM cache should be flushed
3162      * into the VM's RAM later.
3163      */
3164     if (record_bitmap &&
3165         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3166         ram_state->migration_dirty_pages++;
3167     }
3168     return block->colo_cache + offset;
3169 }
3170 
3171 /**
3172  * ram_handle_compressed: handle the zero page case
3173  *
3174  * If a page (or a whole RDMA chunk) has been
3175  * determined to be zero, then zap it.
3176  *
3177  * @host: host address for the zero page
3178  * @ch: what the page is filled from.  We only support zero
3179  * @size: size of the zero page
3180  */
3181 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3182 {
3183     if (ch != 0 || !is_zero_range(host, size)) {
3184         memset(host, ch, size);
3185     }
3186 }
3187 
3188 /* return the size after decompression, or negative value on error */
3189 static int
3190 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3191                      const uint8_t *source, size_t source_len)
3192 {
3193     int err;
3194 
3195     err = inflateReset(stream);
3196     if (err != Z_OK) {
3197         return -1;
3198     }
3199 
3200     stream->avail_in = source_len;
3201     stream->next_in = (uint8_t *)source;
3202     stream->avail_out = dest_len;
3203     stream->next_out = dest;
3204 
3205     err = inflate(stream, Z_NO_FLUSH);
3206     if (err != Z_STREAM_END) {
3207         return -1;
3208     }
3209 
3210     return stream->total_out;
3211 }
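/*
 * For reference, the inverse operation can be sketched with zlib's one-shot
 * helper.  The real sender keeps a persistent z_stream per compression
 * thread, so this is illustrative only.
 */
#if 0   /* illustrative sketch, not built */
#include <zlib.h>

static int example_compress_page(uint8_t *dest, uLongf *dest_len,
                                 const uint8_t *source, uLong source_len)
{
    /* dest must provide at least compressBound(source_len) bytes */
    return compress2(dest, dest_len, source, source_len,
                     Z_DEFAULT_COMPRESSION) == Z_OK ? 0 : -1;
}
#endif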
3212 
3213 static void *do_data_decompress(void *opaque)
3214 {
3215     DecompressParam *param = opaque;
3216     unsigned long pagesize;
3217     uint8_t *des;
3218     int len, ret;
3219 
3220     qemu_mutex_lock(&param->mutex);
3221     while (!param->quit) {
3222         if (param->des) {
3223             des = param->des;
3224             len = param->len;
3225             param->des = 0;
3226             qemu_mutex_unlock(&param->mutex);
3227 
3228             pagesize = TARGET_PAGE_SIZE;
3229 
3230             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3231                                        param->compbuf, len);
3232             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3233                 error_report("decompress data failed");
3234                 qemu_file_set_error(decomp_file, ret);
3235             }
3236 
3237             qemu_mutex_lock(&decomp_done_lock);
3238             param->done = true;
3239             qemu_cond_signal(&decomp_done_cond);
3240             qemu_mutex_unlock(&decomp_done_lock);
3241 
3242             qemu_mutex_lock(&param->mutex);
3243         } else {
3244             qemu_cond_wait(&param->cond, &param->mutex);
3245         }
3246     }
3247     qemu_mutex_unlock(&param->mutex);
3248 
3249     return NULL;
3250 }
3251 
3252 static int wait_for_decompress_done(void)
3253 {
3254     int idx, thread_count;
3255 
3256     if (!migrate_use_compression()) {
3257         return 0;
3258     }
3259 
3260     thread_count = migrate_decompress_threads();
3261     qemu_mutex_lock(&decomp_done_lock);
3262     for (idx = 0; idx < thread_count; idx++) {
3263         while (!decomp_param[idx].done) {
3264             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3265         }
3266     }
3267     qemu_mutex_unlock(&decomp_done_lock);
3268     return qemu_file_get_error(decomp_file);
3269 }
3270 
3271 static void compress_threads_load_cleanup(void)
3272 {
3273     int i, thread_count;
3274 
3275     if (!migrate_use_compression()) {
3276         return;
3277     }
3278     thread_count = migrate_decompress_threads();
3279     for (i = 0; i < thread_count; i++) {
3280         /*
3281          * We use compbuf as an indicator of whether the thread has
3282          * been properly initialized or not.
3283          */
3284         if (!decomp_param[i].compbuf) {
3285             break;
3286         }
3287 
3288         qemu_mutex_lock(&decomp_param[i].mutex);
3289         decomp_param[i].quit = true;
3290         qemu_cond_signal(&decomp_param[i].cond);
3291         qemu_mutex_unlock(&decomp_param[i].mutex);
3292     }
3293     for (i = 0; i < thread_count; i++) {
3294         if (!decomp_param[i].compbuf) {
3295             break;
3296         }
3297 
3298         qemu_thread_join(decompress_threads + i);
3299         qemu_mutex_destroy(&decomp_param[i].mutex);
3300         qemu_cond_destroy(&decomp_param[i].cond);
3301         inflateEnd(&decomp_param[i].stream);
3302         g_free(decomp_param[i].compbuf);
3303         decomp_param[i].compbuf = NULL;
3304     }
3305     g_free(decompress_threads);
3306     g_free(decomp_param);
3307     decompress_threads = NULL;
3308     decomp_param = NULL;
3309     decomp_file = NULL;
3310 }
3311 
3312 static int compress_threads_load_setup(QEMUFile *f)
3313 {
3314     int i, thread_count;
3315 
3316     if (!migrate_use_compression()) {
3317         return 0;
3318     }
3319 
3320     thread_count = migrate_decompress_threads();
3321     decompress_threads = g_new0(QemuThread, thread_count);
3322     decomp_param = g_new0(DecompressParam, thread_count);
3323     qemu_mutex_init(&decomp_done_lock);
3324     qemu_cond_init(&decomp_done_cond);
3325     decomp_file = f;
3326     for (i = 0; i < thread_count; i++) {
3327         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3328             goto exit;
3329         }
3330 
3331         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3332         qemu_mutex_init(&decomp_param[i].mutex);
3333         qemu_cond_init(&decomp_param[i].cond);
3334         decomp_param[i].done = true;
3335         decomp_param[i].quit = false;
3336         qemu_thread_create(decompress_threads + i, "decompress",
3337                            do_data_decompress, decomp_param + i,
3338                            QEMU_THREAD_JOINABLE);
3339     }
3340     return 0;
3341 exit:
3342     compress_threads_load_cleanup();
3343     return -1;
3344 }
3345 
3346 static void decompress_data_with_multi_threads(QEMUFile *f,
3347                                                void *host, int len)
3348 {
3349     int idx, thread_count;
3350 
3351     thread_count = migrate_decompress_threads();
3352     QEMU_LOCK_GUARD(&decomp_done_lock);
3353     while (true) {
3354         for (idx = 0; idx < thread_count; idx++) {
3355             if (decomp_param[idx].done) {
3356                 decomp_param[idx].done = false;
3357                 qemu_mutex_lock(&decomp_param[idx].mutex);
3358                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3359                 decomp_param[idx].des = host;
3360                 decomp_param[idx].len = len;
3361                 qemu_cond_signal(&decomp_param[idx].cond);
3362                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3363                 break;
3364             }
3365         }
3366         if (idx < thread_count) {
3367             break;
3368         } else {
3369             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3370         }
3371     }
3372 }
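/*
 * Handoff protocol between the feeder (decompress_data_with_multi_threads)
 * and the worker threads (do_data_decompress), summarised:
 *
 *   feeder:  wait until some decomp_param[idx].done is true
 *            done = false; fill compbuf/des/len; signal param->cond
 *   worker:  decompress compbuf into des
 *            take decomp_done_lock; done = true; signal decomp_done_cond
 *
 * wait_for_decompress_done() then simply waits until every worker has
 * done == true again, i.e. all outstanding pages have been decompressed.
 */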
3373 
3374 static void colo_init_ram_state(void)
3375 {
3376     ram_state_init(&ram_state);
3377 }
3378 
3379 /*
3380  * colo cache: this is for the secondary VM.  We cache the whole
3381  * memory of the secondary VM; the global lock must be held when
3382  * calling this helper.
3383  */
3384 int colo_init_ram_cache(void)
3385 {
3386     RAMBlock *block;
3387 
3388     WITH_RCU_READ_LOCK_GUARD() {
3389         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3390             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3391                                                     NULL, false, false);
3392             if (!block->colo_cache) {
3393                 error_report("%s: Can't alloc memory for COLO cache of block %s, "
3394                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3395                              block->used_length);
3396                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3397                     if (block->colo_cache) {
3398                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3399                         block->colo_cache = NULL;
3400                     }
3401                 }
3402                 return -errno;
3403             }
3404         }
3405     }
3406 
3407     /*
3408      * Record the dirty pages sent by the PVM; this dirty bitmap is used to
3409      * decide which pages in the cache should be flushed into the SVM's RAM.
3410      * Here we use the same name 'ram_bitmap' as for migration.
3411      */
3412     if (ram_bytes_total()) {
3413         RAMBlock *block;
3414 
3415         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3416             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3417             block->bmap = bitmap_new(pages);
3418         }
3419     }
3420 
3421     colo_init_ram_state();
3422     return 0;
3423 }
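/*
 * Sizing example for the bitmaps allocated above (illustrative numbers):
 * a 4 GiB RAMBlock with 4 KiB target pages has
 *   pages = 0x100000000 >> 12 = 1048576 bits,
 * so bitmap_new() allocates 1048576 / 8 = 128 KiB for that block, one bit
 * per target page that may later need to be flushed into the SVM's RAM.
 */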
3424 
3425 /* TODO: duplicated with ram_init_bitmaps */
3426 void colo_incoming_start_dirty_log(void)
3427 {
3428     RAMBlock *block = NULL;
3429     /* For memory_global_dirty_log_start below. */
3430     qemu_mutex_lock_iothread();
3431     qemu_mutex_lock_ramlist();
3432 
3433     memory_global_dirty_log_sync();
3434     WITH_RCU_READ_LOCK_GUARD() {
3435         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3436             ramblock_sync_dirty_bitmap(ram_state, block);
3437             /* Discard this dirty bitmap record */
3438             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3439         }
3440         memory_global_dirty_log_start();
3441     }
3442     ram_state->migration_dirty_pages = 0;
3443     qemu_mutex_unlock_ramlist();
3444     qemu_mutex_unlock_iothread();
3445 }
3446 
3447 /* The global lock must be held to call this helper */
3448 void colo_release_ram_cache(void)
3449 {
3450     RAMBlock *block;
3451 
3452     memory_global_dirty_log_stop();
3453     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3454         g_free(block->bmap);
3455         block->bmap = NULL;
3456     }
3457 
3458     WITH_RCU_READ_LOCK_GUARD() {
3459         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3460             if (block->colo_cache) {
3461                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3462                 block->colo_cache = NULL;
3463             }
3464         }
3465     }
3466     ram_state_cleanup(&ram_state);
3467 }
3468 
3469 /**
3470  * ram_load_setup: Setup RAM for migration incoming side
3471  *
3472  * Returns zero to indicate success and negative for error
3473  *
3474  * @f: QEMUFile where to receive the data
3475  * @opaque: RAMState pointer
3476  */
3477 static int ram_load_setup(QEMUFile *f, void *opaque)
3478 {
3479     if (compress_threads_load_setup(f)) {
3480         return -1;
3481     }
3482 
3483     xbzrle_load_setup();
3484     ramblock_recv_map_init();
3485 
3486     return 0;
3487 }
3488 
3489 static int ram_load_cleanup(void *opaque)
3490 {
3491     RAMBlock *rb;
3492 
3493     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3494         qemu_ram_block_writeback(rb);
3495     }
3496 
3497     xbzrle_load_cleanup();
3498     compress_threads_load_cleanup();
3499 
3500     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3501         g_free(rb->receivedmap);
3502         rb->receivedmap = NULL;
3503     }
3504 
3505     return 0;
3506 }
3507 
3508 /**
3509  * ram_postcopy_incoming_init: allocate postcopy data structures
3510  *
3511  * Returns 0 for success and negative if there was one error
3512  *
3513  * @mis: current migration incoming state
3514  *
3515  * Allocate the data structures etc. needed by incoming migration with
3516  * postcopy-ram.  The similarly named postcopy_ram_incoming_init() in
3517  * postcopy-ram does the work.
3518  */
3519 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3520 {
3521     return postcopy_ram_incoming_init(mis);
3522 }
3523 
3524 /**
3525  * ram_load_postcopy: load a page in postcopy case
3526  *
3527  * Returns 0 for success or -errno in case of error
3528  *
3529  * Called in postcopy mode by ram_load().
3530  * rcu_read_lock is taken prior to this being called.
3531  *
3532  * @f: QEMUFile to read the data from
3533  */
3534 static int ram_load_postcopy(QEMUFile *f)
3535 {
3536     int flags = 0, ret = 0;
3537     bool place_needed = false;
3538     bool matches_target_page_size = false;
3539     MigrationIncomingState *mis = migration_incoming_get_current();
3540     /* Temporary page that is later 'placed' */
3541     void *postcopy_host_page = mis->postcopy_tmp_page;
3542     void *host_page = NULL;
3543     bool all_zero = true;
3544     int target_pages = 0;
3545 
3546     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3547         ram_addr_t addr;
3548         void *page_buffer = NULL;
3549         void *place_source = NULL;
3550         RAMBlock *block = NULL;
3551         uint8_t ch;
3552         int len;
3553 
3554         addr = qemu_get_be64(f);
3555 
3556         /*
3557          * If there is a QEMUFile error, we should stop here; "addr"
3558          * may be invalid.
3559          */
3560         ret = qemu_file_get_error(f);
3561         if (ret) {
3562             break;
3563         }
3564 
3565         flags = addr & ~TARGET_PAGE_MASK;
3566         addr &= TARGET_PAGE_MASK;
3567 
3568         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3569         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3570                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3571             block = ram_block_from_stream(f, flags);
3572             if (!block) {
3573                 ret = -EINVAL;
3574                 break;
3575             }
3576 
3577             /*
3578              * Relying on used_length is racy and can result in false positives.
3579              * We might place pages beyond used_length in case RAM was shrunk
3580              * while in postcopy, which is fine - trying to place via
3581              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3582              */
3583             if (!block->host || addr >= block->postcopy_length) {
3584                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3585                 ret = -EINVAL;
3586                 break;
3587             }
3588             target_pages++;
3589             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3590             /*
3591              * Postcopy requires that we place whole host pages atomically;
3592              * these may be huge pages for RAMBlocks that are backed by
3593              * hugetlbfs.
3594              * To make it atomic, the data is read into a temporary page
3595              * that's moved into place later.
3596              * The migration protocol uses (possibly smaller) target pages;
3597              * however, the source ensures it always sends all the components
3598              * of a host page in one chunk.
3599              */
3600             page_buffer = postcopy_host_page +
3601                           host_page_offset_from_ram_block_offset(block, addr);
3602             /* If all target pages are zero then we can optimise the placement */
3603             if (target_pages == 1) {
3604                 host_page = host_page_from_ram_block_offset(block, addr);
3605             } else if (host_page != host_page_from_ram_block_offset(block,
3606                                                                     addr)) {
3607                 /* not the 1st TP within the HP */
3608                 error_report("Non-same host page %p/%p", host_page,
3609                              host_page_from_ram_block_offset(block, addr));
3610                 ret = -EINVAL;
3611                 break;
3612             }
3613 
3614             /*
3615              * If it's the last part of a host page then we place the host
3616              * page
3617              */
3618             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3619                 place_needed = true;
3620             }
3621             place_source = postcopy_host_page;
3622         }
3623 
3624         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3625         case RAM_SAVE_FLAG_ZERO:
3626             ch = qemu_get_byte(f);
3627             /*
3628              * We can skip filling page_buffer when this is a zero page
3629              * and (block->page_size == TARGET_PAGE_SIZE).
3630              */
3631             if (ch || !matches_target_page_size) {
3632                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3633             }
3634             if (ch) {
3635                 all_zero = false;
3636             }
3637             break;
3638 
3639         case RAM_SAVE_FLAG_PAGE:
3640             all_zero = false;
3641             if (!matches_target_page_size) {
3642                 /* For huge pages, we always use temporary buffer */
3643                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3644             } else {
3645                 /*
3646                  * For small pages that matches target page size, we
3647                  * avoid the qemu_file copy.  Instead we directly use
3648                  * the buffer of QEMUFile to place the page.  Note: we
3649                  * cannot do any QEMUFile operation before using that
3650                  * buffer to make sure the buffer is valid when
3651                  * placing the page.
3652                  */
3653                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3654                                          TARGET_PAGE_SIZE);
3655             }
3656             break;
3657         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3658             all_zero = false;
3659             len = qemu_get_be32(f);
3660             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3661                 error_report("Invalid compressed data length: %d", len);
3662                 ret = -EINVAL;
3663                 break;
3664             }
3665             decompress_data_with_multi_threads(f, page_buffer, len);
3666             break;
3667 
3668         case RAM_SAVE_FLAG_EOS:
3669             /* normal exit */
3670             multifd_recv_sync_main();
3671             break;
3672         default:
3673             error_report("Unknown combination of migration flags: 0x%x"
3674                          " (postcopy mode)", flags);
3675             ret = -EINVAL;
3676             break;
3677         }
3678 
3679         /* Got the whole host page, wait for decompress before placing. */
3680         if (place_needed) {
3681             ret |= wait_for_decompress_done();
3682         }
3683 
3684         /* Detect any possible file errors */
3685         if (!ret && qemu_file_get_error(f)) {
3686             ret = qemu_file_get_error(f);
3687         }
3688 
3689         if (!ret && place_needed) {
3690             if (all_zero) {
3691                 ret = postcopy_place_page_zero(mis, host_page, block);
3692             } else {
3693                 ret = postcopy_place_page(mis, host_page, place_source,
3694                                           block);
3695             }
3696             place_needed = false;
3697             target_pages = 0;
3698             /* Assume we have a zero page until we detect something different */
3699             all_zero = true;
3700         }
3701     }
3702 
3703     return ret;
3704 }
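/*
 * Worked example for the host-page assembly above: with 4 KiB target pages
 * and a RAMBlock backed by 2 MiB hugetlbfs pages, block->page_size /
 * TARGET_PAGE_SIZE == 512, so 512 consecutive target pages are accumulated
 * in postcopy_host_page and only the 512th sets place_needed, at which point
 * the whole huge page is placed atomically via UFFDIO_COPY (or the zero-page
 * path when all_zero is still true).
 */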
3705 
3706 static bool postcopy_is_advised(void)
3707 {
3708     PostcopyState ps = postcopy_state_get();
3709     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3710 }
3711 
3712 static bool postcopy_is_running(void)
3713 {
3714     PostcopyState ps = postcopy_state_get();
3715     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3716 }
3717 
3718 /*
3719  * Flush content of RAM cache into SVM's memory.
3720  * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
3721  */
3722 void colo_flush_ram_cache(void)
3723 {
3724     RAMBlock *block = NULL;
3725     void *dst_host;
3726     void *src_host;
3727     unsigned long offset = 0;
3728 
3729     memory_global_dirty_log_sync();
3730     qemu_mutex_lock(&ram_state->bitmap_mutex);
3731     WITH_RCU_READ_LOCK_GUARD() {
3732         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3733             ramblock_sync_dirty_bitmap(ram_state, block);
3734         }
3735     }
3736 
3737     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3738     WITH_RCU_READ_LOCK_GUARD() {
3739         block = QLIST_FIRST_RCU(&ram_list.blocks);
3740 
3741         while (block) {
3742             offset = migration_bitmap_find_dirty(ram_state, block, offset);
3743 
3744             if (!offset_in_ramblock(block,
3745                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3746                 offset = 0;
3747                 block = QLIST_NEXT_RCU(block, next);
3748             } else {
3749                 migration_bitmap_clear_dirty(ram_state, block, offset);
3750                 dst_host = block->host
3751                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3752                 src_host = block->colo_cache
3753                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3754                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3755             }
3756         }
3757     }
3758     trace_colo_flush_ram_cache_end();
3759     qemu_mutex_unlock(&ram_state->bitmap_mutex);
3760 }
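/*
 * The flush loop above repeatedly asks migration_bitmap_find_dirty() for the
 * next dirty page offset and moves to the next block once the offset runs
 * past the block.  A minimal sketch of that "find next dirty bit" step over
 * a plain 64-bit bitmap (illustrative only, not the code actually used):
 */
#if 0   /* illustrative sketch, not built */
#include <stdint.h>

static long example_find_next_dirty(const uint64_t *bmap, long nbits,
                                    long start)
{
    for (long i = start; i < nbits; i++) {
        if (bmap[i / 64] & (1ULL << (i % 64))) {
            return i;           /* first dirty page at or after 'start' */
        }
    }
    return nbits;               /* none left: caller moves to the next block */
}
#endif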
3761 
3762 /**
3763  * ram_load_precopy: load pages in precopy case
3764  *
3765  * Returns 0 for success or -errno in case of error
3766  *
3767  * Called in precopy mode by ram_load().
3768  * rcu_read_lock is taken prior to this being called.
3769  *
3770  * @f: QEMUFile to read the data from
3771  */
3772 static int ram_load_precopy(QEMUFile *f)
3773 {
3774     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3775     /* ADVISE is earlier; it shows that the source has the postcopy capability enabled */
3776     bool postcopy_advised = postcopy_is_advised();
3777     if (!migrate_use_compression()) {
3778         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3779     }
3780 
3781     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3782         ram_addr_t addr, total_ram_bytes;
3783         void *host = NULL, *host_bak = NULL;
3784         uint8_t ch;
3785 
3786         /*
3787          * Yield periodically to let the main loop run, but an iteration of
3788          * the main loop is expensive, so only do it every so many iterations.
3789          */
3790         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3791             aio_co_schedule(qemu_get_current_aio_context(),
3792                             qemu_coroutine_self());
3793             qemu_coroutine_yield();
3794         }
3795         i++;
3796 
3797         addr = qemu_get_be64(f);
3798         flags = addr & ~TARGET_PAGE_MASK;
3799         addr &= TARGET_PAGE_MASK;
3800 
3801         if (flags & invalid_flags) {
3802             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3803                 error_report("Received an unexpected compressed page");
3804             }
3805 
3806             ret = -EINVAL;
3807             break;
3808         }
3809 
3810         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3811                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3812             RAMBlock *block = ram_block_from_stream(f, flags);
3813 
3814             host = host_from_ram_block_offset(block, addr);
3815             /*
3816              * After entering the COLO stage, we should not load pages into
3817              * the SVM's memory directly; we put them into colo_cache first.
3818              * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3819              * Previously, we copied all of this memory in the COLO
3820              * preparation stage, while the VM had to be stopped, which is
3821              * time-consuming.  Here we optimize it with a trick: back up
3822              * every page during migration while COLO is enabled.  Although
3823              * this slows the migration down, it clearly reduces the downtime
3824              * of backing up all the SVM's memory in the COLO preparation stage.
3825              */
3826             if (migration_incoming_colo_enabled()) {
3827                 if (migration_incoming_in_colo_state()) {
3828                     /* In COLO stage, put all pages into cache temporarily */
3829                     host = colo_cache_from_block_offset(block, addr, true);
3830                 } else {
3831                     /*
3832                      * In the migration stage but before the COLO stage,
3833                      * put all pages into both the cache and the SVM's memory.
3834                      */
3835                     host_bak = colo_cache_from_block_offset(block, addr, false);
3836                 }
3837             }
3838             if (!host) {
3839                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3840                 ret = -EINVAL;
3841                 break;
3842             }
3843             if (!migration_incoming_in_colo_state()) {
3844                 ramblock_recv_bitmap_set(block, host);
3845             }
3846 
3847             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3848         }
3849 
3850         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3851         case RAM_SAVE_FLAG_MEM_SIZE:
3852             /* Synchronize RAM block list */
3853             total_ram_bytes = addr;
3854             while (!ret && total_ram_bytes) {
3855                 RAMBlock *block;
3856                 char id[256];
3857                 ram_addr_t length;
3858 
3859                 len = qemu_get_byte(f);
3860                 qemu_get_buffer(f, (uint8_t *)id, len);
3861                 id[len] = 0;
3862                 length = qemu_get_be64(f);
3863 
3864                 block = qemu_ram_block_by_name(id);
3865                 if (block && !qemu_ram_is_migratable(block)) {
3866                     error_report("block %s should not be migrated!", id);
3867                     ret = -EINVAL;
3868                 } else if (block) {
3869                     if (length != block->used_length) {
3870                         Error *local_err = NULL;
3871 
3872                         ret = qemu_ram_resize(block, length,
3873                                               &local_err);
3874                         if (local_err) {
3875                             error_report_err(local_err);
3876                         }
3877                     }
3878                     /* For postcopy we need to check hugepage sizes match */
3879                     if (postcopy_advised && migrate_postcopy_ram() &&
3880                         block->page_size != qemu_host_page_size) {
3881                         uint64_t remote_page_size = qemu_get_be64(f);
3882                         if (remote_page_size != block->page_size) {
3883                             error_report("Mismatched RAM page size %s "
3884                                          "(local) %zd != %" PRId64,
3885                                          id, block->page_size,
3886                                          remote_page_size);
3887                             ret = -EINVAL;
3888                         }
3889                     }
3890                     if (migrate_ignore_shared()) {
3891                         hwaddr addr = qemu_get_be64(f);
3892                         if (ramblock_is_ignored(block) &&
3893                             block->mr->addr != addr) {
3894                             error_report("Mismatched GPAs for block %s "
3895                                          "%" PRId64 "!= %" PRId64,
3896                                          id, (uint64_t)addr,
3897                                          (uint64_t)block->mr->addr);
3898                             ret = -EINVAL;
3899                         }
3900                     }
3901                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3902                                           block->idstr);
3903                 } else {
3904                     error_report("Unknown ramblock \"%s\", cannot "
3905                                  "accept migration", id);
3906                     ret = -EINVAL;
3907                 }
3908 
3909                 total_ram_bytes -= length;
3910             }
3911             break;
3912 
3913         case RAM_SAVE_FLAG_ZERO:
3914             ch = qemu_get_byte(f);
3915             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3916             break;
3917 
3918         case RAM_SAVE_FLAG_PAGE:
3919             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3920             break;
3921 
3922         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3923             len = qemu_get_be32(f);
3924             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3925                 error_report("Invalid compressed data length: %d", len);
3926                 ret = -EINVAL;
3927                 break;
3928             }
3929             decompress_data_with_multi_threads(f, host, len);
3930             break;
3931 
3932         case RAM_SAVE_FLAG_XBZRLE:
3933             if (load_xbzrle(f, addr, host) < 0) {
3934                 error_report("Failed to decompress XBZRLE page at "
3935                              RAM_ADDR_FMT, addr);
3936                 ret = -EINVAL;
3937                 break;
3938             }
3939             break;
3940         case RAM_SAVE_FLAG_EOS:
3941             /* normal exit */
3942             multifd_recv_sync_main();
3943             break;
3944         default:
3945             if (flags & RAM_SAVE_FLAG_HOOK) {
3946                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3947             } else {
3948                 error_report("Unknown combination of migration flags: 0x%x",
3949                              flags);
3950                 ret = -EINVAL;
3951             }
3952         }
3953         if (!ret) {
3954             ret = qemu_file_get_error(f);
3955         }
3956         if (!ret && host_bak) {
3957             memcpy(host_bak, host, TARGET_PAGE_SIZE);
3958         }
3959     }
3960 
3961     ret |= wait_for_decompress_done();
3962     return ret;
3963 }
3964 
3965 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3966 {
3967     int ret = 0;
3968     static uint64_t seq_iter;
3969     /*
3970      * If the system is running in postcopy mode, page inserts into host
3971      * memory must be atomic.
3972      */
3973     bool postcopy_running = postcopy_is_running();
3974 
3975     seq_iter++;
3976 
3977     if (version_id != 4) {
3978         return -EINVAL;
3979     }
3980 
3981     /*
3982      * This RCU critical section can be very long running.
3983      * When RCU reclaims in the code start to become numerous,
3984      * it will be necessary to reduce the granularity of this
3985      * critical section.
3986      */
3987     WITH_RCU_READ_LOCK_GUARD() {
3988         if (postcopy_running) {
3989             ret = ram_load_postcopy(f);
3990         } else {
3991             ret = ram_load_precopy(f);
3992         }
3993     }
3994     trace_ram_load_complete(ret, seq_iter);
3995 
3996     return ret;
3997 }
3998 
3999 static bool ram_has_postcopy(void *opaque)
4000 {
4001     RAMBlock *rb;
4002     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4003         if (ramblock_is_pmem(rb)) {
4004             info_report("Block: %s, host: %p is an nvdimm memory; postcopy "
4005                         "is not supported yet!", rb->idstr, rb->host);
4006             return false;
4007         }
4008     }
4009 
4010     return migrate_postcopy_ram();
4011 }
4012 
4013 /* Sync all the dirty bitmap with destination VM.  */
4014 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4015 {
4016     RAMBlock *block;
4017     QEMUFile *file = s->to_dst_file;
4018     int ramblock_count = 0;
4019 
4020     trace_ram_dirty_bitmap_sync_start();
4021 
4022     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4023         qemu_savevm_send_recv_bitmap(file, block->idstr);
4024         trace_ram_dirty_bitmap_request(block->idstr);
4025         ramblock_count++;
4026     }
4027 
4028     trace_ram_dirty_bitmap_sync_wait();
4029 
4030     /* Wait until all the ramblocks' dirty bitmaps have been synced */
4031     while (ramblock_count--) {
4032         qemu_sem_wait(&s->rp_state.rp_sem);
4033     }
4034 
4035     trace_ram_dirty_bitmap_sync_complete();
4036 
4037     return 0;
4038 }
4039 
4040 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4041 {
4042     qemu_sem_post(&s->rp_state.rp_sem);
4043 }
4044 
4045 /*
4046  * Read the received bitmap, revert it as the initial dirty bitmap.
4047  * This is only used when the postcopy migration is paused but wants
4048  * to resume from a middle point.
4049  */
4050 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4051 {
4052     int ret = -EINVAL;
4053     /* from_dst_file is always valid because we're within rp_thread */
4054     QEMUFile *file = s->rp_state.from_dst_file;
4055     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4056     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4057     uint64_t size, end_mark;
4058 
4059     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4060 
4061     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4062         error_report("%s: incorrect state %s", __func__,
4063                      MigrationStatus_str(s->state));
4064         return -EINVAL;
4065     }
4066 
4067     /*
4068      * Note: see comments in ramblock_recv_bitmap_send() on why we
4069      * need the endianness conversion, and the paddings.
4070      */
4071     local_size = ROUND_UP(local_size, 8);
4072 
4073     /* Add paddings */
4074     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4075 
4076     size = qemu_get_be64(file);
4077 
4078     /* The size of the bitmap should match with our ramblock */
4079     if (size != local_size) {
4080         error_report("%s: ramblock '%s' bitmap size mismatch "
4081                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4082                      block->idstr, size, local_size);
4083         ret = -EINVAL;
4084         goto out;
4085     }
4086 
4087     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4088     end_mark = qemu_get_be64(file);
4089 
4090     ret = qemu_file_get_error(file);
4091     if (ret || size != local_size) {
4092         error_report("%s: read bitmap failed for ramblock '%s': %d"
4093                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4094                      __func__, block->idstr, ret, local_size, size);
4095         ret = -EIO;
4096         goto out;
4097     }
4098 
4099     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4100         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4101                      __func__, block->idstr, end_mark);
4102         ret = -EINVAL;
4103         goto out;
4104     }
4105 
4106     /*
4107      * Endianness conversion. We are during postcopy (though paused).
4108      * The dirty bitmap won't change. We can directly modify it.
4109      */
4110     bitmap_from_le(block->bmap, le_bitmap, nbits);
4111 
4112     /*
4113      * What we received is "received bitmap". Revert it as the initial
4114      * dirty bitmap for this ramblock.
4115      */
4116     bitmap_complement(block->bmap, block->bmap, nbits);
4117 
4118     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4119 
4120     /*
4121      * We succeeded in syncing the bitmap for the current ramblock. If this is
4122      * the last one to sync, we need to notify the main send thread.
4123      */
4124     ram_dirty_bitmap_reload_notify(s);
4125 
4126     ret = 0;
4127 out:
4128     g_free(le_bitmap);
4129     return ret;
4130 }
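/*
 * On-the-wire layout consumed by ram_dirty_bitmap_reload(), as read above:
 *
 *   be64   size      bitmap size in bytes: DIV_ROUND_UP(nbits, 8), rounded
 *                    up to a multiple of 8
 *   bytes  bitmap    little-endian bitmap of 'size' bytes
 *   be64   end_mark  RAMBLOCK_RECV_BITMAP_ENDING sentinel
 *
 * The received bits mean "page already received", so after the little-endian
 * conversion the bitmap is complemented to become the initial dirty bitmap
 * (pages still to be sent) for the resumed postcopy migration.
 */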
4131 
4132 static int ram_resume_prepare(MigrationState *s, void *opaque)
4133 {
4134     RAMState *rs = *(RAMState **)opaque;
4135     int ret;
4136 
4137     ret = ram_dirty_bitmap_sync_all(s, rs);
4138     if (ret) {
4139         return ret;
4140     }
4141 
4142     ram_state_resume_prepare(rs, s->to_dst_file);
4143 
4144     return 0;
4145 }
4146 
4147 static SaveVMHandlers savevm_ram_handlers = {
4148     .save_setup = ram_save_setup,
4149     .save_live_iterate = ram_save_iterate,
4150     .save_live_complete_postcopy = ram_save_complete,
4151     .save_live_complete_precopy = ram_save_complete,
4152     .has_postcopy = ram_has_postcopy,
4153     .save_live_pending = ram_save_pending,
4154     .load_state = ram_load,
4155     .save_cleanup = ram_save_cleanup,
4156     .load_setup = ram_load_setup,
4157     .load_cleanup = ram_load_cleanup,
4158     .resume_prepare = ram_resume_prepare,
4159 };
4160 
4161 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4162                                       size_t old_size, size_t new_size)
4163 {
4164     PostcopyState ps = postcopy_state_get();
4165     ram_addr_t offset;
4166     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4167     Error *err = NULL;
4168 
4169     if (ramblock_is_ignored(rb)) {
4170         return;
4171     }
4172 
4173     if (!migration_is_idle()) {
4174         /*
4175          * Precopy code on the source cannot deal with the size of RAM blocks
4176          * changing at random points in time - especially after sending the
4177          * RAM block sizes in the migration stream, they must no longer change.
4178          * Abort and indicate a proper reason.
4179          */
4180         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4181         migrate_set_error(migrate_get_current(), err);
4182         error_free(err);
4183         migration_cancel();
4184     }
4185 
4186     switch (ps) {
4187     case POSTCOPY_INCOMING_ADVISE:
4188         /*
4189          * Update what ram_postcopy_incoming_init()->init_range() does at the
4190          * time postcopy was advised. Syncing RAM blocks with the source will
4191          * result in RAM resizes.
4192          */
4193         if (old_size < new_size) {
4194             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4195                 error_report("RAM block '%s' discard of resized RAM failed",
4196                              rb->idstr);
4197             }
4198         }
4199         rb->postcopy_length = new_size;
4200         break;
4201     case POSTCOPY_INCOMING_NONE:
4202     case POSTCOPY_INCOMING_RUNNING:
4203     case POSTCOPY_INCOMING_END:
4204         /*
4205          * Once our guest is running, postcopy no longer cares about
4206          * resizes. When growing, the new memory was not available on the
4207          * source, no handler needed.
4208          */
4209         break;
4210     default:
4211         error_report("RAM block '%s' resized during postcopy state: %d",
4212                      rb->idstr, ps);
4213         exit(-1);
4214     }
4215 }
4216 
4217 static RAMBlockNotifier ram_mig_ram_notifier = {
4218     .ram_block_resized = ram_mig_ram_block_resized,
4219 };
4220 
4221 void ram_mig_init(void)
4222 {
4223     qemu_mutex_init(&XBZRLE.lock);
4224     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4225     ram_block_notifier_add(&ram_mig_ram_notifier);
4226 }
4227