xref: /qemu/migration/ram.c (revision 814bb12a)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 #include "qemu/osdep.h"
29 #include "qemu-common.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qapi-event.h"
33 #include "qemu/cutils.h"
34 #include "qemu/bitops.h"
35 #include "qemu/bitmap.h"
36 #include "qemu/timer.h"
37 #include "qemu/main-loop.h"
38 #include "migration/migration.h"
39 #include "migration/postcopy-ram.h"
40 #include "exec/address-spaces.h"
41 #include "migration/page_cache.h"
42 #include "qemu/error-report.h"
43 #include "trace.h"
44 #include "exec/ram_addr.h"
45 #include "qemu/rcu_queue.h"
46 
47 #ifdef DEBUG_MIGRATION_RAM
48 #define DPRINTF(fmt, ...) \
49     do { fprintf(stdout, "migration_ram: " fmt, ## __VA_ARGS__); } while (0)
50 #else
51 #define DPRINTF(fmt, ...) \
52     do { } while (0)
53 #endif
54 
55 static int dirty_rate_high_cnt;
56 
57 static uint64_t bitmap_sync_count;
58 
59 /***********************************************************/
60 /* ram save/restore */
61 
62 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
63 #define RAM_SAVE_FLAG_COMPRESS 0x02
64 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
65 #define RAM_SAVE_FLAG_PAGE     0x08
66 #define RAM_SAVE_FLAG_EOS      0x10
67 #define RAM_SAVE_FLAG_CONTINUE 0x20
68 #define RAM_SAVE_FLAG_XBZRLE   0x40
69 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
70 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
71 
72 static uint8_t *ZERO_TARGET_PAGE;
73 
74 static inline bool is_zero_range(uint8_t *p, uint64_t size)
75 {
76     return buffer_is_zero(p, size);
77 }
78 
79 /* This struct contains the XBZRLE cache and a static page
80    used by the compression */
81 static struct {
82     /* buffer used for XBZRLE encoding */
83     uint8_t *encoded_buf;
84     /* buffer for storing page content */
85     uint8_t *current_buf;
86     /* Cache for XBZRLE, Protected by lock. */
87     PageCache *cache;
88     QemuMutex lock;
89 } XBZRLE;
90 
91 /* buffer used for XBZRLE decoding */
92 static uint8_t *xbzrle_decoded_buf;
93 
94 static void XBZRLE_cache_lock(void)
95 {
96     if (migrate_use_xbzrle())
97         qemu_mutex_lock(&XBZRLE.lock);
98 }
99 
100 static void XBZRLE_cache_unlock(void)
101 {
102     if (migrate_use_xbzrle())
103         qemu_mutex_unlock(&XBZRLE.lock);
104 }
105 
106 /*
107  * Called from qmp_migrate_set_cache_size in the main thread, possibly while
108  * a migration is in progress.
109  * A running migration may be using the cache and might finish during this
110  * call, hence changes to the cache are protected by XBZRLE.lock.
111  */
112 int64_t xbzrle_cache_resize(int64_t new_size)
113 {
114     PageCache *new_cache;
115     int64_t ret;
116 
117     if (new_size < TARGET_PAGE_SIZE) {
118         return -1;
119     }
120 
121     XBZRLE_cache_lock();
122 
123     if (XBZRLE.cache != NULL) {
124         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
125             goto out_new_size;
126         }
127         new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
128                                         TARGET_PAGE_SIZE);
129         if (!new_cache) {
130             error_report("Error creating cache");
131             ret = -1;
132             goto out;
133         }
134 
135         cache_fini(XBZRLE.cache);
136         XBZRLE.cache = new_cache;
137     }
138 
139 out_new_size:
140     ret = pow2floor(new_size);
141 out:
142     XBZRLE_cache_unlock();
143     return ret;
144 }
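/*
 * Illustrative sketch of how a caller might use the resize: the request is
 * rounded down to a power of two, so the returned value is the size that is
 * actually in use (hypothetical caller, values only for illustration):
 *
 *     int64_t actual = xbzrle_cache_resize(wanted);
 *     if (actual < 0) {
 *         // 'wanted' was below TARGET_PAGE_SIZE or the cache allocation failed
 *     } else {
 *         // actual == pow2floor(wanted), possibly smaller than 'wanted'
 *     }
 */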
145 
146 /* accounting for migration statistics */
147 typedef struct AccountingInfo {
148     uint64_t dup_pages;
149     uint64_t skipped_pages;
150     uint64_t norm_pages;
151     uint64_t iterations;
152     uint64_t xbzrle_bytes;
153     uint64_t xbzrle_pages;
154     uint64_t xbzrle_cache_miss;
155     double xbzrle_cache_miss_rate;
156     uint64_t xbzrle_overflows;
157 } AccountingInfo;
158 
159 static AccountingInfo acct_info;
160 
161 static void acct_clear(void)
162 {
163     memset(&acct_info, 0, sizeof(acct_info));
164 }
165 
166 uint64_t dup_mig_bytes_transferred(void)
167 {
168     return acct_info.dup_pages * TARGET_PAGE_SIZE;
169 }
170 
171 uint64_t dup_mig_pages_transferred(void)
172 {
173     return acct_info.dup_pages;
174 }
175 
176 uint64_t skipped_mig_bytes_transferred(void)
177 {
178     return acct_info.skipped_pages * TARGET_PAGE_SIZE;
179 }
180 
181 uint64_t skipped_mig_pages_transferred(void)
182 {
183     return acct_info.skipped_pages;
184 }
185 
186 uint64_t norm_mig_bytes_transferred(void)
187 {
188     return acct_info.norm_pages * TARGET_PAGE_SIZE;
189 }
190 
191 uint64_t norm_mig_pages_transferred(void)
192 {
193     return acct_info.norm_pages;
194 }
195 
196 uint64_t xbzrle_mig_bytes_transferred(void)
197 {
198     return acct_info.xbzrle_bytes;
199 }
200 
201 uint64_t xbzrle_mig_pages_transferred(void)
202 {
203     return acct_info.xbzrle_pages;
204 }
205 
206 uint64_t xbzrle_mig_pages_cache_miss(void)
207 {
208     return acct_info.xbzrle_cache_miss;
209 }
210 
211 double xbzrle_mig_cache_miss_rate(void)
212 {
213     return acct_info.xbzrle_cache_miss_rate;
214 }
215 
216 uint64_t xbzrle_mig_pages_overflow(void)
217 {
218     return acct_info.xbzrle_overflows;
219 }
220 
221 /* This is the last block that we have visited searching for dirty pages
222  */
223 static RAMBlock *last_seen_block;
224 /* This is the last block from where we have sent data */
225 static RAMBlock *last_sent_block;
226 static ram_addr_t last_offset;
227 static QemuMutex migration_bitmap_mutex;
228 static uint64_t migration_dirty_pages;
229 static uint32_t last_version;
230 static bool ram_bulk_stage;
231 
232 /* used by the search for pages to send */
233 struct PageSearchStatus {
234     /* Current block being searched */
235     RAMBlock    *block;
236     /* Current offset to search from */
237     ram_addr_t   offset;
238     /* Set once we wrap around */
239     bool         complete_round;
240 };
241 typedef struct PageSearchStatus PageSearchStatus;
242 
243 static struct BitmapRcu {
244     struct rcu_head rcu;
245     /* Main migration bitmap */
246     unsigned long *bmap;
247     /* bitmap of pages that haven't been sent even once
248      * only maintained and used in postcopy at the moment
249      * where it's used to send the dirtymap at the start
250      * of the postcopy phase
251      */
252     unsigned long *unsentmap;
253 } *migration_bitmap_rcu;
254 
255 struct CompressParam {
256     bool done;
257     bool quit;
258     QEMUFile *file;
259     QemuMutex mutex;
260     QemuCond cond;
261     RAMBlock *block;
262     ram_addr_t offset;
263 };
264 typedef struct CompressParam CompressParam;
265 
266 struct DecompressParam {
267     bool done;
268     bool quit;
269     QemuMutex mutex;
270     QemuCond cond;
271     void *des;
272     uint8_t *compbuf;
273     int len;
274 };
275 typedef struct DecompressParam DecompressParam;
276 
277 static CompressParam *comp_param;
278 static QemuThread *compress_threads;
279 /* comp_done_cond is used to wake up the migration thread when
280  * one of the compression threads has finished the compression.
281  * comp_done_lock is used together with comp_done_cond.
282  */
283 static QemuMutex comp_done_lock;
284 static QemuCond comp_done_cond;
285 /* The empty QEMUFileOps will be used by the 'file' member of CompressParam */
286 static const QEMUFileOps empty_ops = { };
287 
288 static bool compression_switch;
289 static DecompressParam *decomp_param;
290 static QemuThread *decompress_threads;
291 static QemuMutex decomp_done_lock;
292 static QemuCond decomp_done_cond;
293 
294 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
295                                 ram_addr_t offset);
296 
297 static void *do_data_compress(void *opaque)
298 {
299     CompressParam *param = opaque;
300     RAMBlock *block;
301     ram_addr_t offset;
302 
303     qemu_mutex_lock(&param->mutex);
304     while (!param->quit) {
305         if (param->block) {
306             block = param->block;
307             offset = param->offset;
308             param->block = NULL;
309             qemu_mutex_unlock(&param->mutex);
310 
311             do_compress_ram_page(param->file, block, offset);
312 
313             qemu_mutex_lock(&comp_done_lock);
314             param->done = true;
315             qemu_cond_signal(&comp_done_cond);
316             qemu_mutex_unlock(&comp_done_lock);
317 
318             qemu_mutex_lock(&param->mutex);
319         } else {
320             qemu_cond_wait(&param->cond, &param->mutex);
321         }
322     }
323     qemu_mutex_unlock(&param->mutex);
324 
325     return NULL;
326 }
327 
328 static inline void terminate_compression_threads(void)
329 {
330     int idx, thread_count;
331 
332     thread_count = migrate_compress_threads();
333     for (idx = 0; idx < thread_count; idx++) {
334         qemu_mutex_lock(&comp_param[idx].mutex);
335         comp_param[idx].quit = true;
336         qemu_cond_signal(&comp_param[idx].cond);
337         qemu_mutex_unlock(&comp_param[idx].mutex);
338     }
339 }
340 
341 void migrate_compress_threads_join(void)
342 {
343     int i, thread_count;
344 
345     if (!migrate_use_compression()) {
346         return;
347     }
348     terminate_compression_threads();
349     thread_count = migrate_compress_threads();
350     for (i = 0; i < thread_count; i++) {
351         qemu_thread_join(compress_threads + i);
352         qemu_fclose(comp_param[i].file);
353         qemu_mutex_destroy(&comp_param[i].mutex);
354         qemu_cond_destroy(&comp_param[i].cond);
355     }
356     qemu_mutex_destroy(&comp_done_lock);
357     qemu_cond_destroy(&comp_done_cond);
358     g_free(compress_threads);
359     g_free(comp_param);
360     compress_threads = NULL;
361     comp_param = NULL;
362 }
363 
364 void migrate_compress_threads_create(void)
365 {
366     int i, thread_count;
367 
368     if (!migrate_use_compression()) {
369         return;
370     }
371     compression_switch = true;
372     thread_count = migrate_compress_threads();
373     compress_threads = g_new0(QemuThread, thread_count);
374     comp_param = g_new0(CompressParam, thread_count);
375     qemu_cond_init(&comp_done_cond);
376     qemu_mutex_init(&comp_done_lock);
377     for (i = 0; i < thread_count; i++) {
378         /* comp_param[i].file is just used as a dummy buffer to save data,
379          * set its ops to empty.
380          */
381         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
382         comp_param[i].done = true;
383         comp_param[i].quit = false;
384         qemu_mutex_init(&comp_param[i].mutex);
385         qemu_cond_init(&comp_param[i].cond);
386         qemu_thread_create(compress_threads + i, "compress",
387                            do_data_compress, comp_param + i,
388                            QEMU_THREAD_JOINABLE);
389     }
390 }
391 
392 /**
393  * save_page_header: Write page header to wire
394  *
395  * If the RAM_SAVE_FLAG_CONTINUE flag is not set, it also writes the block identification
396  *
397  * Returns: Number of bytes written
398  *
399  * @f: QEMUFile where to send the data
400  * @block: block that contains the page we want to send
401  * @offset: offset inside the block for the page
402  *          in the lower bits, it contains flags
403  */
404 static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
405 {
406     size_t size, len;
407 
408     qemu_put_be64(f, offset);
409     size = 8;
410 
411     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
412         len = strlen(block->idstr);
413         qemu_put_byte(f, len);
414         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
415         size += 1 + len;
416     }
417     return size;
418 }
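/*
 * For reference, a sketch of the header layout written above (derived from
 * this function, not a normative description of the stream format):
 *
 *     8 bytes  be64   offset, with the RAM_SAVE_FLAG_* bits in its low bits
 *     1 byte          strlen(block->idstr)  (only when RAM_SAVE_FLAG_CONTINUE
 *     N bytes         block->idstr           is clear in 'offset')
 */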
419 
420 /* Reduce the amount of guest CPU execution to hopefully slow down memory writes.
421  * If the guest dirty memory rate is reduced below the rate at which we can
422  * transfer pages to the destination, then we should be able to complete
423  * migration. Some workloads dirty memory way too fast and will not effectively
424  * converge, even with auto-converge.
425  */
426 static void mig_throttle_guest_down(void)
427 {
428     MigrationState *s = migrate_get_current();
429     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
430     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
431 
432     /* We have not started throttling yet. Let's start it. */
433     if (!cpu_throttle_active()) {
434         cpu_throttle_set(pct_initial);
435     } else {
436         /* Throttling already on, just increase the rate */
437         cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
438     }
439 }
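/*
 * Illustrative example (parameter values are made up): with
 * cpu_throttle_initial = 20 and cpu_throttle_increment = 10, successive
 * calls throttle the guest at 20%, 30%, 40%, ... of its CPU time until the
 * dirty rate drops below the transfer rate (cpu_throttle_set() clamps the
 * percentage to a valid range internally).
 */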
440 
441 /* Update the xbzrle cache to reflect a page that's been sent as all 0.
442  * The important thing is that a stale (not-yet-0'd) page be replaced
443  * by the new data.
444  * As a bonus, if the page wasn't in the cache it gets added so that
445  * when a small write is made into the 0'd page it gets sent via XBZRLE.
446  */
447 static void xbzrle_cache_zero_page(ram_addr_t current_addr)
448 {
449     if (ram_bulk_stage || !migrate_use_xbzrle()) {
450         return;
451     }
452 
453     /* We don't care if this fails to allocate a new cache page
454      * as long as it updated an old one */
455     cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
456                  bitmap_sync_count);
457 }
458 
459 #define ENCODING_FLAG_XBZRLE 0x1
460 
461 /**
462  * save_xbzrle_page: compress and send current page
463  *
464  * Returns: 1 means that we wrote the page
465  *          0 means that page is identical to the one already sent
466  *          -1 means that xbzrle would be longer than normal (or the page was a cache miss)
467  *
468  * @f: QEMUFile where to send the data
469  * @current_data: holds the pointer to the page data; may be updated to point at the cached copy
470  * @current_addr: address of the page, used as the key into the XBZRLE cache
471  * @block: block that contains the page we want to send
472  * @offset: offset inside the block for the page
473  * @last_stage: if we are at the completion stage
474  * @bytes_transferred: increase it with the number of transferred bytes
475  */
476 static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
477                             ram_addr_t current_addr, RAMBlock *block,
478                             ram_addr_t offset, bool last_stage,
479                             uint64_t *bytes_transferred)
480 {
481     int encoded_len = 0, bytes_xbzrle;
482     uint8_t *prev_cached_page;
483 
484     if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
485         acct_info.xbzrle_cache_miss++;
486         if (!last_stage) {
487             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
488                              bitmap_sync_count) == -1) {
489                 return -1;
490             } else {
491                 /* update *current_data when the page has been
492                    inserted into cache */
493                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
494             }
495         }
496         return -1;
497     }
498 
499     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
500 
501     /* save current buffer into memory */
502     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
503 
504     /* XBZRLE encoding (if there is no overflow) */
505     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
506                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
507                                        TARGET_PAGE_SIZE);
508     if (encoded_len == 0) {
509         DPRINTF("Skipping unmodified page\n");
510         return 0;
511     } else if (encoded_len == -1) {
512         DPRINTF("Overflow\n");
513         acct_info.xbzrle_overflows++;
514         /* update data in the cache */
515         if (!last_stage) {
516             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
517             *current_data = prev_cached_page;
518         }
519         return -1;
520     }
521 
522     /* Update the cache so that it matches the data the destination now has */
523     if (!last_stage) {
524         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
525     }
526 
527     /* Send XBZRLE based compressed page */
528     bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
529     qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
530     qemu_put_be16(f, encoded_len);
531     qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
532     bytes_xbzrle += encoded_len + 1 + 2;
533     acct_info.xbzrle_pages++;
534     acct_info.xbzrle_bytes += bytes_xbzrle;
535     *bytes_transferred += bytes_xbzrle;
536 
537     return 1;
538 }
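/*
 * For reference, a sketch of what an XBZRLE page looks like on the wire as
 * written above (derived from this function, not a normative description):
 *
 *     save_page_header()  with RAM_SAVE_FLAG_XBZRLE set in the offset
 *     1 byte              ENCODING_FLAG_XBZRLE
 *     2 bytes   be16      encoded_len
 *     N bytes             the encoded buffer itself
 */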
539 
540 /* Called with rcu_read_lock() to protect migration_bitmap
541  * rb: The RAMBlock to search for dirty pages in
542  * start: Start address (typically so we can continue from previous page)
543  * ram_addr_abs: Pointer into which to store the address of the dirty page
544  *               within the global ram_addr space
545  *
546  * Returns: byte offset within memory region of the start of a dirty page
547  */
548 static inline
549 ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
550                                        ram_addr_t start,
551                                        ram_addr_t *ram_addr_abs)
552 {
553     unsigned long base = rb->offset >> TARGET_PAGE_BITS;
554     unsigned long nr = base + (start >> TARGET_PAGE_BITS);
555     uint64_t rb_size = rb->used_length;
556     unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
557     unsigned long *bitmap;
558 
559     unsigned long next;
560 
561     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
562     if (ram_bulk_stage && nr > base) {
563         next = nr + 1;
564     } else {
565         next = find_next_bit(bitmap, size, nr);
566     }
567 
568     *ram_addr_abs = next << TARGET_PAGE_BITS;
569     return (next - base) << TARGET_PAGE_BITS;
570 }
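/*
 * Illustrative example (addresses made up, assuming 4K target pages): for a
 * RAMBlock whose offset is 0x100000 (base == 0x100) with the next dirty bit
 * at global page index 0x105, this returns 5 * TARGET_PAGE_SIZE as the
 * offset within the block and stores 0x105000 in *ram_addr_abs.
 */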
571 
572 static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
573 {
574     bool ret;
575     int nr = addr >> TARGET_PAGE_BITS;
576     unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
577 
578     ret = test_and_clear_bit(nr, bitmap);
579 
580     if (ret) {
581         migration_dirty_pages--;
582     }
583     return ret;
584 }
585 
586 static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
587 {
588     unsigned long *bitmap;
589     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
590     migration_dirty_pages +=
591         cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
592 }
593 
594 /* Fix me: there are too many global variables used in the migration process. */
595 static int64_t start_time;
596 static int64_t bytes_xfer_prev;
597 static int64_t num_dirty_pages_period;
598 static uint64_t xbzrle_cache_miss_prev;
599 static uint64_t iterations_prev;
600 
601 static void migration_bitmap_sync_init(void)
602 {
603     start_time = 0;
604     bytes_xfer_prev = 0;
605     num_dirty_pages_period = 0;
606     xbzrle_cache_miss_prev = 0;
607     iterations_prev = 0;
608 }
609 
610 static void migration_bitmap_sync(void)
611 {
612     RAMBlock *block;
613     uint64_t num_dirty_pages_init = migration_dirty_pages;
614     MigrationState *s = migrate_get_current();
615     int64_t end_time;
616     int64_t bytes_xfer_now;
617 
618     bitmap_sync_count++;
619 
620     if (!bytes_xfer_prev) {
621         bytes_xfer_prev = ram_bytes_transferred();
622     }
623 
624     if (!start_time) {
625         start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
626     }
627 
628     trace_migration_bitmap_sync_start();
629     memory_global_dirty_log_sync();
630 
631     qemu_mutex_lock(&migration_bitmap_mutex);
632     rcu_read_lock();
633     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
634         migration_bitmap_sync_range(block->offset, block->used_length);
635     }
636     rcu_read_unlock();
637     qemu_mutex_unlock(&migration_bitmap_mutex);
638 
639     trace_migration_bitmap_sync_end(migration_dirty_pages
640                                     - num_dirty_pages_init);
641     num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
642     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
643 
644     /* more than 1 second = 1000 milliseconds */
645     if (end_time > start_time + 1000) {
646         if (migrate_auto_converge()) {
647             /* The following detection logic can be refined later. For now:
648                Check to see if the dirtied bytes are 50% more than the approx.
649                number of bytes that just got transferred since the last time we
650                were in this routine. If that happens twice, start or increase
651                throttling */
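            /* A worked example with made-up numbers: if roughly 100MB were
             * transferred since the last sync while more than ~50MB worth of
             * pages were dirtied over the same interval, dirty_rate_high_cnt
             * is bumped; once that has happened often enough, the guest gets
             * throttled below. */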
652             bytes_xfer_now = ram_bytes_transferred();
653 
654             if (s->dirty_pages_rate &&
655                (num_dirty_pages_period * TARGET_PAGE_SIZE >
656                    (bytes_xfer_now - bytes_xfer_prev)/2) &&
657                (dirty_rate_high_cnt++ >= 2)) {
658                     trace_migration_throttle();
659                     dirty_rate_high_cnt = 0;
660                     mig_throttle_guest_down();
661              }
662              bytes_xfer_prev = bytes_xfer_now;
663         }
664 
665         if (migrate_use_xbzrle()) {
666             if (iterations_prev != acct_info.iterations) {
667                 acct_info.xbzrle_cache_miss_rate =
668                    (double)(acct_info.xbzrle_cache_miss -
669                             xbzrle_cache_miss_prev) /
670                    (acct_info.iterations - iterations_prev);
671             }
672             iterations_prev = acct_info.iterations;
673             xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
674         }
675         s->dirty_pages_rate = num_dirty_pages_period * 1000
676             / (end_time - start_time);
677         s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
678         start_time = end_time;
679         num_dirty_pages_period = 0;
680     }
681     s->dirty_sync_count = bitmap_sync_count;
682     if (migrate_use_events()) {
683         qapi_event_send_migration_pass(bitmap_sync_count, NULL);
684     }
685 }
686 
687 /**
688  * save_zero_page: Send the zero page to the stream
689  *
690  * Returns: Number of pages written (-1 if the page was not a zero page).
691  *
692  * @f: QEMUFile where to send the data
693  * @block: block that contains the page we want to send
694  * @offset: offset inside the block for the page
695  * @p: pointer to the page
696  * @bytes_transferred: increase it with the number of transferred bytes
697  */
698 static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
699                           uint8_t *p, uint64_t *bytes_transferred)
700 {
701     int pages = -1;
702 
703     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
704         acct_info.dup_pages++;
705         *bytes_transferred += save_page_header(f, block,
706                                                offset | RAM_SAVE_FLAG_COMPRESS);
707         qemu_put_byte(f, 0);
708         *bytes_transferred += 1;
709         pages = 1;
710     }
711 
712     return pages;
713 }
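/*
 * For reference, a zero page as sent above is just a page header with
 * RAM_SAVE_FLAG_COMPRESS set, followed by a single byte holding the fill
 * value (0); the destination fills the whole target page with that value.
 */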
714 
715 /**
716  * ram_save_page: Send the given page to the stream
717  *
718  * Returns: Number of pages written.
719  *          < 0 - error
720  *          >=0 - Number of pages written - this might legally be 0
721  *                if xbzrle noticed the page was the same.
722  *
723  * @f: QEMUFile where to send the data
724  * @pss: data about the page we want to send (the block that contains the
725  *       page and the offset inside the block for the page)
726  * @last_stage: if we are at the completion stage
727  * @bytes_transferred: increase it with the number of transferred bytes
728  */
729 static int ram_save_page(QEMUFile *f, PageSearchStatus *pss,
730                          bool last_stage, uint64_t *bytes_transferred)
731 {
732     int pages = -1;
733     uint64_t bytes_xmit;
734     ram_addr_t current_addr;
735     uint8_t *p;
736     int ret;
737     bool send_async = true;
738     RAMBlock *block = pss->block;
739     ram_addr_t offset = pss->offset;
740 
741     p = block->host + offset;
742 
743     /* If in doubt, send the page as normal */
744     bytes_xmit = 0;
745     ret = ram_control_save_page(f, block->offset,
746                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
747     if (bytes_xmit) {
748         *bytes_transferred += bytes_xmit;
749         pages = 1;
750     }
751 
752     XBZRLE_cache_lock();
753 
754     current_addr = block->offset + offset;
755 
756     if (block == last_sent_block) {
757         offset |= RAM_SAVE_FLAG_CONTINUE;
758     }
759     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
760         if (ret != RAM_SAVE_CONTROL_DELAYED) {
761             if (bytes_xmit > 0) {
762                 acct_info.norm_pages++;
763             } else if (bytes_xmit == 0) {
764                 acct_info.dup_pages++;
765             }
766         }
767     } else {
768         pages = save_zero_page(f, block, offset, p, bytes_transferred);
769         if (pages > 0) {
770             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
771              * page would be stale
772              */
773             xbzrle_cache_zero_page(current_addr);
774         } else if (!ram_bulk_stage &&
775                    !migration_in_postcopy(migrate_get_current()) &&
776                    migrate_use_xbzrle()) {
777             pages = save_xbzrle_page(f, &p, current_addr, block,
778                                      offset, last_stage, bytes_transferred);
779             if (!last_stage) {
780                 /* Can't send this cached data async, since the cache page
781                  * might get updated before it gets to the wire
782                  */
783                 send_async = false;
784             }
785         }
786     }
787 
788     /* XBZRLE overflow or normal page */
789     if (pages == -1) {
790         *bytes_transferred += save_page_header(f, block,
791                                                offset | RAM_SAVE_FLAG_PAGE);
792         if (send_async) {
793             qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
794         } else {
795             qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
796         }
797         *bytes_transferred += TARGET_PAGE_SIZE;
798         pages = 1;
799         acct_info.norm_pages++;
800     }
801 
802     XBZRLE_cache_unlock();
803 
804     return pages;
805 }
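/*
 * Summary of the decision chain above: the page is first offered to
 * ram_control_save_page() (e.g. an RDMA transport); if that is not
 * supported, it is sent as a zero page when it is all zeroes; otherwise
 * XBZRLE is attempted when enabled and past the bulk stage; and failing
 * all of those the full page goes out with RAM_SAVE_FLAG_PAGE.
 */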
806 
807 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
808                                 ram_addr_t offset)
809 {
810     int bytes_sent, blen;
811     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
812 
813     bytes_sent = save_page_header(f, block, offset |
814                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
815     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
816                                      migrate_compress_level());
817     if (blen < 0) {
818         bytes_sent = 0;
819         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
820         error_report("compressed data failed!");
821     } else {
822         bytes_sent += blen;
823     }
824 
825     return bytes_sent;
826 }
827 
828 static uint64_t bytes_transferred;
829 
830 static void flush_compressed_data(QEMUFile *f)
831 {
832     int idx, len, thread_count;
833 
834     if (!migrate_use_compression()) {
835         return;
836     }
837     thread_count = migrate_compress_threads();
838 
839     qemu_mutex_lock(&comp_done_lock);
840     for (idx = 0; idx < thread_count; idx++) {
841         while (!comp_param[idx].done) {
842             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
843         }
844     }
845     qemu_mutex_unlock(&comp_done_lock);
846 
847     for (idx = 0; idx < thread_count; idx++) {
848         qemu_mutex_lock(&comp_param[idx].mutex);
849         if (!comp_param[idx].quit) {
850             len = qemu_put_qemu_file(f, comp_param[idx].file);
851             bytes_transferred += len;
852         }
853         qemu_mutex_unlock(&comp_param[idx].mutex);
854     }
855 }
856 
857 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
858                                        ram_addr_t offset)
859 {
860     param->block = block;
861     param->offset = offset;
862 }
863 
864 static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
865                                            ram_addr_t offset,
866                                            uint64_t *bytes_transferred)
867 {
868     int idx, thread_count, bytes_xmit = -1, pages = -1;
869 
870     thread_count = migrate_compress_threads();
871     qemu_mutex_lock(&comp_done_lock);
872     while (true) {
873         for (idx = 0; idx < thread_count; idx++) {
874             if (comp_param[idx].done) {
875                 comp_param[idx].done = false;
876                 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
877                 qemu_mutex_lock(&comp_param[idx].mutex);
878                 set_compress_params(&comp_param[idx], block, offset);
879                 qemu_cond_signal(&comp_param[idx].cond);
880                 qemu_mutex_unlock(&comp_param[idx].mutex);
881                 pages = 1;
882                 acct_info.norm_pages++;
883                 *bytes_transferred += bytes_xmit;
884                 break;
885             }
886         }
887         if (pages > 0) {
888             break;
889         } else {
890             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
891         }
892     }
893     qemu_mutex_unlock(&comp_done_lock);
894 
895     return pages;
896 }
897 
898 /**
899  * ram_save_compressed_page: compress the given page and send it to the stream
900  *
901  * Returns: Number of pages written.
902  *
903  * @f: QEMUFile where to send the data
904  * @pss: data about the page we want to send (the block that contains the
905  *       page and the offset inside the block for the page)
906  * @last_stage: if we are at the completion stage
907  * @bytes_transferred: increase it with the number of transferred bytes
908  */
909 static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss,
910                                     bool last_stage,
911                                     uint64_t *bytes_transferred)
912 {
913     int pages = -1;
914     uint64_t bytes_xmit = 0;
915     uint8_t *p;
916     int ret, blen;
917     RAMBlock *block = pss->block;
918     ram_addr_t offset = pss->offset;
919 
920     p = block->host + offset;
921 
922     ret = ram_control_save_page(f, block->offset,
923                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
924     if (bytes_xmit) {
925         *bytes_transferred += bytes_xmit;
926         pages = 1;
927     }
928     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
929         if (ret != RAM_SAVE_CONTROL_DELAYED) {
930             if (bytes_xmit > 0) {
931                 acct_info.norm_pages++;
932             } else if (bytes_xmit == 0) {
933                 acct_info.dup_pages++;
934             }
935         }
936     } else {
937         /* When starting a new block, the first page of the block should
938          * be sent out before other pages in the same block, and all the
939          * pages in the last block should have been sent out. Keeping this
940          * order is important, because the 'cont' flag is used to avoid
941          * resending the block name.
942          */
943         if (block != last_sent_block) {
944             flush_compressed_data(f);
945             pages = save_zero_page(f, block, offset, p, bytes_transferred);
946             if (pages == -1) {
947                 /* Make sure the first page is sent out before other pages */
948                 bytes_xmit = save_page_header(f, block, offset |
949                                               RAM_SAVE_FLAG_COMPRESS_PAGE);
950                 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
951                                                  migrate_compress_level());
952                 if (blen > 0) {
953                     *bytes_transferred += bytes_xmit + blen;
954                     acct_info.norm_pages++;
955                     pages = 1;
956                 } else {
957                     qemu_file_set_error(f, blen);
958                     error_report("compressed data failed!");
959                 }
960             }
961         } else {
962             offset |= RAM_SAVE_FLAG_CONTINUE;
963             pages = save_zero_page(f, block, offset, p, bytes_transferred);
964             if (pages == -1) {
965                 pages = compress_page_with_multi_thread(f, block, offset,
966                                                         bytes_transferred);
967             }
968         }
969     }
970 
971     return pages;
972 }
973 
974 /*
975  * Find the next dirty page and update any state associated with
976  * the search process.
977  *
978  * Returns: True if a page is found
979  *
980  * @f: Current migration stream.
981  * @pss: Data about the state of the current dirty page scan.
982  * @again: Set to false if the search has scanned the whole of RAM
983  * @ram_addr_abs: Pointer into which to store the address of the dirty page
984  *               within the global ram_addr space
985  */
986 static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
987                              bool *again, ram_addr_t *ram_addr_abs)
988 {
989     pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
990                                               ram_addr_abs);
991     if (pss->complete_round && pss->block == last_seen_block &&
992         pss->offset >= last_offset) {
993         /*
994          * We've been once around the RAM and haven't found anything.
995          * Give up.
996          */
997         *again = false;
998         return false;
999     }
1000     if (pss->offset >= pss->block->used_length) {
1001         /* Didn't find anything in this RAM Block */
1002         pss->offset = 0;
1003         pss->block = QLIST_NEXT_RCU(pss->block, next);
1004         if (!pss->block) {
1005             /* Hit the end of the list */
1006             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1007             /* Flag that we've looped */
1008             pss->complete_round = true;
1009             ram_bulk_stage = false;
1010             if (migrate_use_xbzrle()) {
1011                 /* If xbzrle is on, stop using the data compression at this
1012                  * point. In theory, xbzrle can do better than compression.
1013                  */
1014                 flush_compressed_data(f);
1015                 compression_switch = false;
1016             }
1017         }
1018         /* Didn't find anything this time, but try again on the new block */
1019         *again = true;
1020         return false;
1021     } else {
1022         /* Can go around again, but... */
1023         *again = true;
1024         /* We've found something so probably don't need to */
1025         return true;
1026     }
1027 }
1028 
1029 /*
1030  * Helper for 'get_queued_page' - gets a page off the queue
1031  *      ms:      MigrationState in
1032  * *offset:      Used to return the offset within the RAMBlock
1033  * ram_addr_abs: global offset in the dirty/sent bitmaps
1034  *
1035  * Returns:      block (or NULL if none available)
1036  */
1037 static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1038                               ram_addr_t *ram_addr_abs)
1039 {
1040     RAMBlock *block = NULL;
1041 
1042     qemu_mutex_lock(&ms->src_page_req_mutex);
1043     if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1044         struct MigrationSrcPageRequest *entry =
1045                                 QSIMPLEQ_FIRST(&ms->src_page_requests);
1046         block = entry->rb;
1047         *offset = entry->offset;
1048         *ram_addr_abs = (entry->offset + entry->rb->offset) &
1049                         TARGET_PAGE_MASK;
1050 
1051         if (entry->len > TARGET_PAGE_SIZE) {
1052             entry->len -= TARGET_PAGE_SIZE;
1053             entry->offset += TARGET_PAGE_SIZE;
1054         } else {
1055             memory_region_unref(block->mr);
1056             QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1057             g_free(entry);
1058         }
1059     }
1060     qemu_mutex_unlock(&ms->src_page_req_mutex);
1061 
1062     return block;
1063 }
1064 
1065 /*
1066  * Unqueue a page from the queue fed by postcopy page requests; skips pages
1067  * that are already sent (!dirty)
1068  *
1069  *      ms:      MigrationState in
1070  *     pss:      PageSearchStatus structure updated with found block/offset
1071  * ram_addr_abs: global offset in the dirty/sent bitmaps
1072  *
1073  * Returns:      true if a queued page is found
1074  */
1075 static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
1076                             ram_addr_t *ram_addr_abs)
1077 {
1078     RAMBlock  *block;
1079     ram_addr_t offset;
1080     bool dirty;
1081 
1082     do {
1083         block = unqueue_page(ms, &offset, ram_addr_abs);
1084         /*
1085          * We're sending this page, and since it's postcopy nothing else
1086          * will dirty it, and we must make sure it doesn't get sent again
1087          * even if this queue request was received after the background
1088          * search already sent it.
1089          */
1090         if (block) {
1091             unsigned long *bitmap;
1092             bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1093             dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1094             if (!dirty) {
1095                 trace_get_queued_page_not_dirty(
1096                     block->idstr, (uint64_t)offset,
1097                     (uint64_t)*ram_addr_abs,
1098                     test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1099                          atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1100             } else {
1101                 trace_get_queued_page(block->idstr,
1102                                       (uint64_t)offset,
1103                                       (uint64_t)*ram_addr_abs);
1104             }
1105         }
1106 
1107     } while (block && !dirty);
1108 
1109     if (block) {
1110         /*
1111          * As soon as we start servicing pages out of order, then we have
1112          * to kill the bulk stage, since the bulk stage assumes
1113          * in (migration_bitmap_find_and_reset_dirty) that every page is
1114          * dirty; that's no longer true.
1115          */
1116         ram_bulk_stage = false;
1117 
1118         /*
1119          * We want the background search to continue from the queued page
1120          * since the guest is likely to want other pages near to the page
1121          * it just requested.
1122          */
1123         pss->block = block;
1124         pss->offset = offset;
1125     }
1126 
1127     return !!block;
1128 }
1129 
1130 /**
1131  * flush_page_queue: Flush any remaining pages in the ram request queue;
1132  *    it should be empty at the end anyway, but in error cases there may be
1133  *    some left.
1134  *
1135  * ms: MigrationState
1136  */
1137 void flush_page_queue(MigrationState *ms)
1138 {
1139     struct MigrationSrcPageRequest *mspr, *next_mspr;
1140     /* This queue should generally be empty - but in the case of a failed
1141      * migration it might have some droppings left in it.
1142      */
1143     rcu_read_lock();
1144     QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1145         memory_region_unref(mspr->rb->mr);
1146         QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1147         g_free(mspr);
1148     }
1149     rcu_read_unlock();
1150 }
1151 
1152 /**
1153  * Queue the pages for transmission, e.g. a request from postcopy destination
1154  *   ms: MigrationState in which the queue is held
1155  *   rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
1156  *   start: Offset from the start of the RAMBlock
1157  *   len: Length (in bytes) to send
1158  *   Return: 0 on success
1159  */
1160 int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1161                          ram_addr_t start, ram_addr_t len)
1162 {
1163     RAMBlock *ramblock;
1164 
1165     ms->postcopy_requests++;
1166     rcu_read_lock();
1167     if (!rbname) {
1168         /* Reuse last RAMBlock */
1169         ramblock = ms->last_req_rb;
1170 
1171         if (!ramblock) {
1172             /*
1173              * Shouldn't happen, we can't reuse the last RAMBlock if
1174              * it's the 1st request.
1175              */
1176             error_report("ram_save_queue_pages no previous block");
1177             goto err;
1178         }
1179     } else {
1180         ramblock = qemu_ram_block_by_name(rbname);
1181 
1182         if (!ramblock) {
1183             /* We shouldn't be asked for a non-existent RAMBlock */
1184             error_report("ram_save_queue_pages no block '%s'", rbname);
1185             goto err;
1186         }
1187         ms->last_req_rb = ramblock;
1188     }
1189     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1190     if (start+len > ramblock->used_length) {
1191         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1192                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1193                      __func__, start, len, ramblock->used_length);
1194         goto err;
1195     }
1196 
1197     struct MigrationSrcPageRequest *new_entry =
1198         g_malloc0(sizeof(struct MigrationSrcPageRequest));
1199     new_entry->rb = ramblock;
1200     new_entry->offset = start;
1201     new_entry->len = len;
1202 
1203     memory_region_ref(ramblock->mr);
1204     qemu_mutex_lock(&ms->src_page_req_mutex);
1205     QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1206     qemu_mutex_unlock(&ms->src_page_req_mutex);
1207     rcu_read_unlock();
1208 
1209     return 0;
1210 
1211 err:
1212     rcu_read_unlock();
1213     return -1;
1214 }
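/*
 * Illustrative use (hypothetical block name and offset): when the
 * destination faults on a not-yet-received page during postcopy, the
 * resulting request ends up here as something like
 *
 *     ram_save_queue_pages(ms, "pc.ram", 0x20000, TARGET_PAGE_SIZE);
 *
 * and is later consumed by get_queued_page() in the migration thread.
 */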
1215 
1216 /**
1217  * ram_save_target_page: Save one target page
1218  *
1219  *
1220  * @f: QEMUFile where to send the data
1221  * @pss: data about the page we want to send (the block that contains the
1222  *       page and the offset inside the block for the page)
1223  * @last_stage: if we are at the completion stage
1224  * @bytes_transferred: increase it with the number of transferred bytes
1225  * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1226  *
1227  * Returns: Number of pages written.
1228  */
1229 static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
1230                                 PageSearchStatus *pss,
1231                                 bool last_stage,
1232                                 uint64_t *bytes_transferred,
1233                                 ram_addr_t dirty_ram_abs)
1234 {
1235     int res = 0;
1236 
1237     /* Check if the page is dirty and, if it is, send it */
1238     if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1239         unsigned long *unsentmap;
1240         if (compression_switch && migrate_use_compression()) {
1241             res = ram_save_compressed_page(f, pss,
1242                                            last_stage,
1243                                            bytes_transferred);
1244         } else {
1245             res = ram_save_page(f, pss, last_stage,
1246                                 bytes_transferred);
1247         }
1248 
1249         if (res < 0) {
1250             return res;
1251         }
1252         unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1253         if (unsentmap) {
1254             clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1255         }
1256         /* Only update last_sent_block if a block was actually sent; xbzrle
1257          * might have decided the page was identical so didn't bother writing
1258          * to the stream.
1259          */
1260         if (res > 0) {
1261             last_sent_block = pss->block;
1262         }
1263     }
1264 
1265     return res;
1266 }
1267 
1268 /**
1269  * ram_save_host_page: Starting at pss->offset, send pages up to the end
1270  *                     of the current host page.  It's valid for the initial
1271  *                     offset to point into the middle of a host page,
1272  *                     in which case the remainder of the host page is sent.
1273  *                     Only dirty target pages are sent.
1274  *
1275  * Returns: Number of pages written.
1276  *
1277  * @f: QEMUFile where to send the data
1278  * @pss: data about the page we want to send (the block that contains the
1279  *       page and the offset inside the block); the offset is updated to the
1280  *       last target page sent
1281  * @last_stage: if we are at the completion stage
1282  * @bytes_transferred: increase it with the number of transferred bytes
1283  * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1284  */
1285 static int ram_save_host_page(MigrationState *ms, QEMUFile *f,
1286                               PageSearchStatus *pss,
1287                               bool last_stage,
1288                               uint64_t *bytes_transferred,
1289                               ram_addr_t dirty_ram_abs)
1290 {
1291     int tmppages, pages = 0;
1292     do {
1293         tmppages = ram_save_target_page(ms, f, pss, last_stage,
1294                                         bytes_transferred, dirty_ram_abs);
1295         if (tmppages < 0) {
1296             return tmppages;
1297         }
1298 
1299         pages += tmppages;
1300         pss->offset += TARGET_PAGE_SIZE;
1301         dirty_ram_abs += TARGET_PAGE_SIZE;
1302     } while (pss->offset & (qemu_host_page_size - 1));
1303 
1304     /* The offset we leave with is the last one we looked at */
1305     pss->offset -= TARGET_PAGE_SIZE;
1306     return pages;
1307 }
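/*
 * Example: on a host with 64K pages migrating a guest with 4K target pages,
 * a single call here sends up to 16 target pages, which matters for
 * postcopy where pages are placed a whole host page at a time.
 */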
1308 
1309 /**
1310  * ram_find_and_save_block: Finds a dirty page and sends it to f
1311  *
1312  * Called within an RCU critical section.
1313  *
1314  * Returns:  The number of pages written
1315  *           0 means no dirty pages
1316  *
1317  * @f: QEMUFile where to send the data
1318  * @last_stage: if we are at the completion stage
1319  * @bytes_transferred: increase it with the number of transferred bytes
1320  *
1321  * On systems where host-page-size > target-page-size it will send all the
1322  * pages in a host page that are dirty.
1323  */
1324 
1325 static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
1326                                    uint64_t *bytes_transferred)
1327 {
1328     PageSearchStatus pss;
1329     MigrationState *ms = migrate_get_current();
1330     int pages = 0;
1331     bool again, found;
1332     ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1333                                  ram_addr_t space */
1334 
1335     pss.block = last_seen_block;
1336     pss.offset = last_offset;
1337     pss.complete_round = false;
1338 
1339     if (!pss.block) {
1340         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1341     }
1342 
1343     do {
1344         again = true;
1345         found = get_queued_page(ms, &pss, &dirty_ram_abs);
1346 
1347         if (!found) {
1348             /* priority queue empty, so just search for something dirty */
1349             found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
1350         }
1351 
1352         if (found) {
1353             pages = ram_save_host_page(ms, f, &pss,
1354                                        last_stage, bytes_transferred,
1355                                        dirty_ram_abs);
1356         }
1357     } while (!pages && again);
1358 
1359     last_seen_block = pss.block;
1360     last_offset = pss.offset;
1361 
1362     return pages;
1363 }
1364 
1365 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1366 {
1367     uint64_t pages = size / TARGET_PAGE_SIZE;
1368     if (zero) {
1369         acct_info.dup_pages += pages;
1370     } else {
1371         acct_info.norm_pages += pages;
1372         bytes_transferred += size;
1373         qemu_update_position(f, size);
1374     }
1375 }
1376 
1377 static ram_addr_t ram_save_remaining(void)
1378 {
1379     return migration_dirty_pages;
1380 }
1381 
1382 uint64_t ram_bytes_remaining(void)
1383 {
1384     return ram_save_remaining() * TARGET_PAGE_SIZE;
1385 }
1386 
1387 uint64_t ram_bytes_transferred(void)
1388 {
1389     return bytes_transferred;
1390 }
1391 
1392 uint64_t ram_bytes_total(void)
1393 {
1394     RAMBlock *block;
1395     uint64_t total = 0;
1396 
1397     rcu_read_lock();
1398     QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1399         total += block->used_length;
1400     rcu_read_unlock();
1401     return total;
1402 }
1403 
1404 void free_xbzrle_decoded_buf(void)
1405 {
1406     g_free(xbzrle_decoded_buf);
1407     xbzrle_decoded_buf = NULL;
1408 }
1409 
1410 static void migration_bitmap_free(struct BitmapRcu *bmap)
1411 {
1412     g_free(bmap->bmap);
1413     g_free(bmap->unsentmap);
1414     g_free(bmap);
1415 }
1416 
1417 static void ram_migration_cleanup(void *opaque)
1418 {
1419     /* The caller must hold the iothread lock or be in a bottom half, so
1420      * there is no writing race against this migration_bitmap
1421      */
1422     struct BitmapRcu *bitmap = migration_bitmap_rcu;
1423     atomic_rcu_set(&migration_bitmap_rcu, NULL);
1424     if (bitmap) {
1425         memory_global_dirty_log_stop();
1426         call_rcu(bitmap, migration_bitmap_free, rcu);
1427     }
1428 
1429     XBZRLE_cache_lock();
1430     if (XBZRLE.cache) {
1431         cache_fini(XBZRLE.cache);
1432         g_free(XBZRLE.encoded_buf);
1433         g_free(XBZRLE.current_buf);
1434         g_free(ZERO_TARGET_PAGE);
1435         XBZRLE.cache = NULL;
1436         XBZRLE.encoded_buf = NULL;
1437         XBZRLE.current_buf = NULL;
1438     }
1439     XBZRLE_cache_unlock();
1440 }
1441 
1442 static void reset_ram_globals(void)
1443 {
1444     last_seen_block = NULL;
1445     last_sent_block = NULL;
1446     last_offset = 0;
1447     last_version = ram_list.version;
1448     ram_bulk_stage = true;
1449 }
1450 
1451 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1452 
1453 void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1454 {
1455     /* called in the QEMU main thread, so there is
1456      * no writing race against this migration_bitmap
1457      */
1458     if (migration_bitmap_rcu) {
1459         struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1460         bitmap = g_new(struct BitmapRcu, 1);
1461         bitmap->bmap = bitmap_new(new);
1462 
1463         /* Prevent bits in migration_bitmap from being set by
1464          * migration_bitmap_sync_range() at the same time.
1465          * It is safe for migration if a migration_bitmap bit is cleared
1466          * at the same time.
1467          */
1468         qemu_mutex_lock(&migration_bitmap_mutex);
1469         bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1470         bitmap_set(bitmap->bmap, old, new - old);
1471 
1472         /* We don't have a way to safely extend the unsentmap
1473          * with RCU; so mark it as missing, and entry to postcopy
1474          * will fail.
1475          */
1476         bitmap->unsentmap = NULL;
1477 
1478         atomic_rcu_set(&migration_bitmap_rcu, bitmap);
1479         qemu_mutex_unlock(&migration_bitmap_mutex);
1480         migration_dirty_pages += new - old;
1481         call_rcu(old_bitmap, migration_bitmap_free, rcu);
1482     }
1483 }
1484 
1485 /*
1486  * 'expected' is the value you expect the bitmap mostly to be full
1487  * of; it won't bother printing lines that are all this value.
1488  * If 'todump' is null the migration bitmap is dumped.
1489  */
1490 void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1491 {
1492     int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1493 
1494     int64_t cur;
1495     int64_t linelen = 128;
1496     char linebuf[129];
1497 
1498     if (!todump) {
1499         todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1500     }
1501 
1502     for (cur = 0; cur < ram_pages; cur += linelen) {
1503         int64_t curb;
1504         bool found = false;
1505         /*
1506          * Last line; catch the case where the line length
1507          * is longer than remaining ram
1508          */
1509         if (cur + linelen > ram_pages) {
1510             linelen = ram_pages - cur;
1511         }
1512         for (curb = 0; curb < linelen; curb++) {
1513             bool thisbit = test_bit(cur + curb, todump);
1514             linebuf[curb] = thisbit ? '1' : '.';
1515             found = found || (thisbit != expected);
1516         }
1517         if (found) {
1518             linebuf[curb] = '\0';
1519             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1520         }
1521     }
1522 }
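/*
 * Sample output (values are made up): each printed line covers up to 128
 * pages, '1' for a set bit and '.' for a clear one, e.g.
 *
 *     0x00000080 : ...........111111111111............1...
 *
 * Lines consisting entirely of the 'expected' value are skipped.
 */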
1523 
1524 /* **** functions for postcopy ***** */
1525 
1526 /*
1527  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1528  * Note: At this point the 'unsentmap' is the processed bitmap combined
1529  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1530  * start,length: Indexes into the bitmap for the first bit
1531  *            representing the named block and length in target-pages
1532  */
1533 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1534                                         PostcopyDiscardState *pds,
1535                                         unsigned long start,
1536                                         unsigned long length)
1537 {
1538     unsigned long end = start + length; /* one after the end */
1539     unsigned long current;
1540     unsigned long *unsentmap;
1541 
1542     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1543     for (current = start; current < end; ) {
1544         unsigned long one = find_next_bit(unsentmap, end, current);
1545 
1546         if (one <= end) {
1547             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1548             unsigned long discard_length;
1549 
1550             if (zero >= end) {
1551                 discard_length = end - one;
1552             } else {
1553                 discard_length = zero - one;
1554             }
1555             if (discard_length) {
1556                 postcopy_discard_send_range(ms, pds, one, discard_length);
1557             }
1558             current = one + discard_length;
1559         } else {
1560             current = one;
1561         }
1562     }
1563 
1564     return 0;
1565 }
1566 
1567 /*
1568  * Utility for the outgoing postcopy code.
1569  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1570  *   passing it bitmap indexes and name.
1571  * Returns: 0 on success
1572  * (qemu_ram_foreach_block ends up passing unscaled lengths
1573  *  which would mean postcopy code would have to deal with target page)
1574  */
1575 static int postcopy_each_ram_send_discard(MigrationState *ms)
1576 {
1577     struct RAMBlock *block;
1578     int ret;
1579 
1580     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1581         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1582         PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1583                                                                first,
1584                                                                block->idstr);
1585 
1586         /*
1587          * Postcopy sends chunks of bitmap over the wire, but it
1588          * just needs indexes at this point, which avoids it having
1589          * target-page specific code.
1590          */
1591         ret = postcopy_send_discard_bm_ram(ms, pds, first,
1592                                     block->used_length >> TARGET_PAGE_BITS);
1593         postcopy_discard_send_finish(ms, pds);
1594         if (ret) {
1595             return ret;
1596         }
1597     }
1598 
1599     return 0;
1600 }
1601 
1602 /*
1603  * Helper for postcopy_chunk_hostpages; it's called twice to clean up
1604  *   the two bitmaps, which are similar but one is inverted.
1605  *
1606  * We search for runs of target pages that don't start or end on a
1607  * host page boundary:
1608  * unsent_pass=true: Cleans up partially unsent host pages by searching
1609  *                 the unsentmap
1610  * unsent_pass=false: Cleans up partially dirty host pages by searching
1611  *                 the main migration bitmap
1612  *
1613  */
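/*
 * Worked example (illustrative, assuming 4K target pages, 16K host pages,
 * i.e. host_ratio == 4, and a block starting at bit 0): a dirty run covering
 * target pages 5..9 neither starts nor ends on a host page boundary, so both
 * host pages 1 (target pages 4..7) and 2 (target pages 8..11) get fixed up:
 * a discard is sent where needed and every target page in them is re-marked
 * as dirty and unsent.
 */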
1614 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1615                                           RAMBlock *block,
1616                                           PostcopyDiscardState *pds)
1617 {
1618     unsigned long *bitmap;
1619     unsigned long *unsentmap;
1620     unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE;
1621     unsigned long first = block->offset >> TARGET_PAGE_BITS;
1622     unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1623     unsigned long last = first + (len - 1);
1624     unsigned long run_start;
1625 
1626     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1627     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1628 
1629     if (unsent_pass) {
1630         /* Find a sent page */
1631         run_start = find_next_zero_bit(unsentmap, last + 1, first);
1632     } else {
1633         /* Find a dirty page */
1634         run_start = find_next_bit(bitmap, last + 1, first);
1635     }
1636 
1637     while (run_start <= last) {
1638         bool do_fixup = false;
1639         unsigned long fixup_start_addr;
1640         unsigned long host_offset;
1641 
1642         /*
1643          * If the start of this run of pages is in the middle of a host
1644          * page, then we need to fixup this host page.
1645          */
1646         host_offset = run_start % host_ratio;
1647         if (host_offset) {
1648             do_fixup = true;
1649             run_start -= host_offset;
1650             fixup_start_addr = run_start;
1651             /* For the next pass */
1652             run_start = run_start + host_ratio;
1653         } else {
1654             /* Find the end of this run */
1655             unsigned long run_end;
1656             if (unsent_pass) {
1657                 run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1658             } else {
1659                 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1660             }
1661             /*
1662              * If the end isn't at the start of a host page, then the
1663              * run doesn't finish at the end of a host page
1664              * and we need to discard.
1665              */
1666             host_offset = run_end % host_ratio;
1667             if (host_offset) {
1668                 do_fixup = true;
1669                 fixup_start_addr = run_end - host_offset;
1670                 /*
1671                  * This host page has gone, the next loop iteration starts
1672                  * from after the fixup
1673                  */
1674                 run_start = fixup_start_addr + host_ratio;
1675             } else {
1676                 /*
1677                  * No discards on this iteration, next loop starts from
1678                  * next sent/dirty page
1679                  */
1680                 run_start = run_end + 1;
1681             }
1682         }
1683 
1684         if (do_fixup) {
1685             unsigned long page;
1686 
1687             /* Tell the destination to discard this page */
1688             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1689                 /* For the unsent_pass we:
1690                  *     discard partially sent pages
1691                  * For the !unsent_pass (dirty) we:
1692                  *     discard partially dirty pages that were sent
1693                  *     (any partially sent pages were already discarded
1694                  *     by the previous unsent_pass)
1695                  */
1696                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1697                                             host_ratio);
1698             }
1699 
1700             /* Clean up the bitmap */
1701             for (page = fixup_start_addr;
1702                  page < fixup_start_addr + host_ratio; page++) {
1703                 /* All pages in this host page are now not sent */
1704                 set_bit(page, unsentmap);
1705 
1706                 /*
1707                  * Remark them as dirty, updating the count for any pages
1708                  * that weren't previously dirty.
1709                  */
1710                 migration_dirty_pages += !test_and_set_bit(page, bitmap);
1711             }
1712         }
1713 
1714         if (unsent_pass) {
1715             /* Find the next sent page for the next iteration */
1716             run_start = find_next_zero_bit(unsentmap, last + 1,
1717                                            run_start);
1718         } else {
1719             /* Find the next dirty page for the next iteration */
1720             run_start = find_next_bit(bitmap, last + 1, run_start);
1721         }
1722     }
1723 }
1724 
1725 /*
1726  * Utility for the outgoing postcopy code.
1727  *
1728  * Discard any partially sent host-page size chunks, mark any partially
1729  * dirty host-page size chunks as all dirty.
1730  *
1731  * Returns: 0 on success
1732  */
1733 static int postcopy_chunk_hostpages(MigrationState *ms)
1734 {
1735     struct RAMBlock *block;
1736 
1737     if (qemu_host_page_size == TARGET_PAGE_SIZE) {
1738         /* Easy case - TPS==HPS - nothing to be done */
1739         return 0;
1740     }
1741 
1742     /* Easiest way to make sure we don't resume in the middle of a host-page */
1743     last_seen_block = NULL;
1744     last_sent_block = NULL;
1745     last_offset     = 0;
1746 
1747     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1748         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1749 
1750         PostcopyDiscardState *pds =
1751                          postcopy_discard_send_init(ms, first, block->idstr);
1752 
1753         /* First pass: Discard all partially sent host pages */
1754         postcopy_chunk_hostpages_pass(ms, true, block, pds);
1755         /*
1756          * Second pass: Ensure that all partially dirty host pages are made
1757          * fully dirty.
1758          */
1759         postcopy_chunk_hostpages_pass(ms, false, block, pds);
1760 
1761         postcopy_discard_send_finish(ms, pds);
1762     } /* ram_list loop */
1763 
1764     return 0;
1765 }
1766 
1767 /*
1768  * Transmit the set of pages to be discarded after precopy to the target;
1769  * these are pages that:
1770  *     a) Have been previously transmitted but are now dirty again
1771  *     b) Have never been transmitted; this ensures that any pages on the
1772  *        destination that have been mapped by background tasks get
1773  *        discarded (transparent huge pages are the specific concern)
1774  * Hopefully this set is pretty sparse.
1775  */
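/*
 * The sequence below is: a final bitmap sync, host-page sized fixups
 * (postcopy_chunk_hostpages), OR-ing the dirty bitmap into the unsentmap,
 * and finally walking the RAMBlocks to send the discard ranges
 * (postcopy_each_ram_send_discard).
 */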
1776 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1777 {
1778     int ret;
1779     unsigned long *bitmap, *unsentmap;
1780 
1781     rcu_read_lock();
1782 
1783     /* This should be our last sync; the source is now paused */
1784     migration_bitmap_sync();
1785 
1786     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1787     if (!unsentmap) {
1788         /* We don't have a safe way to resize the unsentmap, so
1789          * if the bitmap was resized it will be NULL at this
1790          * point.
1791          */
1792         error_report("migration ram resized during precopy phase");
1793         rcu_read_unlock();
1794         return -EINVAL;
1795     }
1796 
1797     /* Deal with TPS != HPS */
1798     ret = postcopy_chunk_hostpages(ms);
1799     if (ret) {
1800         rcu_read_unlock();
1801         return ret;
1802     }
1803 
1804     /*
1805      * Update the unsentmap to be unsentmap = unsentmap | dirty
1806      */
1807     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1808     bitmap_or(unsentmap, unsentmap, bitmap,
1809                last_ram_offset() >> TARGET_PAGE_BITS);
1810 
1812     trace_ram_postcopy_send_discard_bitmap();
1813 #ifdef DEBUG_POSTCOPY
1814     ram_debug_dump_bitmap(unsentmap, true);
1815 #endif
1816 
1817     ret = postcopy_each_ram_send_discard(ms);
1818     rcu_read_unlock();
1819 
1820     return ret;
1821 }
1822 
1823 /*
1824  * At the start of the postcopy phase of migration, any now-dirty
1825  * precopied pages are discarded.
1826  *
1827  * start, length describe a byte address range within the RAMBlock
1828  *
1829  * Returns 0 on success.
1830  */
1831 int ram_discard_range(MigrationIncomingState *mis,
1832                       const char *block_name,
1833                       uint64_t start, size_t length)
1834 {
1835     int ret = -1;
1836 
1837     rcu_read_lock();
1838     RAMBlock *rb = qemu_ram_block_by_name(block_name);
1839 
1840     if (!rb) {
1841         error_report("ram_discard_range: Failed to find block '%s'",
1842                      block_name);
1843         goto err;
1844     }
1845 
1846     uint8_t *host_startaddr = rb->host + start;
1847 
1848     if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) {
1849         error_report("ram_discard_range: Unaligned start address: %p",
1850                      host_startaddr);
1851         goto err;
1852     }
1853 
1854     if ((start + length) <= rb->used_length) {
1855         uint8_t *host_endaddr = host_startaddr + length;
1856         if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) {
1857             error_report("ram_discard_range: Unaligned end address: %p",
1858                          host_endaddr);
1859             goto err;
1860         }
1861         ret = postcopy_ram_discard_range(mis, host_startaddr, length);
1862     } else {
1863         error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
1864                      "/%zx/" RAM_ADDR_FMT")",
1865                      block_name, start, length, rb->used_length);
1866     }
1867 
1868 err:
1869     rcu_read_unlock();
1870 
1871     return ret;
1872 }
1873 
1874 
1875 /* Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
1876  * long-running RCU critical section.  When RCU reclaims in the code
1877  * start to become numerous it will be necessary to reduce the
1878  * granularity of these critical sections.
1879  */
1880 
1881 static int ram_save_setup(QEMUFile *f, void *opaque)
1882 {
1883     RAMBlock *block;
1884     int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1885 
1886     dirty_rate_high_cnt = 0;
1887     bitmap_sync_count = 0;
1888     migration_bitmap_sync_init();
1889     qemu_mutex_init(&migration_bitmap_mutex);
1890 
1891     if (migrate_use_xbzrle()) {
1892         XBZRLE_cache_lock();
1893         ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
1894         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1895                                   TARGET_PAGE_SIZE,
1896                                   TARGET_PAGE_SIZE);
1897         if (!XBZRLE.cache) {
1898             XBZRLE_cache_unlock();
1899             error_report("Error creating cache");
1900             return -1;
1901         }
1902         XBZRLE_cache_unlock();
1903 
1904         /* We prefer not to abort if there is no memory */
1905         XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1906         if (!XBZRLE.encoded_buf) {
1907             error_report("Error allocating encoded_buf");
1908             return -1;
1909         }
1910 
1911         XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1912         if (!XBZRLE.current_buf) {
1913             error_report("Error allocating current_buf");
1914             g_free(XBZRLE.encoded_buf);
1915             XBZRLE.encoded_buf = NULL;
1916             return -1;
1917         }
1918 
1919         acct_clear();
1920     }
1921 
1922     /* For memory_global_dirty_log_start below.  */
1923     qemu_mutex_lock_iothread();
1924 
1925     qemu_mutex_lock_ramlist();
1926     rcu_read_lock();
1927     bytes_transferred = 0;
1928     reset_ram_globals();
1929 
1930     ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1931     migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
1932     migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
1933     bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
1934 
1935     if (migrate_postcopy_ram()) {
1936         migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
1937         bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
1938     }
1939 
1940     /*
1941      * Count the total number of pages used by ram blocks not including any
1942      * gaps due to alignment or unplugs.
1943      */
1944     migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1945 
1946     memory_global_dirty_log_start();
1947     migration_bitmap_sync();
1948     qemu_mutex_unlock_ramlist();
1949     qemu_mutex_unlock_iothread();
1950 
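    /*
     * Header written below: a be64 carrying the total RAM size with
     * RAM_SAVE_FLAG_MEM_SIZE set in its flag bits, then, for each RAMBlock,
     * a one-byte idstr length, the idstr itself and a be64 used_length;
     * the section ends with a be64 RAM_SAVE_FLAG_EOS marker.
     */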
1951     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1952 
1953     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1954         qemu_put_byte(f, strlen(block->idstr));
1955         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1956         qemu_put_be64(f, block->used_length);
1957     }
1958 
1959     rcu_read_unlock();
1960 
1961     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1962     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1963 
1964     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1965 
1966     return 0;
1967 }
1968 
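/*
 * One iteration of the RAM save loop: send dirty pages until the bandwidth
 * limit trips or there are no more dirty pages, bailing out early if more
 * than MAX_WAIT milliseconds have elapsed.  Returns the number of pages
 * sent, or a negative errno if the stream reported an error.
 */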
1969 static int ram_save_iterate(QEMUFile *f, void *opaque)
1970 {
1971     int ret;
1972     int i;
1973     int64_t t0;
1974     int pages_sent = 0;
1975 
1976     rcu_read_lock();
1977     if (ram_list.version != last_version) {
1978         reset_ram_globals();
1979     }
1980 
1981     /* Read version before ram_list.blocks */
1982     smp_rmb();
1983 
1984     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1985 
1986     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1987     i = 0;
1988     while ((ret = qemu_file_rate_limit(f)) == 0) {
1989         int pages;
1990 
1991         pages = ram_find_and_save_block(f, false, &bytes_transferred);
1992         /* no more pages to send */
1993         if (pages == 0) {
1994             break;
1995         }
1996         pages_sent += pages;
1997         acct_info.iterations++;
1998 
1999         /* We want to check in the 1st loop, just in case it was the 1st
2000            time and we had to sync the dirty bitmap.
2001            qemu_clock_get_ns() is a bit expensive, so we only check every
2002            64 iterations.
2003         */
2004         if ((i & 63) == 0) {
2005             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2006             if (t1 > MAX_WAIT) {
2007                 DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
2008                         t1, i);
2009                 break;
2010             }
2011         }
2012         i++;
2013     }
2014     flush_compressed_data(f);
2015     rcu_read_unlock();
2016 
2017     /*
2018      * Must occur before EOS (or any QEMUFile operation)
2019      * because of RDMA protocol.
2020      */
2021     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2022 
2023     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2024     bytes_transferred += 8;
2025 
2026     ret = qemu_file_get_error(f);
2027     if (ret < 0) {
2028         return ret;
2029     }
2030 
2031     return pages_sent;
2032 }
2033 
2034 /* Called with iothread lock */
2035 static int ram_save_complete(QEMUFile *f, void *opaque)
2036 {
2037     rcu_read_lock();
2038 
2039     if (!migration_in_postcopy(migrate_get_current())) {
2040         migration_bitmap_sync();
2041     }
2042 
2043     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2044 
2045     /* try transferring iterative blocks of memory */
2046 
2047     /* flush all remaining blocks regardless of rate limiting */
2048     while (true) {
2049         int pages;
2050 
2051         pages = ram_find_and_save_block(f, true, &bytes_transferred);
2052         /* no more blocks to send */
2053         if (pages == 0) {
2054             break;
2055         }
2056     }
2057 
2058     flush_compressed_data(f);
2059     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2060 
2061     rcu_read_unlock();
2062 
2063     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2064 
2065     return 0;
2066 }
2067 
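/*
 * Report how much data is still to be sent.  If the estimate is already
 * below max_size, resync the dirty bitmap (under the iothread lock) to
 * refine it.  All remaining RAM is reported as postcopiable.
 */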
2068 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2069                              uint64_t *non_postcopiable_pending,
2070                              uint64_t *postcopiable_pending)
2071 {
2072     uint64_t remaining_size;
2073 
2074     remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2075 
2076     if (!migration_in_postcopy(migrate_get_current()) &&
2077         remaining_size < max_size) {
2078         qemu_mutex_lock_iothread();
2079         rcu_read_lock();
2080         migration_bitmap_sync();
2081         rcu_read_unlock();
2082         qemu_mutex_unlock_iothread();
2083         remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2084     }
2085 
2086     /* We can do postcopy, and all the data is postcopiable */
2087     *postcopiable_pending += remaining_size;
2088 }
2089 
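/*
 * Load one XBZRLE-encoded page.  The on-wire format read below is a one-byte
 * flags field (which must be ENCODING_FLAG_XBZRLE), a be16 length, then the
 * encoded data, which is decoded over the current contents of 'host'.
 */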
2090 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2091 {
2092     unsigned int xh_len;
2093     int xh_flags;
2094     uint8_t *loaded_data;
2095 
2096     if (!xbzrle_decoded_buf) {
2097         xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2098     }
2099     loaded_data = xbzrle_decoded_buf;
2100 
2101     /* extract RLE header */
2102     xh_flags = qemu_get_byte(f);
2103     xh_len = qemu_get_be16(f);
2104 
2105     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2106         error_report("Failed to load XBZRLE page - wrong compression!");
2107         return -1;
2108     }
2109 
2110     if (xh_len > TARGET_PAGE_SIZE) {
2111         error_report("Failed to load XBZRLE page - len overflow!");
2112         return -1;
2113     }
2114     /* load data and decode */
2115     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2116 
2117     /* decode RLE */
2118     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2119                              TARGET_PAGE_SIZE) == -1) {
2120         error_report("Failed to load XBZRLE page - decode error!");
2121         return -1;
2122     }
2123 
2124     return 0;
2125 }
2126 
2127 /*
2128  * Read a RAMBlock ID from the stream f and return the matching RAMBlock.
2129  *
2130  * Must be called from within an RCU critical section; returns a pointer
2131  * from within the RCU-protected ram_list.
2132  *
2133  * f: Stream to read from
2134  * flags: Page flags (mostly to see if it's a continuation of a previous block)
2135  */
2136 static inline RAMBlock *ram_block_from_stream(QEMUFile *f,
2137                                               int flags)
2138 {
2139     static RAMBlock *block = NULL;
2140     char id[256];
2141     uint8_t len;
2142 
2143     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2144         if (!block) {
2145             error_report("Ack, bad migration stream!");
2146             return NULL;
2147         }
2148         return block;
2149     }
2150 
2151     len = qemu_get_byte(f);
2152     qemu_get_buffer(f, (uint8_t *)id, len);
2153     id[len] = 0;
2154 
2155     block = qemu_ram_block_by_name(id);
2156     if (!block) {
2157         error_report("Can't find block %s", id);
2158         return NULL;
2159     }
2160 
2161     return block;
2162 }
2163 
2164 static inline void *host_from_ram_block_offset(RAMBlock *block,
2165                                                ram_addr_t offset)
2166 {
2167     if (!offset_in_ramblock(block, offset)) {
2168         return NULL;
2169     }
2170 
2171     return block->host + offset;
2172 }
2173 
2174 /*
2175  * If a page (or a whole RDMA chunk) has been
2176  * determined to be zero, then zap it.
2177  */
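/*
 * Note on the check below: pages that are already zero are left untouched,
 * which avoids writing to host pages that don't need it.
 */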
2178 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2179 {
2180     if (ch != 0 || !is_zero_range(host, size)) {
2181         memset(host, ch, size);
2182     }
2183 }
2184 
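/*
 * Body of each decompression thread: wait on param->cond for a compressed
 * buffer, uncompress it straight into the destination page, then mark the
 * slot done and signal decomp_done_cond so the load loop can reuse it.
 */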
2185 static void *do_data_decompress(void *opaque)
2186 {
2187     DecompressParam *param = opaque;
2188     unsigned long pagesize;
2189     uint8_t *des;
2190     int len;
2191 
2192     qemu_mutex_lock(&param->mutex);
2193     while (!param->quit) {
2194         if (param->des) {
2195             des = param->des;
2196             len = param->len;
2197             param->des = 0;
2198             qemu_mutex_unlock(&param->mutex);
2199 
2200             pagesize = TARGET_PAGE_SIZE;
2201             /* uncompress() can fail in some cases, especially when the
2202              * page was dirtied while being compressed.  That's not a
2203              * problem, because the dirty page will be retransferred
2204              * and uncompress() won't corrupt the data in other pages.
2205              */
2206             uncompress((Bytef *)des, &pagesize,
2207                        (const Bytef *)param->compbuf, len);
2208 
2209             qemu_mutex_lock(&decomp_done_lock);
2210             param->done = true;
2211             qemu_cond_signal(&decomp_done_cond);
2212             qemu_mutex_unlock(&decomp_done_lock);
2213 
2214             qemu_mutex_lock(&param->mutex);
2215         } else {
2216             qemu_cond_wait(&param->cond, &param->mutex);
2217         }
2218     }
2219     qemu_mutex_unlock(&param->mutex);
2220 
2221     return NULL;
2222 }
2223 
2224 static void wait_for_decompress_done(void)
2225 {
2226     int idx, thread_count;
2227 
2228     if (!migrate_use_compression()) {
2229         return;
2230     }
2231 
2232     thread_count = migrate_decompress_threads();
2233     qemu_mutex_lock(&decomp_done_lock);
2234     for (idx = 0; idx < thread_count; idx++) {
2235         while (!decomp_param[idx].done) {
2236             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2237         }
2238     }
2239     qemu_mutex_unlock(&decomp_done_lock);
2240 }
2241 
2242 void migrate_decompress_threads_create(void)
2243 {
2244     int i, thread_count;
2245 
2246     thread_count = migrate_decompress_threads();
2247     decompress_threads = g_new0(QemuThread, thread_count);
2248     decomp_param = g_new0(DecompressParam, thread_count);
2249     qemu_mutex_init(&decomp_done_lock);
2250     qemu_cond_init(&decomp_done_cond);
2251     for (i = 0; i < thread_count; i++) {
2252         qemu_mutex_init(&decomp_param[i].mutex);
2253         qemu_cond_init(&decomp_param[i].cond);
2254         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2255         decomp_param[i].done = true;
2256         decomp_param[i].quit = false;
2257         qemu_thread_create(decompress_threads + i, "decompress",
2258                            do_data_decompress, decomp_param + i,
2259                            QEMU_THREAD_JOINABLE);
2260     }
2261 }
2262 
2263 void migrate_decompress_threads_join(void)
2264 {
2265     int i, thread_count;
2266 
2267     thread_count = migrate_decompress_threads();
2268     for (i = 0; i < thread_count; i++) {
2269         qemu_mutex_lock(&decomp_param[i].mutex);
2270         decomp_param[i].quit = true;
2271         qemu_cond_signal(&decomp_param[i].cond);
2272         qemu_mutex_unlock(&decomp_param[i].mutex);
2273     }
2274     for (i = 0; i < thread_count; i++) {
2275         qemu_thread_join(decompress_threads + i);
2276         qemu_mutex_destroy(&decomp_param[i].mutex);
2277         qemu_cond_destroy(&decomp_param[i].cond);
2278         g_free(decomp_param[i].compbuf);
2279     }
2280     g_free(decompress_threads);
2281     g_free(decomp_param);
2282     decompress_threads = NULL;
2283     decomp_param = NULL;
2284 }
2285 
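/*
 * Hand one compressed page to an idle decompression thread, blocking on
 * decomp_done_cond until a slot becomes free.
 */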
2286 static void decompress_data_with_multi_threads(QEMUFile *f,
2287                                                void *host, int len)
2288 {
2289     int idx, thread_count;
2290 
2291     thread_count = migrate_decompress_threads();
2292     qemu_mutex_lock(&decomp_done_lock);
2293     while (true) {
2294         for (idx = 0; idx < thread_count; idx++) {
2295             if (decomp_param[idx].done) {
2296                 decomp_param[idx].done = false;
2297                 qemu_mutex_lock(&decomp_param[idx].mutex);
2298                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2299                 decomp_param[idx].des = host;
2300                 decomp_param[idx].len = len;
2301                 qemu_cond_signal(&decomp_param[idx].cond);
2302                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2303                 break;
2304             }
2305         }
2306         if (idx < thread_count) {
2307             break;
2308         } else {
2309             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2310         }
2311     }
2312     qemu_mutex_unlock(&decomp_done_lock);
2313 }
2314 
2315 /*
2316  * Allocate the data structures etc needed by incoming migration with postcopy-ram;
2317  * postcopy-ram's similarly named postcopy_ram_incoming_init does the work.
2318  */
2319 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2320 {
2321     size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2322 
2323     return postcopy_ram_incoming_init(mis, ram_pages);
2324 }
2325 
2326 /*
2327  * Called in postcopy mode by ram_load().
2328  * rcu_read_lock is taken prior to this being called.
2329  */
2330 static int ram_load_postcopy(QEMUFile *f)
2331 {
2332     int flags = 0, ret = 0;
2333     bool place_needed = false;
2334     bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE;
2335     MigrationIncomingState *mis = migration_incoming_get_current();
2336     /* Temporary page that is later 'placed' */
2337     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2338     void *last_host = NULL;
2339     bool all_zero = false;
2340 
2341     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2342         ram_addr_t addr;
2343         void *host = NULL;
2344         void *page_buffer = NULL;
2345         void *place_source = NULL;
2346         uint8_t ch;
2347 
2348         addr = qemu_get_be64(f);
2349         flags = addr & ~TARGET_PAGE_MASK;
2350         addr &= TARGET_PAGE_MASK;
2351 
2352         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2353         place_needed = false;
2354         if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
2355             RAMBlock *block = ram_block_from_stream(f, flags);
2356 
2357             host = host_from_ram_block_offset(block, addr);
2358             if (!host) {
2359                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2360                 ret = -EINVAL;
2361                 break;
2362             }
2363             /*
2364              * Postcopy requires that we place whole host pages atomically.
2365              * To make it atomic, the data is read into a temporary page
2366              * that's moved into place later.
2367              * The migration protocol uses (possibly smaller) target pages;
2368              * however, the source ensures it always sends all the components
2369              * of a host page in order.
2370              */
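            /*
             * Illustrative example: with 4K target pages and a 16K host
             * page, the four target pages of one host page arrive in order
             * and are staged at offsets 0x0000/0x1000/0x2000/0x3000 of
             * postcopy_host_page before the single atomic place below.
             */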
2371             page_buffer = postcopy_host_page +
2372                           ((uintptr_t)host & ~qemu_host_page_mask);
2373             /* 1st TP within the HP: assume all zero so the place can be optimised */
2374             if (!((uintptr_t)host & ~qemu_host_page_mask)) {
2375                 all_zero = true;
2376             } else {
2377                 /* not the 1st TP within the HP */
2378                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2379                     error_report("Non-sequential target page %p/%p",
2380                                   host, last_host);
2381                     ret = -EINVAL;
2382                     break;
2383                 }
2384             }
2385 
2386 
2387             /*
2388              * If it's the last part of a host page then we place the host
2389              * page
2390              */
2391             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2392                                      ~qemu_host_page_mask) == 0;
2393             place_source = postcopy_host_page;
2394         }
2395         last_host = host;
2396 
2397         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2398         case RAM_SAVE_FLAG_COMPRESS:
2399             ch = qemu_get_byte(f);
2400             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2401             if (ch) {
2402                 all_zero = false;
2403             }
2404             break;
2405 
2406         case RAM_SAVE_FLAG_PAGE:
2407             all_zero = false;
2408             if (!place_needed || !matching_page_sizes) {
2409                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2410             } else {
2411                 /* Avoid the extra qemu_file copy: postcopy is going to
2412                  * copy the page into place later anyway; we can only do
2413                  * this when the read is done in one go (matching page sizes)
2414                  */
2415                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2416                                          TARGET_PAGE_SIZE);
2417             }
2418             break;
2419         case RAM_SAVE_FLAG_EOS:
2420             /* normal exit */
2421             break;
2422         default:
2423             error_report("Unknown combination of migration flags: %#x"
2424                          " (postcopy mode)", flags);
2425             ret = -EINVAL;
2426         }
2427 
2428         if (place_needed) {
2429             /* We get here on the last target page of the host page */
2430             if (all_zero) {
2431                 ret = postcopy_place_page_zero(mis,
2432                                                host + TARGET_PAGE_SIZE -
2433                                                qemu_host_page_size);
2434             } else {
2435                 ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE -
2436                                                qemu_host_page_size,
2437                                                place_source);
2438             }
2439         }
2440         if (!ret) {
2441             ret = qemu_file_get_error(f);
2442         }
2443     }
2444 
2445     return ret;
2446 }
2447 
2448 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2449 {
2450     int flags = 0, ret = 0;
2451     static uint64_t seq_iter;
2452     int len = 0;
2453     /*
2454      * If the system is running in postcopy mode, page inserts to host memory must
2455      * be atomic
2456      */
2457     bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2458 
2459     seq_iter++;
2460 
2461     if (version_id != 4) {
2462         ret = -EINVAL;
2463     }
2464 
2465     /* This RCU critical section can be very long running.
2466      * When RCU reclaims in the code start to become numerous,
2467      * it will be necessary to reduce the granularity of this
2468      * critical section.
2469      */
2470     rcu_read_lock();
2471 
2472     if (postcopy_running) {
2473         ret = ram_load_postcopy(f);
2474     }
2475 
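    /*
     * Precopy load loop.  Each record starts with a be64 whose low bits are
     * RAM_SAVE_FLAG_* flags; the rest is the page address (or, for MEM_SIZE,
     * the total RAM size).  The payload depends on the flags: e.g. a single
     * fill byte for COMPRESS, a whole page for PAGE, a be32 length plus
     * compressed data for COMPRESS_PAGE, or an XBZRLE header plus data for
     * XBZRLE.
     */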
2476     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2477         ram_addr_t addr, total_ram_bytes;
2478         void *host = NULL;
2479         uint8_t ch;
2480 
2481         addr = qemu_get_be64(f);
2482         flags = addr & ~TARGET_PAGE_MASK;
2483         addr &= TARGET_PAGE_MASK;
2484 
2485         if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2486                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2487             RAMBlock *block = ram_block_from_stream(f, flags);
2488 
2489             host = host_from_ram_block_offset(block, addr);
2490             if (!host) {
2491                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2492                 ret = -EINVAL;
2493                 break;
2494             }
2495         }
2496 
2497         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2498         case RAM_SAVE_FLAG_MEM_SIZE:
2499             /* Synchronize RAM block list */
2500             total_ram_bytes = addr;
2501             while (!ret && total_ram_bytes) {
2502                 RAMBlock *block;
2503                 char id[256];
2504                 ram_addr_t length;
2505 
2506                 len = qemu_get_byte(f);
2507                 qemu_get_buffer(f, (uint8_t *)id, len);
2508                 id[len] = 0;
2509                 length = qemu_get_be64(f);
2510 
2511                 block = qemu_ram_block_by_name(id);
2512                 if (block) {
2513                     if (length != block->used_length) {
2514                         Error *local_err = NULL;
2515 
2516                         ret = qemu_ram_resize(block, length,
2517                                               &local_err);
2518                         if (local_err) {
2519                             error_report_err(local_err);
2520                         }
2521                     }
2522                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2523                                           block->idstr);
2524                 } else {
2525                     error_report("Unknown ramblock \"%s\", cannot "
2526                                  "accept migration", id);
2527                     ret = -EINVAL;
2528                 }
2529 
2530                 total_ram_bytes -= length;
2531             }
2532             break;
2533 
2534         case RAM_SAVE_FLAG_COMPRESS:
2535             ch = qemu_get_byte(f);
2536             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2537             break;
2538 
2539         case RAM_SAVE_FLAG_PAGE:
2540             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2541             break;
2542 
2543         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2544             len = qemu_get_be32(f);
2545             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2546                 error_report("Invalid compressed data length: %d", len);
2547                 ret = -EINVAL;
2548                 break;
2549             }
2550             decompress_data_with_multi_threads(f, host, len);
2551             break;
2552 
2553         case RAM_SAVE_FLAG_XBZRLE:
2554             if (load_xbzrle(f, addr, host) < 0) {
2555                 error_report("Failed to decompress XBZRLE page at "
2556                              RAM_ADDR_FMT, addr);
2557                 ret = -EINVAL;
2558                 break;
2559             }
2560             break;
2561         case RAM_SAVE_FLAG_EOS:
2562             /* normal exit */
2563             break;
2564         default:
2565             if (flags & RAM_SAVE_FLAG_HOOK) {
2566                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2567             } else {
2568                 error_report("Unknown combination of migration flags: %#x",
2569                              flags);
2570                 ret = -EINVAL;
2571             }
2572         }
2573         if (!ret) {
2574             ret = qemu_file_get_error(f);
2575         }
2576     }
2577 
2578     wait_for_decompress_done();
2579     rcu_read_unlock();
2580     DPRINTF("Completed load of VM with exit code %d seq iteration "
2581             "%" PRIu64 "\n", ret, seq_iter);
2582     return ret;
2583 }
2584 
2585 static SaveVMHandlers savevm_ram_handlers = {
2586     .save_live_setup = ram_save_setup,
2587     .save_live_iterate = ram_save_iterate,
2588     .save_live_complete_postcopy = ram_save_complete,
2589     .save_live_complete_precopy = ram_save_complete,
2590     .save_live_pending = ram_save_pending,
2591     .load_state = ram_load,
2592     .cleanup = ram_migration_cleanup,
2593 };
2594 
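/*
 * Register the "ram" live migration handlers; the section version
 * registered here (4) must match the version_id check in ram_load().
 */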
2595 void ram_mig_init(void)
2596 {
2597     qemu_mutex_init(&XBZRLE.lock);
2598     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
2599 }
2600