xref: /qemu/migration/multifd.c (revision d9d3e4f2)
1 /*
2  * Multifd common code
3  *
4  * Copyright (c) 2019-2020 Red Hat Inc
5  *
6  * Authors:
7  *  Juan Quintela <quintela@redhat.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2 or later.
10  * See the COPYING file in the top-level directory.
11  */
12 
13 #include "qemu/osdep.h"
14 #include "qemu/cutils.h"
15 #include "qemu/rcu.h"
16 #include "exec/target_page.h"
17 #include "sysemu/sysemu.h"
18 #include "exec/ramblock.h"
19 #include "qemu/error-report.h"
20 #include "qapi/error.h"
21 #include "file.h"
22 #include "migration.h"
23 #include "migration-stats.h"
24 #include "socket.h"
25 #include "tls.h"
26 #include "qemu-file.h"
27 #include "trace.h"
28 #include "multifd.h"
29 #include "threadinfo.h"
30 #include "options.h"
31 #include "qemu/yank.h"
32 #include "io/channel-file.h"
33 #include "io/channel-socket.h"
34 #include "yank_functions.h"
35 
/* Multiple fd's */

/* Handshake constants; validated by the receiving side */
#define MULTIFD_MAGIC 0x11223344U
#define MULTIFD_VERSION 1
40 
/*
 * Per-channel handshake message, sent once right after a channel is
 * established and before any data packets.  Multi-byte fields are in
 * big endian on the wire (see multifd_send_initial_packet()).
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    unsigned char uuid[16]; /* QemuUUID */
    uint8_t id;             /* channel id, < migrate_multifd_channels() */
    uint8_t unused1[7];     /* Reserved for future use */
    uint64_t unused2[4];    /* Reserved for future use */
} __attribute__((packed)) MultiFDInit_t;
49 
/*
 * Send-side state shared between the migration thread and the multifd
 * send threads.  Freed by multifd_send_cleanup_state().
 */
struct {
    /* One entry per channel */
    MultiFDSendParams *params;
    /* array of pages to be sent */
    MultiFDPages_t *pages;
    /*
     * Global number of generated multifd packets.
     *
     * Note that we used 'uintptr_t' because it'll naturally support atomic
     * operations on both 32bit / 64 bits hosts.  It means on 32bit systems
     * multifd will overflow the packet_num easier, but that should be
     * fine.
     *
     * Another option is to use QEMU's Stat64 then it'll be 64 bits on all
     * hosts, however so far it does not support atomic fetch_add() yet.
     * Make it easy for now.
     */
    uintptr_t packet_num;
    /*
     * Synchronization point past which no more channels will be
     * created.
     */
    QemuSemaphore channels_created;
    /* send channels ready */
    QemuSemaphore channels_ready;
    /*
     * Have we already run terminate threads.  There is a race when it
     * happens that we got one error while we are exiting.
     * We will use atomic operations.  Only valid values are 0 and 1.
     */
    int exiting;
    /* multifd ops */
    MultiFDMethods *ops;
} *multifd_send_state;
83 
/* Receive-side counterpart of multifd_send_state. */
struct {
    /* One entry per channel */
    MultiFDRecvParams *params;
    MultiFDRecvData *data;
    /* number of created threads */
    int count;
    /*
     * This is always posted by the recv threads, the migration thread
     * uses it to wait for recv threads to finish assigned tasks.
     */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* Only 0 or 1; read atomically (see multifd_recv_should_exit()) */
    int exiting;
    /* multifd ops */
    MultiFDMethods *ops;
} *multifd_recv_state;
100 
/* Packet-based framing is used unless mapped-ram migration is active. */
static bool multifd_use_packets(void)
{
    if (migrate_mapped_ram()) {
        return false;
    }

    return true;
}
105 
/*
 * Account for one more send channel having completed its creation step
 * by posting the channels_created semaphore.
 */
void multifd_send_channel_created(void)
{
    qemu_sem_post(&multifd_send_state->channels_created);
}
110 
/*
 * Record in the ramblock's file bitmap which of the queued pages are
 * normal (true) and which are zero pages (false).
 */
static void multifd_set_file_bitmap(MultiFDSendParams *p)
{
    MultiFDPages_t *pages = p->pages;

    assert(pages->block);

    /*
     * Entries below normal_num are normal pages; the remainder up to
     * num are zero pages.
     */
    for (int idx = 0; idx < pages->num; idx++) {
        bool is_normal = idx < pages->normal_num;

        ramblock_set_file_bmap_atomic(pages->block, pages->offset[idx],
                                      is_normal);
    }
}
125 
126 /* Multifd without compression */
127 
128 /**
129  * nocomp_send_setup: setup send side
130  *
131  * @p: Params for the channel that we are using
132  * @errp: pointer to an error
133  */
nocomp_send_setup(MultiFDSendParams * p,Error ** errp)134 static int nocomp_send_setup(MultiFDSendParams *p, Error **errp)
135 {
136     if (migrate_zero_copy_send()) {
137         p->write_flags |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY;
138     }
139 
140     if (multifd_use_packets()) {
141         /* We need one extra place for the packet header */
142         p->iov = g_new0(struct iovec, p->page_count + 1);
143     } else {
144         p->iov = g_new0(struct iovec, p->page_count);
145     }
146 
147     return 0;
148 }
149 
150 /**
151  * nocomp_send_cleanup: cleanup send side
152  *
153  * For no compression this function does nothing.
154  *
155  * @p: Params for the channel that we are using
156  * @errp: pointer to an error
157  */
nocomp_send_cleanup(MultiFDSendParams * p,Error ** errp)158 static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp)
159 {
160     g_free(p->iov);
161     p->iov = NULL;
162     return;
163 }
164 
/*
 * Append one iovec per normal page, pointing straight into guest
 * memory, and set next_packet_size to the total payload size.
 */
static void multifd_send_prepare_iovs(MultiFDSendParams *p)
{
    MultiFDPages_t *pages = p->pages;
    int idx;

    for (idx = 0; idx < pages->normal_num; idx++) {
        struct iovec *iov = &p->iov[p->iovs_num++];

        iov->iov_base = pages->block->host + pages->offset[idx];
        iov->iov_len = p->page_size;
    }

    p->next_packet_size = pages->normal_num * p->page_size;
}
177 
/**
 * nocomp_send_prepare: prepare data to be able to send
 *
 * For no compression we just have to calculate the size of the
 * packet.
 *
 * Returns 0 for success or -1 for error
 *
 * @p: Params for the channel that we are using
 * @errp: pointer to an error
 */
static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
{
    bool use_zero_copy_send = migrate_zero_copy_send();
    int ret;

    /* Detect zero pages first so the iovs only cover normal pages */
    multifd_send_zero_page_detect(p);

    if (!multifd_use_packets()) {
        /* Mapped-ram: no packet header; record pages in the file bitmap */
        multifd_send_prepare_iovs(p);
        multifd_set_file_bitmap(p);

        return 0;
    }

    if (!use_zero_copy_send) {
        /*
         * Only !zerocopy needs the header in IOV; zerocopy will
         * send it separately.
         */
        multifd_send_prepare_header(p);
    }

    multifd_send_prepare_iovs(p);
    p->flags |= MULTIFD_FLAG_NOCOMP;

    /* Must run after the iovs are set up: uses next_packet_size */
    multifd_send_fill_packet(p);

    if (use_zero_copy_send) {
        /* Send header first, without zerocopy */
        ret = qio_channel_write_all(p->c, (void *)p->packet,
                                    p->packet_len, errp);
        if (ret != 0) {
            return -1;
        }
    }

    return 0;
}
227 
/**
 * nocomp_recv_setup: setup receive side
 *
 * Allocate the iovec array used to read pages directly into guest
 * memory.
 *
 * Returns 0 (cannot fail)
 *
 * @p: Params for the channel that we are using
 * @errp: pointer to an error
 */
static int nocomp_recv_setup(MultiFDRecvParams *p, Error **errp)
{
    p->iov = g_new0(struct iovec, p->page_count);
    return 0;
}
243 
/**
 * nocomp_recv_cleanup: cleanup receive side
 *
 * Release the iovec array allocated by nocomp_recv_setup().
 *
 * @p: Params for the channel that we are using
 */
static void nocomp_recv_cleanup(MultiFDRecvParams *p)
{
    g_free(p->iov);
    p->iov = NULL;
}
256 
/**
 * nocomp_recv: read the data from the channel
 *
 * For no compression we just need to read things into the correct place.
 *
 * Returns 0 for success or -1 for error
 *
 * @p: Params for the channel that we are using
 * @errp: pointer to an error
 */
static int nocomp_recv(MultiFDRecvParams *p, Error **errp)
{
    uint32_t flags;

    if (!multifd_use_packets()) {
        /* Mapped-ram: data comes from the file, not from a packet */
        return multifd_file_recv_data(p, errp);
    }

    flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK;

    if (flags != MULTIFD_FLAG_NOCOMP) {
        error_setg(errp, "multifd %u: flags received %x flags expected %x",
                   p->id, flags, MULTIFD_FLAG_NOCOMP);
        return -1;
    }

    multifd_recv_zero_page_process(p);

    if (!p->normal_num) {
        /* No normal pages: nothing left to read from the channel */
        return 0;
    }

    /* Read every normal page straight into its place in guest memory */
    for (int i = 0; i < p->normal_num; i++) {
        p->iov[i].iov_base = p->host + p->normal[i];
        p->iov[i].iov_len = p->page_size;
        ramblock_recv_bitmap_set_offset(p->block, p->normal[i]);
    }
    return qio_channel_readv_all(p->c, p->iov, p->normal_num, errp);
}
296 
/* Method table for migration without compression */
static MultiFDMethods multifd_nocomp_ops = {
    .send_setup = nocomp_send_setup,
    .send_cleanup = nocomp_send_cleanup,
    .send_prepare = nocomp_send_prepare,
    .recv_setup = nocomp_recv_setup,
    .recv_cleanup = nocomp_recv_cleanup,
    .recv = nocomp_recv
};
305 
/*
 * Indexed by MultiFDCompression; compression backends fill their slot
 * via multifd_register_ops().
 */
static MultiFDMethods *multifd_ops[MULTIFD_COMPRESSION__MAX] = {
    [MULTIFD_COMPRESSION_NONE] = &multifd_nocomp_ops,
};
309 
/* Install the method table for one compression method. */
void multifd_register_ops(int method, MultiFDMethods *ops)
{
    /* Slot 0 (no compression) is built in and must not be overridden */
    assert(method > 0 && method < MULTIFD_COMPRESSION__MAX);
    multifd_ops[method] = ops;
}
315 
316 /* Reset a MultiFDPages_t* object for the next use */
multifd_pages_reset(MultiFDPages_t * pages)317 static void multifd_pages_reset(MultiFDPages_t *pages)
318 {
319     /*
320      * We don't need to touch offset[] array, because it will be
321      * overwritten later when reused.
322      */
323     pages->num = 0;
324     pages->normal_num = 0;
325     pages->block = NULL;
326 }
327 
multifd_send_initial_packet(MultiFDSendParams * p,Error ** errp)328 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
329 {
330     MultiFDInit_t msg = {};
331     size_t size = sizeof(msg);
332     int ret;
333 
334     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
335     msg.version = cpu_to_be32(MULTIFD_VERSION);
336     msg.id = p->id;
337     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
338 
339     ret = qio_channel_write_all(p->c, (char *)&msg, size, errp);
340     if (ret != 0) {
341         return -1;
342     }
343     stat64_add(&mig_stats.multifd_bytes, size);
344     return 0;
345 }
346 
/*
 * Read and validate a channel's handshake message.  Returns the channel
 * id on success, or -1 with *errp set on failure.
 */
static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
{
    MultiFDInit_t msg;
    int ret;

    ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
    if (ret != 0) {
        return -1;
    }

    /* Wire fields are big endian */
    msg.magic = be32_to_cpu(msg.magic);
    msg.version = be32_to_cpu(msg.version);

    if (msg.magic != MULTIFD_MAGIC) {
        error_setg(errp, "multifd: received packet magic %x "
                   "expected %x", msg.magic, MULTIFD_MAGIC);
        return -1;
    }

    if (msg.version != MULTIFD_VERSION) {
        error_setg(errp, "multifd: received packet version %u "
                   "expected %u", msg.version, MULTIFD_VERSION);
        return -1;
    }

    /* Reject channels from a different QEMU instance */
    if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
        char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
        char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);

        error_setg(errp, "multifd: received uuid '%s' and expected "
                   "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
        g_free(uuid);
        g_free(msg_uuid);
        return -1;
    }

    if (msg.id > migrate_multifd_channels()) {
        error_setg(errp, "multifd: received channel id %u is greater than "
                   "number of channels %u", msg.id, migrate_multifd_channels());
        return -1;
    }

    return msg.id;
}
391 
multifd_pages_init(uint32_t n)392 static MultiFDPages_t *multifd_pages_init(uint32_t n)
393 {
394     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
395 
396     pages->allocated = n;
397     pages->offset = g_new0(ram_addr_t, n);
398 
399     return pages;
400 }
401 
/* Free a MultiFDPages_t allocated by multifd_pages_init(). */
static void multifd_pages_clear(MultiFDPages_t *pages)
{
    multifd_pages_reset(pages);
    g_free(pages->offset);
    pages->offset = NULL;
    pages->allocated = 0;
    g_free(pages);
}
410 
/*
 * Build the on-wire packet header in p->packet from p->pages,
 * converting multi-byte fields to big endian, and update the
 * per-channel statistics.
 */
void multifd_send_fill_packet(MultiFDSendParams *p)
{
    MultiFDPacket_t *packet = p->packet;
    MultiFDPages_t *pages = p->pages;
    uint64_t packet_num;
    /* entries past normal_num in pages->offset[] are zero pages */
    uint32_t zero_num = pages->num - pages->normal_num;
    int i;

    packet->flags = cpu_to_be32(p->flags);
    packet->pages_alloc = cpu_to_be32(p->pages->allocated);
    packet->normal_pages = cpu_to_be32(pages->normal_num);
    packet->zero_pages = cpu_to_be32(zero_num);
    packet->next_packet_size = cpu_to_be32(p->next_packet_size);

    /* Global counter shared by all send threads; atomically bumped */
    packet_num = qatomic_fetch_inc(&multifd_send_state->packet_num);
    packet->packet_num = cpu_to_be64(packet_num);

    if (pages->block) {
        /* receiver NUL-terminates defensively before using this name */
        strncpy(packet->ramblock, pages->block->idstr, 256);
    }

    for (i = 0; i < pages->num; i++) {
        /* there are architectures where ram_addr_t is 32 bit */
        uint64_t temp = pages->offset[i];

        packet->offset[i] = cpu_to_be64(temp);
    }

    p->packets_sent++;
    p->total_normal_pages += pages->normal_num;
    p->total_zero_pages += zero_num;

    trace_multifd_send(p->id, packet_num, pages->normal_num, zero_num,
                       p->flags, p->next_packet_size);
}
446 
/*
 * Parse and validate the packet header read into p->packet: byteswap
 * fields from big endian, sanity check the page counts against what
 * this side allocated, resolve the ramblock and extract the page
 * offsets.  Returns 0 on success, -1 with *errp set on failure.
 */
static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
{
    MultiFDPacket_t *packet = p->packet;
    int i;

    packet->magic = be32_to_cpu(packet->magic);
    if (packet->magic != MULTIFD_MAGIC) {
        error_setg(errp, "multifd: received packet "
                   "magic %x and expected magic %x",
                   packet->magic, MULTIFD_MAGIC);
        return -1;
    }

    packet->version = be32_to_cpu(packet->version);
    if (packet->version != MULTIFD_VERSION) {
        error_setg(errp, "multifd: received packet "
                   "version %u and expected version %u",
                   packet->version, MULTIFD_VERSION);
        return -1;
    }

    p->flags = be32_to_cpu(packet->flags);

    packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
    /*
     * The sender must never announce more pages than this side has
     * allocated room for.
     */
    if (packet->pages_alloc > p->page_count) {
        error_setg(errp, "multifd: received packet "
                   "with size %u and expected a size of %u",
                   packet->pages_alloc, p->page_count) ;
        return -1;
    }

    p->normal_num = be32_to_cpu(packet->normal_pages);
    if (p->normal_num > packet->pages_alloc) {
        error_setg(errp, "multifd: received packet "
                   "with %u normal pages and expected maximum pages are %u",
                   p->normal_num, packet->pages_alloc) ;
        return -1;
    }

    p->zero_num = be32_to_cpu(packet->zero_pages);
    if (p->zero_num > packet->pages_alloc - p->normal_num) {
        error_setg(errp, "multifd: received packet "
                   "with %u zero pages and expected maximum zero pages are %u",
                   p->zero_num, packet->pages_alloc - p->normal_num) ;
        return -1;
    }

    p->next_packet_size = be32_to_cpu(packet->next_packet_size);
    p->packet_num = be64_to_cpu(packet->packet_num);
    p->packets_recved++;
    p->total_normal_pages += p->normal_num;
    p->total_zero_pages += p->zero_num;

    trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->zero_num,
                       p->flags, p->next_packet_size);

    if (p->normal_num == 0 && p->zero_num == 0) {
        /* No pages in this packet; nothing more to parse */
        return 0;
    }

    /* make sure that ramblock is 0 terminated */
    packet->ramblock[255] = 0;
    p->block = qemu_ram_block_by_name(packet->ramblock);
    if (!p->block) {
        error_setg(errp, "multifd: unknown ram block %s",
                   packet->ramblock);
        return -1;
    }

    p->host = p->block->host;
    /* Normal page offsets come first in packet->offset[] ... */
    for (i = 0; i < p->normal_num; i++) {
        uint64_t offset = be64_to_cpu(packet->offset[i]);

        if (offset > (p->block->used_length - p->page_size)) {
            error_setg(errp, "multifd: offset too long %" PRIu64
                       " (max " RAM_ADDR_FMT ")",
                       offset, p->block->used_length);
            return -1;
        }
        p->normal[i] = offset;
    }

    /* ... followed by the zero page offsets */
    for (i = 0; i < p->zero_num; i++) {
        uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]);

        if (offset > (p->block->used_length - p->page_size)) {
            error_setg(errp, "multifd: offset too long %" PRIu64
                       " (max " RAM_ADDR_FMT ")",
                       offset, p->block->used_length);
            return -1;
        }
        p->zero[i] = offset;
    }

    return 0;
}
547 
multifd_send_should_exit(void)548 static bool multifd_send_should_exit(void)
549 {
550     return qatomic_read(&multifd_send_state->exiting);
551 }
552 
multifd_recv_should_exit(void)553 static bool multifd_recv_should_exit(void)
554 {
555     return qatomic_read(&multifd_recv_state->exiting);
556 }
557 
/*
 * The migration thread can wait on either of the two semaphores.  This
 * function can be used to kick the main thread out of waiting on either of
 * them.  Should mostly only be called when something wrong happened with
 * the current multifd send thread.
 *
 * @p: the channel whose sem_sync gets posted
 */
static void multifd_send_kick_main(MultiFDSendParams *p)
{
    qemu_sem_post(&p->sem_sync);
    qemu_sem_post(&multifd_send_state->channels_ready);
}
569 
/*
 * How we use multifd_send_state->pages and channel->pages?
 *
 * We create a pages object for each channel, and a main one.  Each time
 * that we need to send a batch of pages we interchange the ones between
 * multifd_send_state and the channel that is sending it.  There are
 * two reasons for that:
 *    - to not have to do so many mallocs during migration
 *    - to make it easier to know what to free at the end of migration
 *
 * This way we always know who is the owner of each "pages" struct,
 * and we don't need any locking.  It belongs to the migration thread
 * or to the channel thread.  Switching is safe because the migration
 * thread is using the channel mutex when changing it, and the channel
 * has to have finished with its own, otherwise pending_job can't be
 * false.
 *
 * Returns true if succeeded, false otherwise.
 */
static bool multifd_send_pages(void)
{
    int i;
    static int next_channel;
    MultiFDSendParams *p = NULL; /* make happy gcc */
    MultiFDPages_t *pages = multifd_send_state->pages;

    if (multifd_send_should_exit()) {
        return false;
    }

    /* We wait here, until at least one channel is ready */
    qemu_sem_wait(&multifd_send_state->channels_ready);

    /*
     * next_channel can remain from a previous migration that was
     * using more channels, so ensure it doesn't overflow if the
     * limit is lower now.
     */
    next_channel %= migrate_multifd_channels();
    /* Round-robin scan for an idle channel, starting at next_channel */
    for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
        if (multifd_send_should_exit()) {
            return false;
        }
        p = &multifd_send_state->params[i];
        /*
         * Lockless read to p->pending_job is safe, because only multifd
         * sender thread can clear it.
         */
        if (qatomic_read(&p->pending_job) == false) {
            next_channel = (i + 1) % migrate_multifd_channels();
            break;
        }
    }

    /*
     * Make sure we read p->pending_job before all the rest.  Pairs with
     * qatomic_store_release() in multifd_send_thread().
     */
    smp_mb_acquire();
    assert(!p->pages->num);
    /* Swap the full pages object with the channel's (now empty) one */
    multifd_send_state->pages = p->pages;
    p->pages = pages;
    /*
     * Making sure p->pages is setup before marking pending_job=true. Pairs
     * with the qatomic_load_acquire() in multifd_send_thread().
     */
    qatomic_store_release(&p->pending_job, true);
    qemu_sem_post(&p->sem);

    return true;
}
641 
/* True when no pages are queued yet. */
static inline bool multifd_queue_empty(MultiFDPages_t *pages)
{
    return !pages->num;
}
646 
/* True when the queue has used up all allocated slots. */
static inline bool multifd_queue_full(MultiFDPages_t *pages)
{
    return pages->allocated == pages->num;
}
651 
/* Append one page offset; caller must ensure the queue is not full. */
static inline void multifd_enqueue(MultiFDPages_t *pages, ram_addr_t offset)
{
    pages->offset[pages->num] = offset;
    pages->num++;
}
656 
657 /* Returns true if enqueue successful, false otherwise */
multifd_queue_page(RAMBlock * block,ram_addr_t offset)658 bool multifd_queue_page(RAMBlock *block, ram_addr_t offset)
659 {
660     MultiFDPages_t *pages;
661 
662 retry:
663     pages = multifd_send_state->pages;
664 
665     /* If the queue is empty, we can already enqueue now */
666     if (multifd_queue_empty(pages)) {
667         pages->block = block;
668         multifd_enqueue(pages, offset);
669         return true;
670     }
671 
672     /*
673      * Not empty, meanwhile we need a flush.  It can because of either:
674      *
675      * (1) The page is not on the same ramblock of previous ones, or,
676      * (2) The queue is full.
677      *
678      * After flush, always retry.
679      */
680     if (pages->block != block || multifd_queue_full(pages)) {
681         if (!multifd_send_pages()) {
682             return false;
683         }
684         goto retry;
685     }
686 
687     /* Not empty, and we still have space, do it! */
688     multifd_enqueue(pages, offset);
689     return true;
690 }
691 
/* Multifd send side hit an error; remember it and prepare to quit */
static void multifd_send_set_error(Error *err)
{
    /*
     * We don't want to exit each threads twice.  Depending on where
     * we get the error, or if there are two independent errors in two
     * threads at the same time, we can end calling this function
     * twice.
     */
    if (qatomic_xchg(&multifd_send_state->exiting, 1)) {
        return;
    }

    if (err) {
        MigrationState *s = migrate_get_current();
        migrate_set_error(s, err);
        /* Only transition to FAILED from states where migration is live */
        if (s->state == MIGRATION_STATUS_SETUP ||
            s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
            s->state == MIGRATION_STATUS_DEVICE ||
            s->state == MIGRATION_STATUS_ACTIVE) {
            migrate_set_state(&s->state, s->state,
                              MIGRATION_STATUS_FAILED);
        }
    }
}
717 
/*
 * Flag every send thread to quit, kick each one out of any blocking
 * wait or IO system call, then join them all (including the TLS
 * handshake threads).
 */
static void multifd_send_terminate_threads(void)
{
    int i;

    trace_multifd_send_terminate_threads();

    /*
     * Tell everyone we're quitting.  No xchg() needed here; we simply
     * always set it.
     */
    qatomic_set(&multifd_send_state->exiting, 1);

    /*
     * Firstly, kick all threads out; no matter whether they are just idle,
     * or blocked in an IO system call.
     */
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        qemu_sem_post(&p->sem);
        if (p->c) {
            qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
        }
    }

    /*
     * Finally recycle all the threads.
     */
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        if (p->tls_thread_created) {
            qemu_thread_join(&p->tls_thread);
        }

        if (p->thread_created) {
            qemu_thread_join(&p->thread);
        }
    }
}
758 
/*
 * Release all per-channel resources.  Returns true on success, false
 * with *errp set when the ops->send_cleanup hook failed.
 */
static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp)
{
    if (p->c) {
        migration_ioc_unregister_yank(p->c);
        /*
         * The object_unref() cannot guarantee the fd will always be
         * released because finalize() of the iochannel is only
         * triggered on the last reference and it's not guaranteed
         * that we always hold the last refcount when reaching here.
         *
         * Closing the fd explicitly has the benefit that if there is any
         * registered I/O handler callbacks on such fd, that will get a
         * POLLNVAL event and will further trigger the cleanup to finally
         * release the IOC.
         *
         * FIXME: It should logically be guaranteed that all multifd
         * channels have no I/O handler callback registered when reaching
         * here, because migration thread will wait for all multifd channel
         * establishments to complete during setup.  Since
         * migrate_fd_cleanup() will be scheduled in main thread too, all
         * previous callbacks should guarantee to be completed when
         * reaching here.  See multifd_send_state.channels_created and its
         * usage.  In the future, we could replace this with an assert
         * making sure we're the last reference, or simply drop it if above
         * is more clear to be justified.
         */
        qio_channel_close(p->c, &error_abort);
        object_unref(OBJECT(p->c));
        p->c = NULL;
    }
    qemu_sem_destroy(&p->sem);
    qemu_sem_destroy(&p->sem_sync);
    g_free(p->name);
    p->name = NULL;
    multifd_pages_clear(p->pages);
    p->pages = NULL;
    p->packet_len = 0;
    g_free(p->packet);
    p->packet = NULL;
    /* Backend-specific cleanup runs last; it may set *errp */
    multifd_send_state->ops->send_cleanup(p, errp);

    return *errp == NULL;
}
802 
/* Tear down the global send state after all channels were cleaned up. */
static void multifd_send_cleanup_state(void)
{
    file_cleanup_outgoing_migration();
    socket_cleanup_outgoing_migration();
    qemu_sem_destroy(&multifd_send_state->channels_created);
    qemu_sem_destroy(&multifd_send_state->channels_ready);
    g_free(multifd_send_state->params);
    multifd_send_state->params = NULL;
    multifd_pages_clear(multifd_send_state->pages);
    multifd_send_state->pages = NULL;
    g_free(multifd_send_state);
    multifd_send_state = NULL;
}
816 
/*
 * Stop all send threads, then free every channel's resources and the
 * global send state.  No-op when multifd is not enabled.
 */
void multifd_send_shutdown(void)
{
    int i;

    if (!migrate_multifd()) {
        return;
    }

    multifd_send_terminate_threads();

    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];
        Error *local_err = NULL;

        /* Record (not abort on) per-channel cleanup failures */
        if (!multifd_send_cleanup_channel(p, &local_err)) {
            migrate_set_error(migrate_get_current(), local_err);
            error_free(local_err);
        }
    }

    multifd_send_cleanup_state();
}
839 
/*
 * Flush pending zero-copy writes on @c.  Returns the flush result
 * (>= 0) or -1 on error (reported, not propagated).
 */
static int multifd_zero_copy_flush(QIOChannel *c)
{
    Error *local_err = NULL;
    int outcome = qio_channel_flush(c, &local_err);

    if (outcome < 0) {
        error_report_err(local_err);
        return -1;
    }
    if (outcome == 1) {
        /* flush reported that zero copy was not used */
        stat64_add(&mig_stats.dirty_sync_missed_zero_copy, 1);
    }

    return outcome;
}
856 
/*
 * Flush any queued pages, then request a sync from every channel and
 * wait until each has acknowledged it.  Returns 0 on success, -1 on
 * error.
 */
int multifd_send_sync_main(void)
{
    int i;
    bool flush_zero_copy;

    if (!migrate_multifd()) {
        return 0;
    }
    /* Push out any partially filled batch first */
    if (multifd_send_state->pages->num) {
        if (!multifd_send_pages()) {
            error_report("%s: multifd_send_pages fail", __func__);
            return -1;
        }
    }

    flush_zero_copy = migrate_zero_copy_send();

    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        if (multifd_send_should_exit()) {
            return -1;
        }

        trace_multifd_send_sync_main_signal(p->id);

        /*
         * We should be the only user so far, so not possible to be set by
         * others concurrently.
         */
        assert(qatomic_read(&p->pending_sync) == false);
        qatomic_set(&p->pending_sync, true);
        qemu_sem_post(&p->sem);
    }
    /* Second pass: wait for every channel to acknowledge the sync */
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDSendParams *p = &multifd_send_state->params[i];

        if (multifd_send_should_exit()) {
            return -1;
        }

        qemu_sem_wait(&multifd_send_state->channels_ready);
        trace_multifd_send_sync_main_wait(p->id);
        qemu_sem_wait(&p->sem_sync);

        if (flush_zero_copy && p->c && (multifd_zero_copy_flush(p->c) < 0)) {
            return -1;
        }
    }
    trace_multifd_send_sync_main(multifd_send_state->packet_num);

    return 0;
}
910 
/*
 * Multifd send side worker thread, one per channel.
 *
 * Loop: post channels_ready to advertise this channel as idle, then
 * wait on p->sem for the migration thread to queue work.  A wakeup
 * means either a batch of pages to transmit (pending_job), a sync
 * request (pending_sync), or a request to exit.  On error the thread
 * records it via multifd_send_set_error() and kicks the main thread so
 * it does not block forever waiting on us.
 *
 * Returns NULL (the thread exit value is unused).
 */
static void *multifd_send_thread(void *opaque)
{
    MultiFDSendParams *p = opaque;
    MigrationThread *thread = NULL;
    Error *local_err = NULL;
    int ret = 0;
    bool use_packets = multifd_use_packets();

    thread = migration_threads_add(p->name, qemu_get_thread_id());

    trace_multifd_send_thread_start(p->id);
    rcu_register_thread();

    if (use_packets) {
        /* Identify this channel to the receiving side first */
        if (multifd_send_initial_packet(p, &local_err) < 0) {
            ret = -1;
            goto out;
        }
    }

    while (true) {
        qemu_sem_post(&multifd_send_state->channels_ready);
        qemu_sem_wait(&p->sem);

        if (multifd_send_should_exit()) {
            break;
        }

        /*
         * Read pending_job flag before p->pages.  Pairs with the
         * qatomic_store_release() in multifd_send_pages().
         */
        if (qatomic_load_acquire(&p->pending_job)) {
            MultiFDPages_t *pages = p->pages;

            p->iovs_num = 0;
            assert(pages->num);

            /* Compression-specific packet/iov preparation */
            ret = multifd_send_state->ops->send_prepare(p, &local_err);
            if (ret != 0) {
                break;
            }

            if (migrate_mapped_ram()) {
                /* Mapped-ram: write pages at fixed file offsets */
                ret = file_write_ramblock_iov(p->c, p->iov, p->iovs_num,
                                              p->pages->block, &local_err);
            } else {
                ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num,
                                                  NULL, 0, p->write_flags,
                                                  &local_err);
            }

            if (ret != 0) {
                break;
            }

            stat64_add(&mig_stats.multifd_bytes,
                       p->next_packet_size + p->packet_len);
            stat64_add(&mig_stats.normal_pages, pages->normal_num);
            stat64_add(&mig_stats.zero_pages, pages->num - pages->normal_num);

            multifd_pages_reset(p->pages);
            p->next_packet_size = 0;

            /*
             * Making sure p->pages is published before saying "we're
             * free".  Pairs with the smp_mb_acquire() in
             * multifd_send_pages().
             */
            qatomic_store_release(&p->pending_job, false);
        } else {
            /*
             * If not a normal job, must be a sync request.  Note that
             * pending_sync is a standalone flag (unlike pending_job), so
             * it doesn't require explicit memory barriers.
             */
            assert(qatomic_read(&p->pending_sync));

            if (use_packets) {
                p->flags = MULTIFD_FLAG_SYNC;
                multifd_send_fill_packet(p);
                ret = qio_channel_write_all(p->c, (void *)p->packet,
                                            p->packet_len, &local_err);
                if (ret != 0) {
                    break;
                }
                /* p->next_packet_size will always be zero for a SYNC packet */
                stat64_add(&mig_stats.multifd_bytes, p->packet_len);
                p->flags = 0;
            }

            qatomic_set(&p->pending_sync, false);
            qemu_sem_post(&p->sem_sync);
        }
    }

out:
    if (ret) {
        assert(local_err);
        trace_multifd_send_error(p->id);
        multifd_send_set_error(local_err);
        /* Wake the migration thread in case it's blocked waiting on us */
        multifd_send_kick_main(p);
        error_free(local_err);
    }

    rcu_unregister_thread();
    migration_threads_remove(thread);
    trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages,
                                  p->total_zero_pages);

    return NULL;
}
1023 
1024 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque);
1025 
/* Arguments handed to the TLS handshake worker thread, which frees them */
typedef struct {
    MultiFDSendParams *p;   /* channel this handshake belongs to */
    QIOChannelTLS *tioc;    /* TLS channel performing the handshake */
} MultiFDTLSThreadArgs;
1030 
multifd_tls_handshake_thread(void * opaque)1031 static void *multifd_tls_handshake_thread(void *opaque)
1032 {
1033     MultiFDTLSThreadArgs *args = opaque;
1034 
1035     qio_channel_tls_handshake(args->tioc,
1036                               multifd_new_send_channel_async,
1037                               args->p,
1038                               NULL,
1039                               NULL);
1040     g_free(args);
1041 
1042     return NULL;
1043 }
1044 
/*
 * Wrap @ioc in a TLS channel and start the handshake on a dedicated
 * thread.  On success, true is returned and the handshake result is
 * delivered later via multifd_new_send_channel_async(); on failure
 * *errp is set and false is returned.
 */
static bool multifd_tls_channel_connect(MultiFDSendParams *p,
                                        QIOChannel *ioc,
                                        Error **errp)
{
    MigrationState *s = migrate_get_current();
    const char *hostname = s->hostname;
    MultiFDTLSThreadArgs *args;
    QIOChannelTLS *tioc;

    tioc = migration_tls_client_create(ioc, hostname, errp);
    if (!tioc) {
        return false;
    }

    /*
     * Ownership of the socket channel now transfers to the newly
     * created TLS channel, which has already taken a reference.
     */
    object_unref(OBJECT(ioc));
    trace_multifd_tls_outgoing_handshake_start(ioc, tioc, hostname);
    qio_channel_set_name(QIO_CHANNEL(tioc), "multifd-tls-outgoing");

    /* The handshake thread owns @args and will free it */
    args = g_new0(MultiFDTLSThreadArgs, 1);
    args->tioc = tioc;
    args->p = p;

    p->tls_thread_created = true;
    qemu_thread_create(&p->tls_thread, "multifd-tls-handshake-worker",
                       multifd_tls_handshake_thread, args,
                       QEMU_THREAD_JOINABLE);
    return true;
}
1077 
/*
 * Final step of channel establishment (plain or post-TLS-handshake):
 * disable transmit delay, register the channel for yank, publish it in
 * p->c and spawn the send worker thread.
 */
void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc)
{
    qio_channel_set_delay(ioc, false);

    migration_ioc_register_yank(ioc);
    /* Setup p->c only if the channel is completely setup */
    p->c = ioc;

    p->thread_created = true;
    qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
                       QEMU_THREAD_JOINABLE);
}
1090 
/*
 * When TLS is enabled this function is called once to establish the
 * TLS connection and a second time after the TLS handshake to create
 * the multifd channel. Without TLS it goes straight into the channel
 * creation.
 */
static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
{
    MultiFDSendParams *p = opaque;
    QIOChannel *ioc = QIO_CHANNEL(qio_task_get_source(task));
    Error *local_err = NULL;
    bool ret;

    trace_multifd_new_send_channel_async(p->id);

    if (qio_task_propagate_error(task, &local_err)) {
        ret = false;
        goto out;
    }

    trace_multifd_set_outgoing_channel(ioc, object_get_typename(OBJECT(ioc)),
                                       migrate_get_current()->hostname);

    if (migrate_channel_requires_tls_upgrade(ioc)) {
        ret = multifd_tls_channel_connect(p, ioc, &local_err);
        if (ret) {
            /* Handshake in progress; we will be called back on completion */
            return;
        }
    } else {
        multifd_channel_connect(p, ioc);
        ret = true;
    }

out:
    /*
     * Here we're not interested whether creation succeeded, only that
     * it happened at all.
     */
    multifd_send_channel_created();

    if (ret) {
        return;
    }

    trace_multifd_new_send_channel_async_error(p->id, local_err);
    multifd_send_set_error(local_err);
    /*
     * For error cases (TLS or non-TLS), IO channel is always freed here
     * rather than when cleanup multifd: since p->c is not set, multifd
     * cleanup code doesn't even know its existence.
     */
    object_unref(OBJECT(ioc));
    error_free(local_err);
}
1145 
multifd_new_send_channel_create(gpointer opaque,Error ** errp)1146 static bool multifd_new_send_channel_create(gpointer opaque, Error **errp)
1147 {
1148     if (!multifd_use_packets()) {
1149         return file_send_channel_create(opaque, errp);
1150     }
1151 
1152     socket_send_channel_create(multifd_new_send_channel_async, opaque);
1153     return true;
1154 }
1155 
multifd_send_setup(void)1156 bool multifd_send_setup(void)
1157 {
1158     MigrationState *s = migrate_get_current();
1159     Error *local_err = NULL;
1160     int thread_count, ret = 0;
1161     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1162     bool use_packets = multifd_use_packets();
1163     uint8_t i;
1164 
1165     if (!migrate_multifd()) {
1166         return true;
1167     }
1168 
1169     thread_count = migrate_multifd_channels();
1170     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1171     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1172     multifd_send_state->pages = multifd_pages_init(page_count);
1173     qemu_sem_init(&multifd_send_state->channels_created, 0);
1174     qemu_sem_init(&multifd_send_state->channels_ready, 0);
1175     qatomic_set(&multifd_send_state->exiting, 0);
1176     multifd_send_state->ops = multifd_ops[migrate_multifd_compression()];
1177 
1178     for (i = 0; i < thread_count; i++) {
1179         MultiFDSendParams *p = &multifd_send_state->params[i];
1180 
1181         qemu_sem_init(&p->sem, 0);
1182         qemu_sem_init(&p->sem_sync, 0);
1183         p->id = i;
1184         p->pages = multifd_pages_init(page_count);
1185 
1186         if (use_packets) {
1187             p->packet_len = sizeof(MultiFDPacket_t)
1188                           + sizeof(uint64_t) * page_count;
1189             p->packet = g_malloc0(p->packet_len);
1190             p->packet->magic = cpu_to_be32(MULTIFD_MAGIC);
1191             p->packet->version = cpu_to_be32(MULTIFD_VERSION);
1192         }
1193         p->name = g_strdup_printf("multifdsend_%d", i);
1194         p->page_size = qemu_target_page_size();
1195         p->page_count = page_count;
1196         p->write_flags = 0;
1197 
1198         if (!multifd_new_send_channel_create(p, &local_err)) {
1199             return false;
1200         }
1201     }
1202 
1203     /*
1204      * Wait until channel creation has started for all channels. The
1205      * creation can still fail, but no more channels will be created
1206      * past this point.
1207      */
1208     for (i = 0; i < thread_count; i++) {
1209         qemu_sem_wait(&multifd_send_state->channels_created);
1210     }
1211 
1212     for (i = 0; i < thread_count; i++) {
1213         MultiFDSendParams *p = &multifd_send_state->params[i];
1214 
1215         ret = multifd_send_state->ops->send_setup(p, &local_err);
1216         if (ret) {
1217             break;
1218         }
1219     }
1220 
1221     if (ret) {
1222         migrate_set_error(s, local_err);
1223         error_report_err(local_err);
1224         migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
1225                           MIGRATION_STATUS_FAILED);
1226         return false;
1227     }
1228 
1229     return true;
1230 }
1231 
/*
 * Hand the currently filled MultiFDRecvData buffer to an idle recv
 * channel (used by packet-less/file migration, where the migration
 * thread drives the data flow rather than incoming packets).
 *
 * Polls the channels round-robin, starting after the last channel
 * used, until one with pending_job == false is found, then swaps
 * buffers with it and wakes it up.  Returns false only when multifd
 * is exiting.
 */
bool multifd_recv(void)
{
    int i;
    static int next_recv_channel;
    MultiFDRecvParams *p = NULL;
    MultiFDRecvData *data = multifd_recv_state->data;

    /*
     * next_channel can remain from a previous migration that was
     * using more channels, so ensure it doesn't overflow if the
     * limit is lower now.
     */
    next_recv_channel %= migrate_multifd_channels();
    for (i = next_recv_channel;; i = (i + 1) % migrate_multifd_channels()) {
        if (multifd_recv_should_exit()) {
            return false;
        }

        p = &multifd_recv_state->params[i];

        if (qatomic_read(&p->pending_job) == false) {
            next_recv_channel = (i + 1) % migrate_multifd_channels();
            break;
        }
    }

    /*
     * Order pending_job read before manipulating p->data below. Pairs
     * with qatomic_store_release() at multifd_recv_thread().
     */
    smp_mb_acquire();

    /* Swap: give the channel our full buffer, take back its empty one */
    assert(!p->data->size);
    multifd_recv_state->data = p->data;
    p->data = data;

    /*
     * Order p->data update before setting pending_job. Pairs with
     * qatomic_load_acquire() at multifd_recv_thread().
     */
    qatomic_store_release(&p->pending_job, true);
    qemu_sem_post(&p->sem);

    return true;
}
1277 
/* Return the buffer the migration thread should fill next */
MultiFDRecvData *multifd_get_recv_data(void)
{
    return multifd_recv_state->data;
}
1282 
/*
 * Ask every recv channel thread to quit, optionally recording @err as
 * the migration error and failing the migration.  Idempotent: the
 * exiting flag guarantees only the first caller does the work.  The
 * threads are not joined here; see multifd_recv_cleanup().
 */
static void multifd_recv_terminate_threads(Error *err)
{
    int i;

    trace_multifd_recv_terminate_threads(err != NULL);

    if (qatomic_xchg(&multifd_recv_state->exiting, 1)) {
        return;
    }

    if (err) {
        MigrationState *s = migrate_get_current();
        migrate_set_error(s, err);
        if (s->state == MIGRATION_STATUS_SETUP ||
            s->state == MIGRATION_STATUS_ACTIVE) {
            migrate_set_state(&s->state, s->state,
                              MIGRATION_STATUS_FAILED);
        }
    }

    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        /*
         * The migration thread and channels interact differently
         * depending on the presence of packets.
         */
        if (multifd_use_packets()) {
            /*
             * The channel receives as long as there are packets. When
             * packets end (i.e. MULTIFD_FLAG_SYNC is reached), the
             * channel waits for the migration thread to sync. If the
             * sync never happens, do it here.
             */
            qemu_sem_post(&p->sem_sync);
        } else {
            /*
             * The channel waits for the migration thread to give it
             * work. When the migration thread runs out of work, it
             * releases the channel and waits for any pending work to
             * finish. If we reach here (e.g. due to error) before the
             * work runs out, release the channel.
             */
            qemu_sem_post(&p->sem);
        }

        /*
         * We could arrive here for two reasons:
         *  - normal quit, i.e. everything went fine, just finished
         *  - error quit: We close the channels so the channel threads
         *    finish the qio_channel_read_all_eof()
         */
        if (p->c) {
            qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
        }
    }
}
1340 
multifd_recv_shutdown(void)1341 void multifd_recv_shutdown(void)
1342 {
1343     if (migrate_multifd()) {
1344         multifd_recv_terminate_threads(NULL);
1345     }
1346 }
1347 
/*
 * Release everything owned by one recv channel after its thread has
 * been joined: the IO channel, its synchronization primitives, all
 * per-channel buffers, and finally the compression-specific state.
 */
static void multifd_recv_cleanup_channel(MultiFDRecvParams *p)
{
    migration_ioc_unregister_yank(p->c);
    object_unref(OBJECT(p->c));
    p->c = NULL;
    qemu_mutex_destroy(&p->mutex);
    qemu_sem_destroy(&p->sem_sync);
    qemu_sem_destroy(&p->sem);
    g_free(p->name);
    p->name = NULL;
    p->packet_len = 0;
    g_free(p->packet);
    p->packet = NULL;
    g_free(p->normal);
    p->normal = NULL;
    g_free(p->zero);
    p->zero = NULL;
    multifd_recv_state->ops->recv_cleanup(p);
}
1367 
multifd_recv_cleanup_state(void)1368 static void multifd_recv_cleanup_state(void)
1369 {
1370     qemu_sem_destroy(&multifd_recv_state->sem_sync);
1371     g_free(multifd_recv_state->params);
1372     multifd_recv_state->params = NULL;
1373     g_free(multifd_recv_state->data);
1374     multifd_recv_state->data = NULL;
1375     g_free(multifd_recv_state);
1376     multifd_recv_state = NULL;
1377 }
1378 
/*
 * Tear down the whole receive side: signal the threads to stop, join
 * them, then free per-channel and global state.  Joining must happen
 * before multifd_recv_cleanup_channel() destroys the primitives the
 * threads may still be blocked on.
 */
void multifd_recv_cleanup(void)
{
    int i;

    if (!migrate_multifd()) {
        return;
    }
    multifd_recv_terminate_threads(NULL);
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        if (p->thread_created) {
            qemu_thread_join(&p->thread);
        }
    }
    for (i = 0; i < migrate_multifd_channels(); i++) {
        multifd_recv_cleanup_channel(&multifd_recv_state->params[i]);
    }
    multifd_recv_cleanup_state();
}
1399 
/*
 * Receive side synchronization point, called by the migration thread.
 * Waits until every channel has reached its sync point (the SYNC
 * packet for socket channels, out-of-work for file channels), then —
 * for socket-based migration — releases the channels again for the
 * next iteration.
 */
void multifd_recv_sync_main(void)
{
    int thread_count = migrate_multifd_channels();
    bool file_based = !multifd_use_packets();
    int i;

    if (!migrate_multifd()) {
        return;
    }

    /*
     * File-based channels don't use packets and therefore need to
     * wait for more work. Release them to start the sync.
     */
    if (file_based) {
        for (i = 0; i < thread_count; i++) {
            MultiFDRecvParams *p = &multifd_recv_state->params[i];

            trace_multifd_recv_sync_main_signal(p->id);
            qemu_sem_post(&p->sem);
        }
    }

    /*
     * Initiate the synchronization by waiting for all channels.
     *
     * For socket-based migration this means each channel has received
     * the SYNC packet on the stream.
     *
     * For file-based migration this means each channel is done with
     * the work (pending_job=false).
     */
    for (i = 0; i < thread_count; i++) {
        trace_multifd_recv_sync_main_wait(i);
        qemu_sem_wait(&multifd_recv_state->sem_sync);
    }

    if (file_based) {
        /*
         * For file-based loading is done in one iteration. We're
         * done.
         */
        return;
    }

    /*
     * Sync done. Release the channels for the next iteration.
     */
    for (i = 0; i < thread_count; i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        WITH_QEMU_LOCK_GUARD(&p->mutex) {
            /* Track the highest packet number seen across all channels */
            if (multifd_recv_state->packet_num < p->packet_num) {
                multifd_recv_state->packet_num = p->packet_num;
            }
        }
        trace_multifd_recv_sync_main_signal(p->id);
        qemu_sem_post(&p->sem_sync);
    }
    trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
}
1461 
/*
 * Multifd receive side worker thread, one per channel.
 *
 * Packet-based mode: read packets off the wire until EOF or error,
 * handing MULTIFD_FLAG_SYNC packets off to the migration thread via
 * sem_sync.  Packet-less (file) mode: wait on p->sem for work queued
 * by multifd_recv(), or for a sync/exit request.
 *
 * Returns NULL (the thread exit value is unused).
 */
static void *multifd_recv_thread(void *opaque)
{
    MultiFDRecvParams *p = opaque;
    Error *local_err = NULL;
    bool use_packets = multifd_use_packets();
    int ret;

    trace_multifd_recv_thread_start(p->id);
    rcu_register_thread();

    while (true) {
        uint32_t flags = 0;
        bool has_data = false;
        p->normal_num = 0;

        if (use_packets) {
            if (multifd_recv_should_exit()) {
                break;
            }

            ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
                                           p->packet_len, &local_err);
            if (ret == 0 || ret == -1) {   /* 0: EOF  -1: Error */
                break;
            }

            qemu_mutex_lock(&p->mutex);
            ret = multifd_recv_unfill_packet(p, &local_err);
            if (ret) {
                qemu_mutex_unlock(&p->mutex);
                break;
            }

            flags = p->flags;
            /* recv methods don't know how to handle the SYNC flag */
            p->flags &= ~MULTIFD_FLAG_SYNC;
            has_data = p->normal_num || p->zero_num;
            qemu_mutex_unlock(&p->mutex);
        } else {
            /*
             * No packets, so we need to wait for the vmstate code to
             * give us work.
             */
            qemu_sem_wait(&p->sem);

            if (multifd_recv_should_exit()) {
                break;
            }

            /* pairs with qatomic_store_release() at multifd_recv() */
            if (!qatomic_load_acquire(&p->pending_job)) {
                /*
                 * Migration thread did not send work, this is
                 * equivalent to pending_sync on the sending
                 * side. Post sem_sync to notify we reached this
                 * point.
                 */
                qemu_sem_post(&multifd_recv_state->sem_sync);
                continue;
            }

            has_data = !!p->data->size;
        }

        if (has_data) {
            /* Decompress/place the data via the active recv method */
            ret = multifd_recv_state->ops->recv(p, &local_err);
            if (ret != 0) {
                break;
            }
        }

        if (use_packets) {
            if (flags & MULTIFD_FLAG_SYNC) {
                /* Rendezvous with multifd_recv_sync_main() */
                qemu_sem_post(&multifd_recv_state->sem_sync);
                qemu_sem_wait(&p->sem_sync);
            }
        } else {
            p->total_normal_pages += p->data->size / qemu_target_page_size();
            p->data->size = 0;
            /*
             * Order data->size update before clearing
             * pending_job. Pairs with smp_mb_acquire() at
             * multifd_recv().
             */
            qatomic_store_release(&p->pending_job, false);
        }
    }

    if (local_err) {
        multifd_recv_terminate_threads(local_err);
        error_free(local_err);
    }

    rcu_unregister_thread();
    trace_multifd_recv_thread_end(p->id, p->packets_recved,
                                  p->total_normal_pages,
                                  p->total_zero_pages);

    return NULL;
}
1562 
multifd_recv_setup(Error ** errp)1563 int multifd_recv_setup(Error **errp)
1564 {
1565     int thread_count;
1566     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1567     bool use_packets = multifd_use_packets();
1568     uint8_t i;
1569 
1570     /*
1571      * Return successfully if multiFD recv state is already initialised
1572      * or multiFD is not enabled.
1573      */
1574     if (multifd_recv_state || !migrate_multifd()) {
1575         return 0;
1576     }
1577 
1578     thread_count = migrate_multifd_channels();
1579     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1580     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1581 
1582     multifd_recv_state->data = g_new0(MultiFDRecvData, 1);
1583     multifd_recv_state->data->size = 0;
1584 
1585     qatomic_set(&multifd_recv_state->count, 0);
1586     qatomic_set(&multifd_recv_state->exiting, 0);
1587     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1588     multifd_recv_state->ops = multifd_ops[migrate_multifd_compression()];
1589 
1590     for (i = 0; i < thread_count; i++) {
1591         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1592 
1593         qemu_mutex_init(&p->mutex);
1594         qemu_sem_init(&p->sem_sync, 0);
1595         qemu_sem_init(&p->sem, 0);
1596         p->pending_job = false;
1597         p->id = i;
1598 
1599         p->data = g_new0(MultiFDRecvData, 1);
1600         p->data->size = 0;
1601 
1602         if (use_packets) {
1603             p->packet_len = sizeof(MultiFDPacket_t)
1604                 + sizeof(uint64_t) * page_count;
1605             p->packet = g_malloc0(p->packet_len);
1606         }
1607         p->name = g_strdup_printf("multifdrecv_%d", i);
1608         p->normal = g_new0(ram_addr_t, page_count);
1609         p->zero = g_new0(ram_addr_t, page_count);
1610         p->page_count = page_count;
1611         p->page_size = qemu_target_page_size();
1612     }
1613 
1614     for (i = 0; i < thread_count; i++) {
1615         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1616         int ret;
1617 
1618         ret = multifd_recv_state->ops->recv_setup(p, errp);
1619         if (ret) {
1620             return ret;
1621         }
1622     }
1623     return 0;
1624 }
1625 
multifd_recv_all_channels_created(void)1626 bool multifd_recv_all_channels_created(void)
1627 {
1628     int thread_count = migrate_multifd_channels();
1629 
1630     if (!migrate_multifd()) {
1631         return true;
1632     }
1633 
1634     if (!multifd_recv_state) {
1635         /* Called before any connections created */
1636         return false;
1637     }
1638 
1639     return thread_count == qatomic_read(&multifd_recv_state->count);
1640 }
1641 
/*
 * Try to receive all multifd channels to get ready for the migration.
 * Sets @errp when failing to receive the current channel.
 */
void multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
{
    MultiFDRecvParams *p;
    Error *local_err = NULL;
    bool use_packets = multifd_use_packets();
    int id;

    if (use_packets) {
        /* The initial packet tells us which channel id this connection is */
        id = multifd_recv_initial_packet(ioc, &local_err);
        if (id < 0) {
            multifd_recv_terminate_threads(local_err);
            error_propagate_prepend(errp, local_err,
                                    "failed to receive packet"
                                    " via multifd channel %d: ",
                                    qatomic_read(&multifd_recv_state->count));
            return;
        }
        trace_multifd_recv_new_channel(id);
    } else {
        /* Without packets, channels are assigned in arrival order */
        id = qatomic_read(&multifd_recv_state->count);
    }

    p = &multifd_recv_state->params[id];
    if (p->c != NULL) {
        error_setg(&local_err, "multifd: received id '%d' already setup'",
                   id);
        multifd_recv_terminate_threads(local_err);
        error_propagate(errp, local_err);
        return;
    }
    /* Take our own reference; released in multifd_recv_cleanup_channel() */
    p->c = ioc;
    object_ref(OBJECT(ioc));

    p->thread_created = true;
    qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
                       QEMU_THREAD_JOINABLE);
    qatomic_inc(&multifd_recv_state->count);
}
1684 
multifd_send_prepare_common(MultiFDSendParams * p)1685 bool multifd_send_prepare_common(MultiFDSendParams *p)
1686 {
1687     multifd_send_zero_page_detect(p);
1688 
1689     if (!p->pages->normal_num) {
1690         p->next_packet_size = 0;
1691         return false;
1692     }
1693 
1694     multifd_send_prepare_header(p);
1695 
1696     return true;
1697 }
1698