xref: /qemu/hw/vfio/migration.c (revision 73b49878)
1 /*
2  * Migration support for VFIO devices
3  *
4  * Copyright NVIDIA, Inc. 2020
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2. See
7  * the COPYING file in the top-level directory.
8  */
9 
10 #include "qemu/osdep.h"
11 #include "qemu/main-loop.h"
12 #include "qemu/cutils.h"
13 #include "qemu/units.h"
14 #include "qemu/error-report.h"
15 #include <linux/vfio.h>
16 #include <sys/ioctl.h>
17 
18 #include "sysemu/runstate.h"
19 #include "hw/vfio/vfio-common.h"
20 #include "migration/migration.h"
21 #include "migration/options.h"
22 #include "migration/savevm.h"
23 #include "migration/vmstate.h"
24 #include "migration/qemu-file.h"
25 #include "migration/register.h"
26 #include "migration/blocker.h"
27 #include "migration/misc.h"
28 #include "qapi/error.h"
29 #include "exec/ramlist.h"
30 #include "exec/ram_addr.h"
31 #include "pci.h"
32 #include "trace.h"
33 #include "hw/hw.h"
34 
35 /*
36  * Flags to be used as unique delimiters for VFIO devices in the migration
37  * stream. These flags are composed as:
38  * 0xffffffff => MSB 32-bit all 1s
39  * 0xef10     => Magic ID, represents emulated (virtual) function IO
40  * 0x0000     => 16-bits reserved for flags
41  *
42  * The beginning of state information is marked by _DEV_CONFIG_STATE,
43  * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a
44  * certain state information is marked by _END_OF_STATE.
45  */
/* Marks the end of a state section in the stream. */
#define VFIO_MIG_FLAG_END_OF_STATE      (0xffffffffef100001ULL)
/* Written first by vfio_save_device_config_state(). */
#define VFIO_MIG_FLAG_DEV_CONFIG_STATE  (0xffffffffef100002ULL)
/* Written first by vfio_save_setup(). */
#define VFIO_MIG_FLAG_DEV_SETUP_STATE   (0xffffffffef100003ULL)
/* Followed by a be64 byte count and that many bytes of device data. */
#define VFIO_MIG_FLAG_DEV_DATA_STATE    (0xffffffffef100004ULL)
/*
 * Written by vfio_save_iterate() in place of _END_OF_STATE, once, when
 * switchover-ack is enabled and precopy_init_size has dropped to zero.
 */
#define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL)
51 
52 /*
53  * This is an arbitrary size based on migration of mlx5 devices, where typically
54  * total device migration size is on the order of 100s of MB. Testing with
55  * larger values, e.g. 128MB and 1GB, did not show a performance improvement.
56  */
57 #define VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE (1 * MiB)
58 
/*
 * Running total of device data bytes put on the stream by vfio_save_block().
 * Read through vfio_mig_bytes_transferred() and zeroed by
 * vfio_reset_bytes_transferred().
 */
static int64_t bytes_transferred;
60 
61 static const char *mig_state_to_str(enum vfio_device_mig_state state)
62 {
63     switch (state) {
64     case VFIO_DEVICE_STATE_ERROR:
65         return "ERROR";
66     case VFIO_DEVICE_STATE_STOP:
67         return "STOP";
68     case VFIO_DEVICE_STATE_RUNNING:
69         return "RUNNING";
70     case VFIO_DEVICE_STATE_STOP_COPY:
71         return "STOP_COPY";
72     case VFIO_DEVICE_STATE_RESUMING:
73         return "RESUMING";
74     case VFIO_DEVICE_STATE_RUNNING_P2P:
75         return "RUNNING_P2P";
76     case VFIO_DEVICE_STATE_PRE_COPY:
77         return "PRE_COPY";
78     case VFIO_DEVICE_STATE_PRE_COPY_P2P:
79         return "PRE_COPY_P2P";
80     default:
81         return "UNKNOWN STATE";
82     }
83 }
84 
85 static int vfio_migration_set_state(VFIODevice *vbasedev,
86                                     enum vfio_device_mig_state new_state,
87                                     enum vfio_device_mig_state recover_state)
88 {
89     VFIOMigration *migration = vbasedev->migration;
90     uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
91                               sizeof(struct vfio_device_feature_mig_state),
92                               sizeof(uint64_t))] = {};
93     struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
94     struct vfio_device_feature_mig_state *mig_state =
95         (struct vfio_device_feature_mig_state *)feature->data;
96     int ret;
97 
98     feature->argsz = sizeof(buf);
99     feature->flags =
100         VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
101     mig_state->device_state = new_state;
102     if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
103         /* Try to set the device in some good state */
104         ret = -errno;
105 
106         if (recover_state == VFIO_DEVICE_STATE_ERROR) {
107             error_report("%s: Failed setting device state to %s, err: %s. "
108                          "Recover state is ERROR. Resetting device",
109                          vbasedev->name, mig_state_to_str(new_state),
110                          strerror(errno));
111 
112             goto reset_device;
113         }
114 
115         error_report(
116             "%s: Failed setting device state to %s, err: %s. Setting device in recover state %s",
117                      vbasedev->name, mig_state_to_str(new_state),
118                      strerror(errno), mig_state_to_str(recover_state));
119 
120         mig_state->device_state = recover_state;
121         if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
122             ret = -errno;
123             error_report(
124                 "%s: Failed setting device in recover state, err: %s. Resetting device",
125                          vbasedev->name, strerror(errno));
126 
127             goto reset_device;
128         }
129 
130         migration->device_state = recover_state;
131 
132         return ret;
133     }
134 
135     migration->device_state = new_state;
136     if (mig_state->data_fd != -1) {
137         if (migration->data_fd != -1) {
138             /*
139              * This can happen if the device is asynchronously reset and
140              * terminates a data transfer.
141              */
142             error_report("%s: data_fd out of sync", vbasedev->name);
143             close(mig_state->data_fd);
144 
145             return -EBADF;
146         }
147 
148         migration->data_fd = mig_state->data_fd;
149     }
150 
151     trace_vfio_migration_set_state(vbasedev->name, mig_state_to_str(new_state));
152 
153     return 0;
154 
155 reset_device:
156     if (ioctl(vbasedev->fd, VFIO_DEVICE_RESET)) {
157         hw_error("%s: Failed resetting device, err: %s", vbasedev->name,
158                  strerror(errno));
159     }
160 
161     migration->device_state = VFIO_DEVICE_STATE_RUNNING;
162 
163     return ret;
164 }
165 
166 /*
167  * Some device state transitions require resetting the device if they fail.
168  * This function sets the device in new_state and resets the device if that
169  * fails. Reset is done by using ERROR as the recover state.
170  */
171 static int
172 vfio_migration_set_state_or_reset(VFIODevice *vbasedev,
173                                   enum vfio_device_mig_state new_state)
174 {
175     return vfio_migration_set_state(vbasedev, new_state,
176                                     VFIO_DEVICE_STATE_ERROR);
177 }
178 
179 static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev,
180                             uint64_t data_size)
181 {
182     VFIOMigration *migration = vbasedev->migration;
183     int ret;
184 
185     ret = qemu_file_get_to_fd(f, migration->data_fd, data_size);
186     trace_vfio_load_state_device_data(vbasedev->name, data_size, ret);
187 
188     return ret;
189 }
190 
191 static int vfio_save_device_config_state(QEMUFile *f, void *opaque)
192 {
193     VFIODevice *vbasedev = opaque;
194 
195     qemu_put_be64(f, VFIO_MIG_FLAG_DEV_CONFIG_STATE);
196 
197     if (vbasedev->ops && vbasedev->ops->vfio_save_config) {
198         vbasedev->ops->vfio_save_config(vbasedev, f);
199     }
200 
201     qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
202 
203     trace_vfio_save_device_config_state(vbasedev->name);
204 
205     return qemu_file_get_error(f);
206 }
207 
208 static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
209 {
210     VFIODevice *vbasedev = opaque;
211     uint64_t data;
212 
213     if (vbasedev->ops && vbasedev->ops->vfio_load_config) {
214         int ret;
215 
216         ret = vbasedev->ops->vfio_load_config(vbasedev, f);
217         if (ret) {
218             error_report("%s: Failed to load device config space",
219                          vbasedev->name);
220             return ret;
221         }
222     }
223 
224     data = qemu_get_be64(f);
225     if (data != VFIO_MIG_FLAG_END_OF_STATE) {
226         error_report("%s: Failed loading device config space, "
227                      "end flag incorrect 0x%"PRIx64, vbasedev->name, data);
228         return -EINVAL;
229     }
230 
231     trace_vfio_load_device_config_state(vbasedev->name);
232     return qemu_file_get_error(f);
233 }
234 
235 static void vfio_migration_cleanup(VFIODevice *vbasedev)
236 {
237     VFIOMigration *migration = vbasedev->migration;
238 
239     close(migration->data_fd);
240     migration->data_fd = -1;
241 }
242 
243 static int vfio_query_stop_copy_size(VFIODevice *vbasedev,
244                                      uint64_t *stop_copy_size)
245 {
246     uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
247                               sizeof(struct vfio_device_feature_mig_data_size),
248                               sizeof(uint64_t))] = {};
249     struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
250     struct vfio_device_feature_mig_data_size *mig_data_size =
251         (struct vfio_device_feature_mig_data_size *)feature->data;
252 
253     feature->argsz = sizeof(buf);
254     feature->flags =
255         VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIG_DATA_SIZE;
256 
257     if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
258         return -errno;
259     }
260 
261     *stop_copy_size = mig_data_size->stop_copy_length;
262 
263     return 0;
264 }
265 
266 static int vfio_query_precopy_size(VFIOMigration *migration)
267 {
268     struct vfio_precopy_info precopy = {
269         .argsz = sizeof(precopy),
270     };
271 
272     migration->precopy_init_size = 0;
273     migration->precopy_dirty_size = 0;
274 
275     if (ioctl(migration->data_fd, VFIO_MIG_GET_PRECOPY_INFO, &precopy)) {
276         return -errno;
277     }
278 
279     migration->precopy_init_size = precopy.initial_bytes;
280     migration->precopy_dirty_size = precopy.dirty_bytes;
281 
282     return 0;
283 }
284 
285 /* Returns the size of saved data on success and -errno on error */
286 static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration)
287 {
288     ssize_t data_size;
289 
290     data_size = read(migration->data_fd, migration->data_buffer,
291                      migration->data_buffer_size);
292     if (data_size < 0) {
293         /*
294          * Pre-copy emptied all the device state for now. For more information,
295          * please refer to the Linux kernel VFIO uAPI.
296          */
297         if (errno == ENOMSG) {
298             return 0;
299         }
300 
301         return -errno;
302     }
303     if (data_size == 0) {
304         return 0;
305     }
306 
307     qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
308     qemu_put_be64(f, data_size);
309     qemu_put_buffer(f, migration->data_buffer, data_size);
310     bytes_transferred += data_size;
311 
312     trace_vfio_save_block(migration->vbasedev->name, data_size);
313 
314     return qemu_file_get_error(f) ?: data_size;
315 }
316 
317 static void vfio_update_estimated_pending_data(VFIOMigration *migration,
318                                                uint64_t data_size)
319 {
320     if (!data_size) {
321         /*
322          * Pre-copy emptied all the device state for now, update estimated sizes
323          * accordingly.
324          */
325         migration->precopy_init_size = 0;
326         migration->precopy_dirty_size = 0;
327 
328         return;
329     }
330 
331     if (migration->precopy_init_size) {
332         uint64_t init_size = MIN(migration->precopy_init_size, data_size);
333 
334         migration->precopy_init_size -= init_size;
335         data_size -= init_size;
336     }
337 
338     migration->precopy_dirty_size -= MIN(migration->precopy_dirty_size,
339                                          data_size);
340 }
341 
342 static bool vfio_precopy_supported(VFIODevice *vbasedev)
343 {
344     VFIOMigration *migration = vbasedev->migration;
345 
346     return migration->mig_flags & VFIO_MIGRATION_PRE_COPY;
347 }
348 
349 /* ---------------------------------------------------------------------- */
350 
351 static int vfio_save_prepare(void *opaque, Error **errp)
352 {
353     VFIODevice *vbasedev = opaque;
354 
355     /*
356      * Snapshot doesn't use postcopy nor background snapshot, so allow snapshot
357      * even if they are on.
358      */
359     if (runstate_check(RUN_STATE_SAVE_VM)) {
360         return 0;
361     }
362 
363     if (migrate_postcopy_ram()) {
364         error_setg(
365             errp, "%s: VFIO migration is not supported with postcopy migration",
366             vbasedev->name);
367         return -EOPNOTSUPP;
368     }
369 
370     if (migrate_background_snapshot()) {
371         error_setg(
372             errp,
373             "%s: VFIO migration is not supported with background snapshot",
374             vbasedev->name);
375         return -EOPNOTSUPP;
376     }
377 
378     return 0;
379 }
380 
381 static int vfio_save_setup(QEMUFile *f, void *opaque)
382 {
383     VFIODevice *vbasedev = opaque;
384     VFIOMigration *migration = vbasedev->migration;
385     uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE;
386 
387     qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE);
388 
389     vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
390     migration->data_buffer_size = MIN(VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE,
391                                       stop_copy_size);
392     migration->data_buffer = g_try_malloc0(migration->data_buffer_size);
393     if (!migration->data_buffer) {
394         error_report("%s: Failed to allocate migration data buffer",
395                      vbasedev->name);
396         return -ENOMEM;
397     }
398 
399     if (vfio_precopy_supported(vbasedev)) {
400         int ret;
401 
402         switch (migration->device_state) {
403         case VFIO_DEVICE_STATE_RUNNING:
404             ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_PRE_COPY,
405                                            VFIO_DEVICE_STATE_RUNNING);
406             if (ret) {
407                 return ret;
408             }
409 
410             vfio_query_precopy_size(migration);
411 
412             break;
413         case VFIO_DEVICE_STATE_STOP:
414             /* vfio_save_complete_precopy() will go to STOP_COPY */
415             break;
416         default:
417             return -EINVAL;
418         }
419     }
420 
421     trace_vfio_save_setup(vbasedev->name, migration->data_buffer_size);
422 
423     qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
424 
425     return qemu_file_get_error(f);
426 }
427 
428 static void vfio_save_cleanup(void *opaque)
429 {
430     VFIODevice *vbasedev = opaque;
431     VFIOMigration *migration = vbasedev->migration;
432 
433     /*
434      * Changing device state from STOP_COPY to STOP can take time. Do it here,
435      * after migration has completed, so it won't increase downtime.
436      */
437     if (migration->device_state == VFIO_DEVICE_STATE_STOP_COPY) {
438         vfio_migration_set_state_or_reset(vbasedev, VFIO_DEVICE_STATE_STOP);
439     }
440 
441     g_free(migration->data_buffer);
442     migration->data_buffer = NULL;
443     migration->precopy_init_size = 0;
444     migration->precopy_dirty_size = 0;
445     migration->initial_data_sent = false;
446     vfio_migration_cleanup(vbasedev);
447     trace_vfio_save_cleanup(vbasedev->name);
448 }
449 
450 static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy,
451                                         uint64_t *can_postcopy)
452 {
453     VFIODevice *vbasedev = opaque;
454     VFIOMigration *migration = vbasedev->migration;
455 
456     if (!vfio_device_state_is_precopy(vbasedev)) {
457         return;
458     }
459 
460     *must_precopy +=
461         migration->precopy_init_size + migration->precopy_dirty_size;
462 
463     trace_vfio_state_pending_estimate(vbasedev->name, *must_precopy,
464                                       *can_postcopy,
465                                       migration->precopy_init_size,
466                                       migration->precopy_dirty_size);
467 }
468 
469 /*
470  * Migration size of VFIO devices can be as little as a few KBs or as big as
471  * many GBs. This value should be big enough to cover the worst case.
472  */
473 #define VFIO_MIG_STOP_COPY_SIZE (100 * GiB)
474 
475 static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy,
476                                      uint64_t *can_postcopy)
477 {
478     VFIODevice *vbasedev = opaque;
479     VFIOMigration *migration = vbasedev->migration;
480     uint64_t stop_copy_size = VFIO_MIG_STOP_COPY_SIZE;
481 
482     /*
483      * If getting pending migration size fails, VFIO_MIG_STOP_COPY_SIZE is
484      * reported so downtime limit won't be violated.
485      */
486     vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
487     *must_precopy += stop_copy_size;
488 
489     if (vfio_device_state_is_precopy(vbasedev)) {
490         vfio_query_precopy_size(migration);
491 
492         *must_precopy +=
493             migration->precopy_init_size + migration->precopy_dirty_size;
494     }
495 
496     trace_vfio_state_pending_exact(vbasedev->name, *must_precopy, *can_postcopy,
497                                    stop_copy_size, migration->precopy_init_size,
498                                    migration->precopy_dirty_size);
499 }
500 
501 static bool vfio_is_active_iterate(void *opaque)
502 {
503     VFIODevice *vbasedev = opaque;
504 
505     return vfio_device_state_is_precopy(vbasedev);
506 }
507 
508 static int vfio_save_iterate(QEMUFile *f, void *opaque)
509 {
510     VFIODevice *vbasedev = opaque;
511     VFIOMigration *migration = vbasedev->migration;
512     ssize_t data_size;
513 
514     data_size = vfio_save_block(f, migration);
515     if (data_size < 0) {
516         return data_size;
517     }
518 
519     vfio_update_estimated_pending_data(migration, data_size);
520 
521     if (migrate_switchover_ack() && !migration->precopy_init_size &&
522         !migration->initial_data_sent) {
523         qemu_put_be64(f, VFIO_MIG_FLAG_DEV_INIT_DATA_SENT);
524         migration->initial_data_sent = true;
525     } else {
526         qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
527     }
528 
529     trace_vfio_save_iterate(vbasedev->name, migration->precopy_init_size,
530                             migration->precopy_dirty_size);
531 
532     /*
533      * A VFIO device's pre-copy dirty_bytes is not guaranteed to reach zero.
534      * Return 1 so following handlers will not be potentially blocked.
535      */
536     return 1;
537 }
538 
539 static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
540 {
541     VFIODevice *vbasedev = opaque;
542     ssize_t data_size;
543     int ret;
544 
545     /* We reach here with device state STOP or STOP_COPY only */
546     ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
547                                    VFIO_DEVICE_STATE_STOP);
548     if (ret) {
549         return ret;
550     }
551 
552     do {
553         data_size = vfio_save_block(f, vbasedev->migration);
554         if (data_size < 0) {
555             return data_size;
556         }
557     } while (data_size);
558 
559     qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
560     ret = qemu_file_get_error(f);
561     if (ret) {
562         return ret;
563     }
564 
565     trace_vfio_save_complete_precopy(vbasedev->name, ret);
566 
567     return ret;
568 }
569 
570 static void vfio_save_state(QEMUFile *f, void *opaque)
571 {
572     VFIODevice *vbasedev = opaque;
573     int ret;
574 
575     ret = vfio_save_device_config_state(f, opaque);
576     if (ret) {
577         error_report("%s: Failed to save device config space",
578                      vbasedev->name);
579         qemu_file_set_error(f, ret);
580     }
581 }
582 
583 static int vfio_load_setup(QEMUFile *f, void *opaque)
584 {
585     VFIODevice *vbasedev = opaque;
586 
587     return vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING,
588                                    vbasedev->migration->device_state);
589 }
590 
591 static int vfio_load_cleanup(void *opaque)
592 {
593     VFIODevice *vbasedev = opaque;
594 
595     vfio_migration_cleanup(vbasedev);
596     trace_vfio_load_cleanup(vbasedev->name);
597 
598     return 0;
599 }
600 
601 static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
602 {
603     VFIODevice *vbasedev = opaque;
604     int ret = 0;
605     uint64_t data;
606 
607     data = qemu_get_be64(f);
608     while (data != VFIO_MIG_FLAG_END_OF_STATE) {
609 
610         trace_vfio_load_state(vbasedev->name, data);
611 
612         switch (data) {
613         case VFIO_MIG_FLAG_DEV_CONFIG_STATE:
614         {
615             return vfio_load_device_config_state(f, opaque);
616         }
617         case VFIO_MIG_FLAG_DEV_SETUP_STATE:
618         {
619             data = qemu_get_be64(f);
620             if (data == VFIO_MIG_FLAG_END_OF_STATE) {
621                 return ret;
622             } else {
623                 error_report("%s: SETUP STATE: EOS not found 0x%"PRIx64,
624                              vbasedev->name, data);
625                 return -EINVAL;
626             }
627             break;
628         }
629         case VFIO_MIG_FLAG_DEV_DATA_STATE:
630         {
631             uint64_t data_size = qemu_get_be64(f);
632 
633             if (data_size) {
634                 ret = vfio_load_buffer(f, vbasedev, data_size);
635                 if (ret < 0) {
636                     return ret;
637                 }
638             }
639             break;
640         }
641         case VFIO_MIG_FLAG_DEV_INIT_DATA_SENT:
642         {
643             if (!vfio_precopy_supported(vbasedev) ||
644                 !migrate_switchover_ack()) {
645                 error_report("%s: Received INIT_DATA_SENT but switchover ack "
646                              "is not used", vbasedev->name);
647                 return -EINVAL;
648             }
649 
650             ret = qemu_loadvm_approve_switchover();
651             if (ret) {
652                 error_report(
653                     "%s: qemu_loadvm_approve_switchover failed, err=%d (%s)",
654                     vbasedev->name, ret, strerror(-ret));
655             }
656 
657             return ret;
658         }
659         default:
660             error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data);
661             return -EINVAL;
662         }
663 
664         data = qemu_get_be64(f);
665         ret = qemu_file_get_error(f);
666         if (ret) {
667             return ret;
668         }
669     }
670     return ret;
671 }
672 
673 static bool vfio_switchover_ack_needed(void *opaque)
674 {
675     VFIODevice *vbasedev = opaque;
676 
677     return vfio_precopy_supported(vbasedev);
678 }
679 
680 static const SaveVMHandlers savevm_vfio_handlers = {
681     .save_prepare = vfio_save_prepare,
682     .save_setup = vfio_save_setup,
683     .save_cleanup = vfio_save_cleanup,
684     .state_pending_estimate = vfio_state_pending_estimate,
685     .state_pending_exact = vfio_state_pending_exact,
686     .is_active_iterate = vfio_is_active_iterate,
687     .save_live_iterate = vfio_save_iterate,
688     .save_live_complete_precopy = vfio_save_complete_precopy,
689     .save_state = vfio_save_state,
690     .load_setup = vfio_load_setup,
691     .load_cleanup = vfio_load_cleanup,
692     .load_state = vfio_load_state,
693     .switchover_ack_needed = vfio_switchover_ack_needed,
694 };
695 
696 /* ---------------------------------------------------------------------- */
697 
698 static void vfio_vmstate_change_prepare(void *opaque, bool running,
699                                         RunState state)
700 {
701     VFIODevice *vbasedev = opaque;
702     VFIOMigration *migration = vbasedev->migration;
703     enum vfio_device_mig_state new_state;
704     int ret;
705 
706     new_state = migration->device_state == VFIO_DEVICE_STATE_PRE_COPY ?
707                     VFIO_DEVICE_STATE_PRE_COPY_P2P :
708                     VFIO_DEVICE_STATE_RUNNING_P2P;
709 
710     ret = vfio_migration_set_state_or_reset(vbasedev, new_state);
711     if (ret) {
712         /*
713          * Migration should be aborted in this case, but vm_state_notify()
714          * currently does not support reporting failures.
715          */
716         if (migrate_get_current()->to_dst_file) {
717             qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
718         }
719     }
720 
721     trace_vfio_vmstate_change_prepare(vbasedev->name, running,
722                                       RunState_str(state),
723                                       mig_state_to_str(new_state));
724 }
725 
726 static void vfio_vmstate_change(void *opaque, bool running, RunState state)
727 {
728     VFIODevice *vbasedev = opaque;
729     enum vfio_device_mig_state new_state;
730     int ret;
731 
732     if (running) {
733         new_state = VFIO_DEVICE_STATE_RUNNING;
734     } else {
735         new_state =
736             (vfio_device_state_is_precopy(vbasedev) &&
737              (state == RUN_STATE_FINISH_MIGRATE || state == RUN_STATE_PAUSED)) ?
738                 VFIO_DEVICE_STATE_STOP_COPY :
739                 VFIO_DEVICE_STATE_STOP;
740     }
741 
742     ret = vfio_migration_set_state_or_reset(vbasedev, new_state);
743     if (ret) {
744         /*
745          * Migration should be aborted in this case, but vm_state_notify()
746          * currently does not support reporting failures.
747          */
748         if (migrate_get_current()->to_dst_file) {
749             qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
750         }
751     }
752 
753     trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
754                               mig_state_to_str(new_state));
755 }
756 
757 static void vfio_migration_state_notifier(Notifier *notifier, void *data)
758 {
759     MigrationState *s = data;
760     VFIOMigration *migration = container_of(notifier, VFIOMigration,
761                                             migration_state);
762     VFIODevice *vbasedev = migration->vbasedev;
763 
764     trace_vfio_migration_state_notifier(vbasedev->name,
765                                         MigrationStatus_str(s->state));
766 
767     switch (s->state) {
768     case MIGRATION_STATUS_CANCELLING:
769     case MIGRATION_STATUS_CANCELLED:
770     case MIGRATION_STATUS_FAILED:
771         vfio_migration_set_state_or_reset(vbasedev, VFIO_DEVICE_STATE_RUNNING);
772     }
773 }
774 
775 static void vfio_migration_free(VFIODevice *vbasedev)
776 {
777     g_free(vbasedev->migration);
778     vbasedev->migration = NULL;
779 }
780 
781 static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags)
782 {
783     uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
784                                   sizeof(struct vfio_device_feature_migration),
785                               sizeof(uint64_t))] = {};
786     struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
787     struct vfio_device_feature_migration *mig =
788         (struct vfio_device_feature_migration *)feature->data;
789 
790     feature->argsz = sizeof(buf);
791     feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION;
792     if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
793         return -errno;
794     }
795 
796     *mig_flags = mig->flags;
797 
798     return 0;
799 }
800 
801 static bool vfio_dma_logging_supported(VFIODevice *vbasedev)
802 {
803     uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
804                               sizeof(uint64_t))] = {};
805     struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
806 
807     feature->argsz = sizeof(buf);
808     feature->flags = VFIO_DEVICE_FEATURE_PROBE |
809                      VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
810 
811     return !ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
812 }
813 
814 static int vfio_migration_init(VFIODevice *vbasedev)
815 {
816     int ret;
817     Object *obj;
818     VFIOMigration *migration;
819     char id[256] = "";
820     g_autofree char *path = NULL, *oid = NULL;
821     uint64_t mig_flags = 0;
822     VMChangeStateHandler *prepare_cb;
823 
824     if (!vbasedev->ops->vfio_get_object) {
825         return -EINVAL;
826     }
827 
828     obj = vbasedev->ops->vfio_get_object(vbasedev);
829     if (!obj) {
830         return -EINVAL;
831     }
832 
833     ret = vfio_migration_query_flags(vbasedev, &mig_flags);
834     if (ret) {
835         return ret;
836     }
837 
838     /* Basic migration functionality must be supported */
839     if (!(mig_flags & VFIO_MIGRATION_STOP_COPY)) {
840         return -EOPNOTSUPP;
841     }
842 
843     vbasedev->migration = g_new0(VFIOMigration, 1);
844     migration = vbasedev->migration;
845     migration->vbasedev = vbasedev;
846     migration->device_state = VFIO_DEVICE_STATE_RUNNING;
847     migration->data_fd = -1;
848     migration->mig_flags = mig_flags;
849 
850     vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev);
851 
852     oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj)));
853     if (oid) {
854         path = g_strdup_printf("%s/vfio", oid);
855     } else {
856         path = g_strdup("vfio");
857     }
858     strpadcpy(id, sizeof(id), path, '\0');
859 
860     register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1, &savevm_vfio_handlers,
861                          vbasedev);
862 
863     prepare_cb = migration->mig_flags & VFIO_MIGRATION_P2P ?
864                      vfio_vmstate_change_prepare :
865                      NULL;
866     migration->vm_state = qdev_add_vm_change_state_handler_full(
867         vbasedev->dev, vfio_vmstate_change, prepare_cb, vbasedev);
868     migration_add_notifier(&migration->migration_state,
869                            vfio_migration_state_notifier);
870 
871     return 0;
872 }
873 
874 static void vfio_migration_deinit(VFIODevice *vbasedev)
875 {
876     VFIOMigration *migration = vbasedev->migration;
877 
878     migration_remove_notifier(&migration->migration_state);
879     qemu_del_vm_change_state_handler(migration->vm_state);
880     unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev);
881     vfio_migration_free(vbasedev);
882     vfio_unblock_multiple_devices_migration();
883 }
884 
885 static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp)
886 {
887     if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
888         error_propagate(errp, err);
889         return -EINVAL;
890     }
891 
892     vbasedev->migration_blocker = error_copy(err);
893     error_free(err);
894 
895     return migrate_add_blocker(&vbasedev->migration_blocker, errp);
896 }
897 
898 /* ---------------------------------------------------------------------- */
899 
900 int64_t vfio_mig_bytes_transferred(void)
901 {
902     return bytes_transferred;
903 }
904 
905 void vfio_reset_bytes_transferred(void)
906 {
907     bytes_transferred = 0;
908 }
909 
910 /*
911  * Return true when either migration initialized or blocker registered.
912  * Currently only return false when adding blocker fails which will
913  * de-register vfio device.
914  */
915 bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
916 {
917     Error *err = NULL;
918     int ret;
919 
920     if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) {
921         error_setg(&err, "%s: Migration is disabled for VFIO device",
922                    vbasedev->name);
923         return !vfio_block_migration(vbasedev, err, errp);
924     }
925 
926     ret = vfio_migration_init(vbasedev);
927     if (ret) {
928         if (ret == -ENOTTY) {
929             error_setg(&err, "%s: VFIO migration is not supported in kernel",
930                        vbasedev->name);
931         } else {
932             error_setg(&err,
933                        "%s: Migration couldn't be initialized for VFIO device, "
934                        "err: %d (%s)",
935                        vbasedev->name, ret, strerror(-ret));
936         }
937 
938         return !vfio_block_migration(vbasedev, err, errp);
939     }
940 
941     if (!vbasedev->dirty_pages_supported) {
942         if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) {
943             error_setg(&err,
944                        "%s: VFIO device doesn't support device dirty tracking",
945                        vbasedev->name);
946             goto add_blocker;
947         }
948 
949         warn_report("%s: VFIO device doesn't support device dirty tracking",
950                     vbasedev->name);
951     }
952 
953     ret = vfio_block_multiple_devices_migration(vbasedev, errp);
954     if (ret) {
955         goto out_deinit;
956     }
957 
958     if (vfio_viommu_preset(vbasedev)) {
959         error_setg(&err, "%s: Migration is currently not supported "
960                    "with vIOMMU enabled", vbasedev->name);
961         goto add_blocker;
962     }
963 
964     trace_vfio_migration_realize(vbasedev->name);
965     return true;
966 
967 add_blocker:
968     ret = vfio_block_migration(vbasedev, err, errp);
969 out_deinit:
970     if (ret) {
971         vfio_migration_deinit(vbasedev);
972     }
973     return !ret;
974 }
975 
976 void vfio_migration_exit(VFIODevice *vbasedev)
977 {
978     if (vbasedev->migration) {
979         vfio_migration_deinit(vbasedev);
980     }
981 
982     migrate_del_blocker(&vbasedev->migration_blocker);
983 }
984