1 /* 2 * Migration support for VFIO devices 3 * 4 * Copyright NVIDIA, Inc. 2020 5 * 6 * This work is licensed under the terms of the GNU GPL, version 2. See 7 * the COPYING file in the top-level directory. 8 */ 9 10 #include "qemu/osdep.h" 11 #include "qemu/main-loop.h" 12 #include "qemu/cutils.h" 13 #include "qemu/units.h" 14 #include "qemu/error-report.h" 15 #include <linux/vfio.h> 16 #include <sys/ioctl.h> 17 18 #include "sysemu/runstate.h" 19 #include "hw/vfio/vfio-common.h" 20 #include "migration/migration.h" 21 #include "migration/options.h" 22 #include "migration/savevm.h" 23 #include "migration/vmstate.h" 24 #include "migration/qemu-file.h" 25 #include "migration/register.h" 26 #include "migration/blocker.h" 27 #include "migration/misc.h" 28 #include "qapi/error.h" 29 #include "exec/ramlist.h" 30 #include "exec/ram_addr.h" 31 #include "pci.h" 32 #include "trace.h" 33 #include "hw/hw.h" 34 35 /* 36 * Flags to be used as unique delimiters for VFIO devices in the migration 37 * stream. These flags are composed as: 38 * 0xffffffff => MSB 32-bit all 1s 39 * 0xef10 => Magic ID, represents emulated (virtual) function IO 40 * 0x0000 => 16-bits reserved for flags 41 * 42 * The beginning of state information is marked by _DEV_CONFIG_STATE, 43 * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a 44 * certain state information is marked by _END_OF_STATE. 45 */ 46 #define VFIO_MIG_FLAG_END_OF_STATE (0xffffffffef100001ULL) 47 #define VFIO_MIG_FLAG_DEV_CONFIG_STATE (0xffffffffef100002ULL) 48 #define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) 49 #define VFIO_MIG_FLAG_DEV_DATA_STATE (0xffffffffef100004ULL) 50 #define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL) 51 52 /* 53 * This is an arbitrary size based on migration of mlx5 devices, where typically 54 * total device migration size is on the order of 100s of MB. Testing with 55 * larger values, e.g. 128MB and 1GB, did not show a performance improvement. 
 */
#define VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE (1 * MiB)

/* Total bytes of VFIO device state written to the migration stream so far. */
static int64_t bytes_transferred;

/* Human-readable name of a vfio_device_mig_state value, for logs and traces. */
static const char *mig_state_to_str(enum vfio_device_mig_state state)
{
    switch (state) {
    case VFIO_DEVICE_STATE_ERROR:
        return "ERROR";
    case VFIO_DEVICE_STATE_STOP:
        return "STOP";
    case VFIO_DEVICE_STATE_RUNNING:
        return "RUNNING";
    case VFIO_DEVICE_STATE_STOP_COPY:
        return "STOP_COPY";
    case VFIO_DEVICE_STATE_RESUMING:
        return "RESUMING";
    case VFIO_DEVICE_STATE_RUNNING_P2P:
        return "RUNNING_P2P";
    case VFIO_DEVICE_STATE_PRE_COPY:
        return "PRE_COPY";
    case VFIO_DEVICE_STATE_PRE_COPY_P2P:
        return "PRE_COPY_P2P";
    default:
        return "UNKNOWN STATE";
    }
}

/*
 * Move the device to new_state via the VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE
 * ioctl.  If that fails, try to put the device into recover_state instead;
 * if recover_state is ERROR, or setting the recover state also fails, the
 * device is reset (VFIO_DEVICE_RESET) and left in RUNNING.
 *
 * On a successful transition the kernel may hand back a migration data fd in
 * mig_state->data_fd, which is stashed in migration->data_fd.
 *
 * Returns 0 on success, a negative errno value on failure (the first errno
 * observed, even if a later recovery step also failed).
 */
static int vfio_migration_set_state(VFIODevice *vbasedev,
                                    enum vfio_device_mig_state new_state,
                                    enum vfio_device_mig_state recover_state)
{
    VFIOMigration *migration = vbasedev->migration;
    /*
     * vfio_device_feature carries a variable-size payload; a uint64_t array
     * guarantees suitable alignment for the trailing mig_state struct.
     */
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_mig_state),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_mig_state *mig_state =
        (struct vfio_device_feature_mig_state *)feature->data;
    int ret;

    feature->argsz = sizeof(buf);
    feature->flags =
        VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
    mig_state->device_state = new_state;
    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        /* Try to set the device in some good state */
        ret = -errno;

        if (recover_state == VFIO_DEVICE_STATE_ERROR) {
            error_report("%s: Failed setting device state to %s, err: %s. "
                         "Recover state is ERROR. Resetting device",
                         vbasedev->name, mig_state_to_str(new_state),
                         strerror(errno));

            goto reset_device;
        }

        error_report(
            "%s: Failed setting device state to %s, err: %s. "
            "Setting device in recover state %s",
            vbasedev->name, mig_state_to_str(new_state),
            strerror(errno), mig_state_to_str(recover_state));

        mig_state->device_state = recover_state;
        if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
            ret = -errno;
            error_report(
                "%s: Failed setting device in recover state, err: %s. Resetting device",
                vbasedev->name, strerror(errno));

            goto reset_device;
        }

        migration->device_state = recover_state;

        /* Report the original failure even though recovery succeeded. */
        return ret;
    }

    migration->device_state = new_state;
    if (mig_state->data_fd != -1) {
        if (migration->data_fd != -1) {
            /*
             * This can happen if the device is asynchronously reset and
             * terminates a data transfer.
             */
            error_report("%s: data_fd out of sync", vbasedev->name);
            close(mig_state->data_fd);

            return -EBADF;
        }

        migration->data_fd = mig_state->data_fd;
    }

    trace_vfio_migration_set_state(vbasedev->name, mig_state_to_str(new_state));

    return 0;

reset_device:
    if (ioctl(vbasedev->fd, VFIO_DEVICE_RESET)) {
        hw_error("%s: Failed resetting device, err: %s", vbasedev->name,
                 strerror(errno));
    }

    /* A reset returns the device to the RUNNING state. */
    migration->device_state = VFIO_DEVICE_STATE_RUNNING;

    return ret;
}

/*
 * Some device state transitions require resetting the device if they fail.
 * This function sets the device in new_state and resets the device if that
 * fails. Reset is done by using ERROR as the recover state.
 */
static int
vfio_migration_set_state_or_reset(VFIODevice *vbasedev,
                                  enum vfio_device_mig_state new_state)
{
    return vfio_migration_set_state(vbasedev, new_state,
                                    VFIO_DEVICE_STATE_ERROR);
}

/*
 * Copy data_size bytes of device state from the migration stream straight
 * into the kernel migration data_fd.  Forwards qemu_file_get_to_fd()'s
 * return value.
 */
static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev,
                            uint64_t data_size)
{
    VFIOMigration *migration = vbasedev->migration;
    int ret;

    ret = qemu_file_get_to_fd(f, migration->data_fd, data_size);
    trace_vfio_load_state_device_data(vbasedev->name, data_size, ret);

    return ret;
}

/*
 * Emit the device config section, delimited by the _DEV_CONFIG_STATE and
 * _END_OF_STATE markers.  Returns any accumulated QEMUFile error.
 */
static int vfio_save_device_config_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_CONFIG_STATE);

    if (vbasedev->ops && vbasedev->ops->vfio_save_config) {
        vbasedev->ops->vfio_save_config(vbasedev, f);
    }

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);

    trace_vfio_save_device_config_state(vbasedev->name);

    return qemu_file_get_error(f);
}

/*
 * Counterpart of vfio_save_device_config_state(): restore the device config
 * section and verify the trailing _END_OF_STATE marker.
 */
static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    uint64_t data;

    if (vbasedev->ops && vbasedev->ops->vfio_load_config) {
        int ret;

        ret = vbasedev->ops->vfio_load_config(vbasedev, f);
        if (ret) {
            error_report("%s: Failed to load device config space",
                         vbasedev->name);
            return ret;
        }
    }

    data = qemu_get_be64(f);
    if (data != VFIO_MIG_FLAG_END_OF_STATE) {
        error_report("%s: Failed loading device config space, "
                     "end flag incorrect 0x%"PRIx64, vbasedev->name, data);
        return -EINVAL;
    }

    trace_vfio_load_device_config_state(vbasedev->name);
    return qemu_file_get_error(f);
}

/* Close the kernel migration data fd, if any, and mark it invalid. */
static void vfio_migration_cleanup(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    close(migration->data_fd);
    migration->data_fd = -1;
}

/*
 * Ask the kernel for the expected STOP_COPY data size via
 * VFIO_DEVICE_FEATURE_MIG_DATA_SIZE.  *stop_copy_size is left untouched on
 * failure.  Returns 0 on success, -errno on error.
 */
static int vfio_query_stop_copy_size(VFIODevice *vbasedev,
                                     uint64_t *stop_copy_size)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_mig_data_size),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_mig_data_size *mig_data_size =
        (struct vfio_device_feature_mig_data_size *)feature->data;

    feature->argsz = sizeof(buf);
    feature->flags =
        VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIG_DATA_SIZE;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        return -errno;
    }

    *stop_copy_size = mig_data_size->stop_copy_length;

    return 0;
}

/*
 * Refresh migration->precopy_init_size/precopy_dirty_size from the kernel's
 * VFIO_MIG_GET_PRECOPY_INFO ioctl on the data fd.  Both sizes are zeroed
 * first so they are consistent even on failure.  Returns 0 on success,
 * -errno on error.
 */
static int vfio_query_precopy_size(VFIOMigration *migration)
{
    struct vfio_precopy_info precopy = {
        .argsz = sizeof(precopy),
    };

    migration->precopy_init_size = 0;
    migration->precopy_dirty_size = 0;

    if (ioctl(migration->data_fd, VFIO_MIG_GET_PRECOPY_INFO, &precopy)) {
        return -errno;
    }

    migration->precopy_init_size = precopy.initial_bytes;
    migration->precopy_dirty_size = precopy.dirty_bytes;

    return 0;
}

/* Returns the size of saved data on success and -errno on error */
static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration)
{
    ssize_t data_size;

    /* Pull the next chunk of device state from the kernel data fd. */
    data_size = read(migration->data_fd, migration->data_buffer,
                     migration->data_buffer_size);
    if (data_size < 0) {
        /*
         * Pre-copy emptied all the device state for now. For more information,
         * please refer to the Linux kernel VFIO uAPI.
         */
        if (errno == ENOMSG) {
            return 0;
        }

        return -errno;
    }
    if (data_size == 0) {
        return 0;
    }

    /* Frame the chunk as a _DEV_DATA_STATE section in the stream. */
    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
    qemu_put_be64(f, data_size);
    qemu_put_buffer(f, migration->data_buffer, data_size);
    bytes_transferred += data_size;

    trace_vfio_save_block(migration->vbasedev->name, data_size);

    return qemu_file_get_error(f) ?: data_size;
}

/*
 * Account data_size bytes just sent against the cached pre-copy estimates:
 * initial data is consumed first, then dirty data.  data_size == 0 means
 * pre-copy has been drained, so both estimates are cleared.
 */
static void vfio_update_estimated_pending_data(VFIOMigration *migration,
                                               uint64_t data_size)
{
    if (!data_size) {
        /*
         * Pre-copy emptied all the device state for now, update estimated sizes
         * accordingly.
         */
        migration->precopy_init_size = 0;
        migration->precopy_dirty_size = 0;

        return;
    }

    if (migration->precopy_init_size) {
        uint64_t init_size = MIN(migration->precopy_init_size, data_size);

        migration->precopy_init_size -= init_size;
        data_size -= init_size;
    }

    migration->precopy_dirty_size -= MIN(migration->precopy_dirty_size,
                                         data_size);
}

/* True if the kernel driver advertises VFIO_MIGRATION_PRE_COPY support. */
static bool vfio_precopy_supported(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    return migration->mig_flags & VFIO_MIGRATION_PRE_COPY;
}

/* ---------------------------------------------------------------------- */

/*
 * .save_prepare handler: refuse migration modes VFIO does not support
 * (postcopy RAM, background snapshot).
 */
static int vfio_save_prepare(void *opaque, Error **errp)
{
    VFIODevice *vbasedev = opaque;

    /*
     * Snapshot doesn't use postcopy nor background snapshot, so allow snapshot
     * even if they are on.
     */
    if (runstate_check(RUN_STATE_SAVE_VM)) {
        return 0;
    }

    if (migrate_postcopy_ram()) {
        error_setg(
            errp, "%s: VFIO migration is not supported with postcopy migration",
            vbasedev->name);
        return -EOPNOTSUPP;
    }

    if (migrate_background_snapshot()) {
        error_setg(
            errp,
            "%s: VFIO migration is not supported with background snapshot",
            vbasedev->name);
        return -EOPNOTSUPP;
    }

    return 0;
}

/*
 * .save_setup handler: allocate the transfer buffer (sized by the kernel's
 * stop-copy estimate, capped at the default), and if pre-copy is supported
 * move a RUNNING device to PRE_COPY and take an initial size estimate.
 */
static int vfio_save_setup(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE;

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE);

    /* Best effort; on failure the default buffer size is kept. */
    vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
    migration->data_buffer_size = MIN(VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE,
                                      stop_copy_size);
    migration->data_buffer = g_try_malloc0(migration->data_buffer_size);
    if (!migration->data_buffer) {
        error_report("%s: Failed to allocate migration data buffer",
                     vbasedev->name);
        return -ENOMEM;
    }

    if (vfio_precopy_supported(vbasedev)) {
        int ret;

        switch (migration->device_state) {
        case VFIO_DEVICE_STATE_RUNNING:
            ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_PRE_COPY,
                                           VFIO_DEVICE_STATE_RUNNING);
            if (ret) {
                return ret;
            }

            vfio_query_precopy_size(migration);

            break;
        case VFIO_DEVICE_STATE_STOP:
            /* vfio_save_complete_precopy() will go to STOP_COPY */
            break;
        default:
            return -EINVAL;
        }
    }

    trace_vfio_save_setup(vbasedev->name, migration->data_buffer_size);

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);

    return qemu_file_get_error(f);
}

/*
 * .save_cleanup handler: leave STOP_COPY, release the transfer buffer and
 * reset the per-migration bookkeeping.
 */
static void vfio_save_cleanup(void *opaque)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;

    /*
     * Changing device state from STOP_COPY to STOP can take time. Do it here,
     * after migration has completed, so it won't increase downtime.
     */
    if (migration->device_state == VFIO_DEVICE_STATE_STOP_COPY) {
        vfio_migration_set_state_or_reset(vbasedev, VFIO_DEVICE_STATE_STOP);
    }

    g_free(migration->data_buffer);
    migration->data_buffer = NULL;
    migration->precopy_init_size = 0;
    migration->precopy_dirty_size = 0;
    migration->initial_data_sent = false;
    vfio_migration_cleanup(vbasedev);
    trace_vfio_save_cleanup(vbasedev->name);
}

/*
 * .state_pending_estimate handler: cheap estimate from the cached pre-copy
 * sizes; contributes nothing outside the PRE_COPY states.
 */
static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy,
                                        uint64_t *can_postcopy)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;

    if (!vfio_device_state_is_precopy(vbasedev)) {
        return;
    }

    *must_precopy +=
        migration->precopy_init_size + migration->precopy_dirty_size;

    trace_vfio_state_pending_estimate(vbasedev->name, *must_precopy,
                                      *can_postcopy,
                                      migration->precopy_init_size,
                                      migration->precopy_dirty_size);
}

/*
 * Migration size of VFIO devices can be as little as a few KBs or as big as
 * many GBs. This value should be big enough to cover the worst case.
 */
#define VFIO_MIG_STOP_COPY_SIZE (100 * GiB)

/*
 * .state_pending_exact handler: query the kernel for accurate pending sizes.
 */
static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy,
                                     uint64_t *can_postcopy)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    uint64_t stop_copy_size = VFIO_MIG_STOP_COPY_SIZE;

    /*
     * If getting pending migration size fails, VFIO_MIG_STOP_COPY_SIZE is
     * reported so downtime limit won't be violated.
     */
    vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
    *must_precopy += stop_copy_size;

    if (vfio_device_state_is_precopy(vbasedev)) {
        vfio_query_precopy_size(migration);

        *must_precopy +=
            migration->precopy_init_size + migration->precopy_dirty_size;
    }

    trace_vfio_state_pending_exact(vbasedev->name, *must_precopy, *can_postcopy,
                                   stop_copy_size, migration->precopy_init_size,
                                   migration->precopy_dirty_size);
}

/* .is_active_iterate handler: iterate only while in a PRE_COPY state. */
static bool vfio_is_active_iterate(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    return vfio_device_state_is_precopy(vbasedev);
}

/*
 * Note about migration rate limiting: VFIO migration buffer size is currently
 * limited to 1MB, so there is no need to check if migration rate exceeded (as
 * in the worst case it will exceed by 1MB). However, if the buffer size is
 * later changed to a bigger value, migration rate should be enforced here.
 */
static int vfio_save_iterate(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    ssize_t data_size;

    data_size = vfio_save_block(f, migration);
    if (data_size < 0) {
        return data_size;
    }

    vfio_update_estimated_pending_data(migration, data_size);

    /*
     * When switchover-ack is in use, tell the destination once all initial
     * data has been sent; otherwise just close the section as usual.
     */
    if (migrate_switchover_ack() && !migration->precopy_init_size &&
        !migration->initial_data_sent) {
        qemu_put_be64(f, VFIO_MIG_FLAG_DEV_INIT_DATA_SENT);
        migration->initial_data_sent = true;
    } else {
        qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
    }

    trace_vfio_save_iterate(vbasedev->name, migration->precopy_init_size,
                            migration->precopy_dirty_size);

    /* Nonzero tells the migration core no more pre-copy data is pending. */
    return !migration->precopy_init_size && !migration->precopy_dirty_size;
}

/*
 * .save_live_complete_precopy handler: move to STOP_COPY and drain all
 * remaining device state into the stream.
 */
static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    ssize_t data_size;
    int ret;

    /* We reach here with device state STOP or STOP_COPY only */
    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
                                   VFIO_DEVICE_STATE_STOP);
    if (ret) {
        return ret;
    }

    do {
        data_size = vfio_save_block(f, vbasedev->migration);
        if (data_size < 0) {
            return data_size;
        }
    } while (data_size);

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
    ret = qemu_file_get_error(f);
    if (ret) {
        return ret;
    }

    trace_vfio_save_complete_precopy(vbasedev->name, ret);

    return ret;
}

/* .save_state handler: save the device config section, reporting errors
 * through the QEMUFile since this callback cannot return a status. */
static void vfio_save_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    int ret;

    ret = vfio_save_device_config_state(f, opaque);
    if (ret) {
        error_report("%s: Failed to save device config space",
                     vbasedev->name);
        qemu_file_set_error(f, ret);
    }
}

/* .load_setup handler: put the device into RESUMING to accept state. */
static int vfio_load_setup(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;

    return vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING,
                                    vbasedev->migration->device_state);
}

/* .load_cleanup handler: close the migration data fd. */
static int vfio_load_cleanup(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    vfio_migration_cleanup(vbasedev);
    trace_vfio_load_cleanup(vbasedev->name);

    return 0;
}

/*
 * .load_state handler: consume VFIO_MIG_FLAG_* delimited sections from the
 * stream until _END_OF_STATE, dispatching on each section type.
 */
static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
{
    VFIODevice *vbasedev = opaque;
    int ret = 0;
    uint64_t data;

    data = qemu_get_be64(f);
    while (data != VFIO_MIG_FLAG_END_OF_STATE) {

        trace_vfio_load_state(vbasedev->name, data);

        switch (data) {
        case VFIO_MIG_FLAG_DEV_CONFIG_STATE:
        {
            return vfio_load_device_config_state(f, opaque);
        }
        case VFIO_MIG_FLAG_DEV_SETUP_STATE:
        {
            /* The setup section carries no payload, only its end marker. */
            data = qemu_get_be64(f);
            if (data == VFIO_MIG_FLAG_END_OF_STATE) {
                return ret;
            } else {
                error_report("%s: SETUP STATE: EOS not found 0x%"PRIx64,
                             vbasedev->name, data);
                return -EINVAL;
            }
            break;
        }
        case VFIO_MIG_FLAG_DEV_DATA_STATE:
        {
            uint64_t data_size = qemu_get_be64(f);

            if (data_size) {
                ret = vfio_load_buffer(f, vbasedev, data_size);
                if (ret < 0) {
                    return ret;
                }
            }
            break;
        }
        case VFIO_MIG_FLAG_DEV_INIT_DATA_SENT:
        {
            if (!vfio_precopy_supported(vbasedev) ||
                !migrate_switchover_ack()) {
                error_report("%s: Received INIT_DATA_SENT but switchover ack "
                             "is not used", vbasedev->name);
                return -EINVAL;
            }

            ret = qemu_loadvm_approve_switchover();
            if (ret) {
                error_report(
                    "%s: qemu_loadvm_approve_switchover failed, err=%d (%s)",
                    vbasedev->name, ret, strerror(-ret));
            }

            return ret;
        }
        default:
            error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data);
            return -EINVAL;
        }

        data = qemu_get_be64(f);
        ret = qemu_file_get_error(f);
        if (ret) {
            return ret;
        }
    }
    return ret;
}

/* .switchover_ack_needed handler: ack is used only with pre-copy support. */
static bool vfio_switchover_ack_needed(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    return vfio_precopy_supported(vbasedev);
}

/* SaveVMHandlers table registered per device in vfio_migration_init(). */
static const SaveVMHandlers savevm_vfio_handlers = {
    .save_prepare = vfio_save_prepare,
    .save_setup = vfio_save_setup,
    .save_cleanup = vfio_save_cleanup,
    .state_pending_estimate = vfio_state_pending_estimate,
    .state_pending_exact = vfio_state_pending_exact,
    .is_active_iterate = vfio_is_active_iterate,
    .save_live_iterate = vfio_save_iterate,
    .save_live_complete_precopy = vfio_save_complete_precopy,
    .save_state = vfio_save_state,
    .load_setup = vfio_load_setup,
    .load_cleanup = vfio_load_cleanup,
    .load_state = vfio_load_state,
    .switchover_ack_needed = vfio_switchover_ack_needed,
};

/* ---------------------------------------------------------------------- */

/*
 * VM change state "prepare" callback (P2P devices only): move the device to
 * the matching *_P2P quiescent state before the final state transition.
 */
static void vfio_vmstate_change_prepare(void *opaque, bool running,
                                        RunState state)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    enum vfio_device_mig_state new_state;
    int ret;

    new_state = migration->device_state == VFIO_DEVICE_STATE_PRE_COPY ?
                    VFIO_DEVICE_STATE_PRE_COPY_P2P :
                    VFIO_DEVICE_STATE_RUNNING_P2P;

    ret = vfio_migration_set_state_or_reset(vbasedev, new_state);
    if (ret) {
        /*
         * Migration should be aborted in this case, but vm_state_notify()
         * currently does not support reporting failures.
         */
        if (migrate_get_current()->to_dst_file) {
            qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        }
    }

    trace_vfio_vmstate_change_prepare(vbasedev->name, running,
                                      RunState_str(state),
                                      mig_state_to_str(new_state));
}

/*
 * VM change state callback: follow the VM into RUNNING, or into
 * STOP/STOP_COPY when it stops (STOP_COPY if a pre-copy migration is
 * finishing or the VM was paused mid pre-copy).
 */
static void vfio_vmstate_change(void *opaque, bool running, RunState state)
{
    VFIODevice *vbasedev = opaque;
    enum vfio_device_mig_state new_state;
    int ret;

    if (running) {
        new_state = VFIO_DEVICE_STATE_RUNNING;
    } else {
        new_state =
            (vfio_device_state_is_precopy(vbasedev) &&
             (state == RUN_STATE_FINISH_MIGRATE || state == RUN_STATE_PAUSED)) ?
                VFIO_DEVICE_STATE_STOP_COPY :
                VFIO_DEVICE_STATE_STOP;
    }

    ret = vfio_migration_set_state_or_reset(vbasedev, new_state);
    if (ret) {
        /*
         * Migration should be aborted in this case, but vm_state_notify()
         * currently does not support reporting failures.
         */
        if (migrate_get_current()->to_dst_file) {
            qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        }
    }

    trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
                              mig_state_to_str(new_state));
}

/*
 * Migration state notifier: if pre-copy fails, return the device to
 * RUNNING so the VM keeps working on the source.
 */
static int vfio_migration_state_notifier(NotifierWithReturn *notifier,
                                         MigrationEvent *e, Error **errp)
{
    VFIOMigration *migration = container_of(notifier, VFIOMigration,
                                            migration_state);
    VFIODevice *vbasedev = migration->vbasedev;

    trace_vfio_migration_state_notifier(vbasedev->name, e->type);

    if (e->type == MIG_EVENT_PRECOPY_FAILED) {
        vfio_migration_set_state_or_reset(vbasedev, VFIO_DEVICE_STATE_RUNNING);
    }
    return 0;
}

/* Free the per-device VFIOMigration state allocated by vfio_migration_init. */
static void vfio_migration_free(VFIODevice *vbasedev)
{
    g_free(vbasedev->migration);
    vbasedev->migration = NULL;
}

/*
 * Read the kernel's migration capability flags via
 * VFIO_DEVICE_FEATURE_MIGRATION.  Returns 0 on success, -errno on error
 * (*mig_flags untouched on error).
 */
static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_migration),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_migration *mig =
        (struct vfio_device_feature_migration *)feature->data;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION;
    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        return -errno;
    }

    *mig_flags = mig->flags;

    return 0;
}

/*
 * Probe (without enabling) whether the kernel supports device dirty-page
 * tracking via VFIO_DEVICE_FEATURE_DMA_LOGGING_START.
 */
static bool vfio_dma_logging_supported(VFIODevice *vbasedev)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_PROBE |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_START;

    return !ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
}

/*
 * Set up migration support for one VFIO device: query kernel capabilities,
 * allocate VFIOMigration state, register the savevm handlers, VM change
 * state handlers and the migration state notifier.
 * Returns 0 on success, a negative errno value on failure.
 */
static int vfio_migration_init(VFIODevice *vbasedev)
{
    int ret;
    Object *obj;
    VFIOMigration *migration;
    char id[256] = "";
    g_autofree char *path = NULL, *oid = NULL;
    uint64_t mig_flags = 0;
    VMChangeStateHandler *prepare_cb;

    if (!vbasedev->ops->vfio_get_object) {
        return -EINVAL;
    }

    obj = vbasedev->ops->vfio_get_object(vbasedev);
    if (!obj) {
        return -EINVAL;
    }

    ret = vfio_migration_query_flags(vbasedev, &mig_flags);
    if (ret) {
        return ret;
    }

    /* Basic migration functionality must be supported */
    if (!(mig_flags & VFIO_MIGRATION_STOP_COPY)) {
        return -EOPNOTSUPP;
    }

    vbasedev->migration = g_new0(VFIOMigration, 1);
    migration = vbasedev->migration;
    migration->vbasedev = vbasedev;
    migration->device_state = VFIO_DEVICE_STATE_RUNNING;
    migration->data_fd = -1;
    migration->mig_flags = mig_flags;

    vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev);

    /* Build a stable savevm section id, "<vmstate id>/vfio" when available. */
    oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj)));
    if (oid) {
        path = g_strdup_printf("%s/vfio", oid);
    } else {
        path = g_strdup("vfio");
    }
    strpadcpy(id, sizeof(id), path, '\0');

    register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1, &savevm_vfio_handlers,
                         vbasedev);

    /* The prepare callback is only needed for P2P-capable devices. */
    prepare_cb = migration->mig_flags & VFIO_MIGRATION_P2P ?
                     vfio_vmstate_change_prepare :
                     NULL;
    migration->vm_state = qdev_add_vm_change_state_handler_full(
        vbasedev->dev, vfio_vmstate_change, prepare_cb, vbasedev);
    migration_add_notifier(&migration->migration_state,
                           vfio_migration_state_notifier);

    return 0;
}

/* Undo everything vfio_migration_init() set up, in reverse order. */
static void vfio_migration_deinit(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    migration_remove_notifier(&migration->migration_state);
    qemu_del_vm_change_state_handler(migration->vm_state);
    unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev);
    vfio_migration_free(vbasedev);
    vfio_unblock_multiple_devices_migration();
}

/*
 * Register err as this device's migration blocker.  If migration was
 * explicitly requested (enable_migration == ON), fail instead of blocking.
 * Takes ownership of err in all cases.
 */
static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp)
{
    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
        error_propagate(errp, err);
        return -EINVAL;
    }

    vbasedev->migration_blocker = error_copy(err);
    error_free(err);

    return migrate_add_blocker(&vbasedev->migration_blocker, errp);
}

/* ---------------------------------------------------------------------- */

/* Total device state bytes written to the migration stream so far. */
int64_t vfio_mig_bytes_transferred(void)
{
    return bytes_transferred;
}

void vfio_reset_bytes_transferred(void)
{
    bytes_transferred = 0;
}

/*
 * Return true when either migration initialized or blocker registered.
 * Currently only return false when adding blocker fails which will
 * de-register vfio device.
913 */ 914 bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp) 915 { 916 Error *err = NULL; 917 int ret; 918 919 if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) { 920 error_setg(&err, "%s: Migration is disabled for VFIO device", 921 vbasedev->name); 922 return !vfio_block_migration(vbasedev, err, errp); 923 } 924 925 ret = vfio_migration_init(vbasedev); 926 if (ret) { 927 if (ret == -ENOTTY) { 928 error_setg(&err, "%s: VFIO migration is not supported in kernel", 929 vbasedev->name); 930 } else { 931 error_setg(&err, 932 "%s: Migration couldn't be initialized for VFIO device, " 933 "err: %d (%s)", 934 vbasedev->name, ret, strerror(-ret)); 935 } 936 937 return !vfio_block_migration(vbasedev, err, errp); 938 } 939 940 if (!vbasedev->dirty_pages_supported) { 941 if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) { 942 error_setg(&err, 943 "%s: VFIO device doesn't support device dirty tracking", 944 vbasedev->name); 945 goto add_blocker; 946 } 947 948 warn_report("%s: VFIO device doesn't support device dirty tracking", 949 vbasedev->name); 950 } 951 952 ret = vfio_block_multiple_devices_migration(vbasedev, errp); 953 if (ret) { 954 goto out_deinit; 955 } 956 957 if (vfio_viommu_preset(vbasedev)) { 958 error_setg(&err, "%s: Migration is currently not supported " 959 "with vIOMMU enabled", vbasedev->name); 960 goto add_blocker; 961 } 962 963 trace_vfio_migration_realize(vbasedev->name); 964 return true; 965 966 add_blocker: 967 ret = vfio_block_migration(vbasedev, err, errp); 968 out_deinit: 969 if (ret) { 970 vfio_migration_deinit(vbasedev); 971 } 972 return !ret; 973 } 974 975 void vfio_migration_exit(VFIODevice *vbasedev) 976 { 977 if (vbasedev->migration) { 978 vfio_migration_deinit(vbasedev); 979 } 980 981 migrate_del_blocker(&vbasedev->migration_blocker); 982 } 983