/*
 * QEMU KVM support
 *
 * Copyright IBM, Corp. 2008
 *           Red Hat, Inc. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *  Glauber Costa     <gcosta@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <poll.h>

#include <linux/kvm.h>

#include "qemu/atomic.h"
#include "qemu/option.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/s390x/adapter.h"
#include "exec/gdbstub.h"
#include "sysemu/kvm_int.h"
#include "sysemu/runstate.h"
#include "sysemu/cpus.h"
#include "sysemu/accel-blocker.h"
#include "qemu/bswap.h"
#include "exec/memory.h"
#include "exec/ram_addr.h"
#include "qemu/event_notifier.h"
#include "qemu/main-loop.h"
#include "trace.h"
#include "hw/irq.h"
#include "qapi/visitor.h"
#include "qapi/qapi-types-common.h"
#include "qapi/qapi-visit-common.h"
#include "sysemu/reset.h"
#include "qemu/guest-random.h"
#include "sysemu/hw_accel.h"
#include "kvm-cpus.h"
#include "sysemu/dirtylimit.h"
#include "qemu/range.h"

#include "hw/boards.h"
#include "sysemu/stats.h"

/* This check must be after config-host.h is included */
#ifdef CONFIG_EVENTFD
#include <sys/eventfd.h>
#endif

/* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
 * need to use the real host PAGE_SIZE, as that's what KVM will use.
 */
#ifdef PAGE_SIZE
#undef PAGE_SIZE
#endif
#define PAGE_SIZE qemu_real_host_page_size()

#ifndef KVM_GUESTDBG_BLOCKIRQ
#define KVM_GUESTDBG_BLOCKIRQ 0
#endif

struct KVMParkedVcpu {
    unsigned long vcpu_id;
    int kvm_fd;
    QLIST_ENTRY(KVMParkedVcpu) node;
};

KVMState *kvm_state;
bool kvm_kernel_irqchip;
bool kvm_split_irqchip;
bool kvm_async_interrupts_allowed;
bool kvm_halt_in_kernel_allowed;
bool kvm_resamplefds_allowed;
bool kvm_msi_via_irqfd_allowed;
bool kvm_gsi_routing_allowed;
bool kvm_gsi_direct_mapping;
bool kvm_allowed;
bool kvm_readonly_mem_allowed;
bool kvm_vm_attributes_allowed;
bool kvm_msi_use_devid;
static bool kvm_has_guest_debug;
static int kvm_sstep_flags;
static bool kvm_immediate_exit;
static uint64_t kvm_supported_memory_attributes;
static hwaddr kvm_max_slot_size = ~0;

static const KVMCapabilityInfo kvm_required_capabilites[] = {
    KVM_CAP_INFO(USER_MEMORY),
    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
    KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS),
    KVM_CAP_INFO(INTERNAL_ERROR_DATA),
    KVM_CAP_INFO(IOEVENTFD),
    KVM_CAP_INFO(IOEVENTFD_ANY_LENGTH),
    KVM_CAP_LAST_INFO
};

static NotifierList kvm_irqchip_change_notifiers =
    NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);

struct KVMResampleFd {
    int gsi;
    EventNotifier *resample_event;
    QLIST_ENTRY(KVMResampleFd) node;
};
typedef struct KVMResampleFd KVMResampleFd;
/*
 * Only used with split irqchip where we need to do the resample fd
 * kick for the kernel from userspace.
 */
static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
    QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);

static QemuMutex kml_slots_lock;

#define kvm_slots_lock()    qemu_mutex_lock(&kml_slots_lock)
#define kvm_slots_unlock()  qemu_mutex_unlock(&kml_slots_lock)

static void kvm_slot_init_dirty_bitmap(KVMSlot *mem);

static inline void kvm_resample_fd_remove(int gsi)
{
    KVMResampleFd *rfd;

    QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
        if (rfd->gsi == gsi) {
            QLIST_REMOVE(rfd, node);
            g_free(rfd);
            break;
        }
    }
}

static inline void kvm_resample_fd_insert(int gsi, EventNotifier *event)
{
    KVMResampleFd *rfd = g_new0(KVMResampleFd, 1);

    rfd->gsi = gsi;
    rfd->resample_event = event;

    QLIST_INSERT_HEAD(&kvm_resample_fd_list, rfd, node);
}

void kvm_resample_fd_notify(int gsi)
{
    KVMResampleFd *rfd;

    QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
        if (rfd->gsi == gsi) {
            event_notifier_set(rfd->resample_event);
            trace_kvm_resample_fd_notify(gsi);
            return;
        }
    }
}

unsigned int kvm_get_max_memslots(void)
{
    KVMState *s = KVM_STATE(current_accel());

    return s->nr_slots;
}

unsigned int kvm_get_free_memslots(void)
{
    unsigned int used_slots = 0;
    KVMState *s = kvm_state;
    int i;

    kvm_slots_lock();
    for (i = 0; i < s->nr_as; i++) {
        if (!s->as[i].ml) {
            continue;
        }
        used_slots = MAX(used_slots, s->as[i].ml->nr_used_slots);
    }
    kvm_slots_unlock();

    return s->nr_slots - used_slots;
}

/* Called with KVMMemoryListener.slots_lock held */
static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
{
    KVMState *s = kvm_state;
    int i;

    for (i = 0; i < s->nr_slots; i++) {
        if (kml->slots[i].memory_size == 0) {
            return &kml->slots[i];
        }
    }

    return NULL;
}

/* Called with KVMMemoryListener.slots_lock held */
static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
{
    KVMSlot *slot = kvm_get_free_slot(kml);

    if (slot) {
        return slot;
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}

static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
                                         hwaddr start_addr,
                                         hwaddr size)
{
    KVMState *s = kvm_state;
    int i;

    for (i = 0; i < s->nr_slots; i++) {
        KVMSlot *mem = &kml->slots[i];

        if (start_addr == mem->start_addr && size == mem->memory_size) {
            return mem;
        }
    }

    return NULL;
}
/*
 * Calculate and align the start address and the size of the section.
 * Return the size. If the size is 0, the aligned section is empty.
 */
static hwaddr kvm_align_section(MemoryRegionSection *section,
                                hwaddr *start)
{
    hwaddr size = int128_get64(section->size);
    hwaddr delta, aligned;

    /* kvm works in page size chunks, but the function may be called
       with sub-page size and unaligned start address. Pad the start
       address to next and truncate size to previous page boundary. */
    aligned = ROUND_UP(section->offset_within_address_space,
                       qemu_real_host_page_size());
    delta = aligned - section->offset_within_address_space;
    *start = aligned;
    if (delta > size) {
        return 0;
    }

    return (size - delta) & qemu_real_host_page_mask();
}
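/*
 * Worked example for kvm_align_section() above (a sketch, assuming a
 * 4 KiB host page size): a section covering [0x1100, 0x3100) gets its
 * start padded up to 0x2000 and its size truncated down, so the
 * function sets *start = 0x2000 and returns 0x1000. A section smaller
 * than the padding, e.g. [0x1100, 0x1f00), returns 0 and is skipped by
 * the callers.
 */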
int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
                                       hwaddr *phys_addr)
{
    KVMMemoryListener *kml = &s->memory_listener;
    int i, ret = 0;

    kvm_slots_lock();
    for (i = 0; i < s->nr_slots; i++) {
        KVMSlot *mem = &kml->slots[i];

        if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
            *phys_addr = mem->start_addr + (ram - mem->ram);
            ret = 1;
            break;
        }
    }
    kvm_slots_unlock();

    return ret;
}

static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new)
{
    KVMState *s = kvm_state;
    struct kvm_userspace_memory_region mem;
    int ret;

    mem.slot = slot->slot | (kml->as_id << 16);
    mem.guest_phys_addr = slot->start_addr;
    mem.userspace_addr = (unsigned long)slot->ram;
    mem.flags = slot->flags;

    if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) {
        /* Set the slot size to 0 before setting the slot to the desired
         * value. This is needed based on KVM commit 75d61fbc. */
        mem.memory_size = 0;
        ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
        if (ret < 0) {
            goto err;
        }
    }
    mem.memory_size = slot->memory_size;
    ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
    slot->old_flags = mem.flags;
err:
    trace_kvm_set_user_memory(mem.slot >> 16, (uint16_t)mem.slot, mem.flags,
                              mem.guest_phys_addr, mem.memory_size,
                              mem.userspace_addr, ret);
    if (ret < 0) {
        error_report("%s: KVM_SET_USER_MEMORY_REGION failed, slot=%d,"
                     " start=0x%" PRIx64 ", size=0x%" PRIx64 ": %s",
                     __func__, mem.slot, slot->start_addr,
                     (uint64_t)mem.memory_size, strerror(errno));
    }
    return ret;
}

static int do_kvm_destroy_vcpu(CPUState *cpu)
{
    KVMState *s = kvm_state;
    long mmap_size;
    struct KVMParkedVcpu *vcpu = NULL;
    int ret = 0;

    trace_kvm_destroy_vcpu();

    ret = kvm_arch_destroy_vcpu(cpu);
    if (ret < 0) {
        goto err;
    }

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        trace_kvm_failed_get_vcpu_mmap_size();
        goto err;
    }

    ret = munmap(cpu->kvm_run, mmap_size);
    if (ret < 0) {
        goto err;
    }

    if (cpu->kvm_dirty_gfns) {
        ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_bytes);
        if (ret < 0) {
            goto err;
        }
    }

    vcpu = g_malloc0(sizeof(*vcpu));
    vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
    vcpu->kvm_fd = cpu->kvm_fd;
    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
err:
    return ret;
}

void kvm_destroy_vcpu(CPUState *cpu)
{
    if (do_kvm_destroy_vcpu(cpu) < 0) {
        error_report("kvm_destroy_vcpu failed");
        exit(EXIT_FAILURE);
    }
}

static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
{
    struct KVMParkedVcpu *cpu;

    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
        if (cpu->vcpu_id == vcpu_id) {
            int kvm_fd;

            QLIST_REMOVE(cpu, node);
            kvm_fd = cpu->kvm_fd;
            g_free(cpu);
            return kvm_fd;
        }
    }

    return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
}

int kvm_init_vcpu(CPUState *cpu, Error **errp)
{
    KVMState *s = kvm_state;
    long mmap_size;
    int ret;

    trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));

    ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "kvm_init_vcpu: kvm_get_vcpu failed (%lu)",
                         kvm_arch_vcpu_id(cpu));
        goto err;
    }

    cpu->kvm_fd = ret;
    cpu->kvm_state = s;
    cpu->vcpu_dirty = true;
    cpu->dirty_pages = 0;
    cpu->throttle_us_per_full = 0;

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        error_setg_errno(errp, -mmap_size,
                         "kvm_init_vcpu: KVM_GET_VCPU_MMAP_SIZE failed");
        goto err;
    }

    cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        cpu->kvm_fd, 0);
    if (cpu->kvm_run == MAP_FAILED) {
        ret = -errno;
        error_setg_errno(errp, ret,
                         "kvm_init_vcpu: mmap'ing vcpu state failed (%lu)",
                         kvm_arch_vcpu_id(cpu));
        goto err;
    }

    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
        s->coalesced_mmio_ring =
            (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
    }

    if (s->kvm_dirty_ring_size) {
        /* Use MAP_SHARED to share pages with the kernel */
        cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_bytes,
                                   PROT_READ | PROT_WRITE, MAP_SHARED,
                                   cpu->kvm_fd,
                                   PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
        if (cpu->kvm_dirty_gfns == MAP_FAILED) {
            ret = -errno;
            goto err;
        }
    }

    ret = kvm_arch_init_vcpu(cpu);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "kvm_init_vcpu: kvm_arch_init_vcpu failed (%lu)",
                         kvm_arch_vcpu_id(cpu));
    }
    cpu->kvm_vcpu_stats_fd = kvm_vcpu_ioctl(cpu, KVM_GET_STATS_FD, NULL);

err:
    return ret;
}

/*
 * dirty pages logging control
 */

static int kvm_mem_flags(MemoryRegion *mr)
{
    bool readonly = mr->readonly || memory_region_is_romd(mr);
    int flags = 0;

    if (memory_region_get_dirty_log_mask(mr) != 0) {
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    if (readonly && kvm_readonly_mem_allowed) {
        flags |= KVM_MEM_READONLY;
    }
    return flags;
}

/* Called with KVMMemoryListener.slots_lock held */
static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
                                 MemoryRegion *mr)
{
    mem->flags = kvm_mem_flags(mr);

    /* If nothing changed effectively, no need to issue ioctl */
    if (mem->flags == mem->old_flags) {
        return 0;
    }

    kvm_slot_init_dirty_bitmap(mem);
    return kvm_set_user_memory_region(kml, mem, false);
}
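/*
 * Example for kvm_mem_flags() (a sketch, not exhaustive): a ROM device
 * in ROMD mode with dirty logging active during migration would get
 * KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY, provided the host kernel
 * supports read-only slots (kvm_readonly_mem_allowed); plain guest RAM
 * outside migration gets flags == 0.
 */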
static int kvm_section_update_flags(KVMMemoryListener *kml,
                                    MemoryRegionSection *section)
{
    hwaddr start_addr, size, slot_size;
    KVMSlot *mem;
    int ret = 0;

    size = kvm_align_section(section, &start_addr);
    if (!size) {
        return 0;
    }

    kvm_slots_lock();

    while (size && !ret) {
        slot_size = MIN(kvm_max_slot_size, size);
        mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
        if (!mem) {
            /* We don't have a slot if we want to trap every access. */
            goto out;
        }

        ret = kvm_slot_update_flags(kml, mem, section->mr);
        start_addr += slot_size;
        size -= slot_size;
    }

out:
    kvm_slots_unlock();
    return ret;
}

static void kvm_log_start(MemoryListener *listener,
                          MemoryRegionSection *section,
                          int old, int new)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    int r;

    if (old != 0) {
        return;
    }

    r = kvm_section_update_flags(kml, section);
    if (r < 0) {
        abort();
    }
}

static void kvm_log_stop(MemoryListener *listener,
                         MemoryRegionSection *section,
                         int old, int new)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    int r;

    if (new != 0) {
        return;
    }

    r = kvm_section_update_flags(kml, section);
    if (r < 0) {
        abort();
    }
}

/* get kvm's dirty pages bitmap and update qemu's */
static void kvm_slot_sync_dirty_pages(KVMSlot *slot)
{
    ram_addr_t start = slot->ram_start_offset;
    ram_addr_t pages = slot->memory_size / qemu_real_host_page_size();

    cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages);
}

static void kvm_slot_reset_dirty_pages(KVMSlot *slot)
{
    memset(slot->dirty_bmap, 0, slot->dirty_bmap_size);
}

#define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1))

/* Allocate the dirty bitmap for a slot */
static void kvm_slot_init_dirty_bitmap(KVMSlot *mem)
{
    if (!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) || mem->dirty_bmap) {
        return;
    }

    /*
     * XXX bad kernel interface alert
     * For dirty bitmap, kernel allocates array of size aligned to
     * bits-per-long.  But for case when the kernel is 64bits and
     * the userspace is 32bits, userspace can't align to the same
     * bits-per-long, since sizeof(long) is different between kernel
     * and user space.  This way, userspace will provide buffer which
     * may be 4 bytes less than the kernel will use, resulting in
     * userspace memory corruption (which is not detectable by valgrind
     * too, in most cases).
     * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
     * a hope that sizeof(long) won't become >8 any time soon.
     *
     * Note: the granule of kvm dirty log is qemu_real_host_page_size.
     * And mem->memory_size is aligned to it (otherwise this mem can't
     * be registered to KVM).
     */
    hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size(),
                               /*HOST_LONG_BITS*/ 64) / 8;
    mem->dirty_bmap = g_malloc0(bitmap_size);
    mem->dirty_bmap_size = bitmap_size;
}

/*
 * Sync dirty bitmap from kernel to KVMSlot.dirty_bmap, return true if
 * succeeded, false otherwise
 */
static bool kvm_slot_get_dirty_log(KVMState *s, KVMSlot *slot)
{
    struct kvm_dirty_log d = {};
    int ret;

    d.dirty_bitmap = slot->dirty_bmap;
    d.slot = slot->slot | (slot->as_id << 16);
    ret = kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d);

    if (ret == -ENOENT) {
        /* kernel does not have dirty bitmap in this slot */
        ret = 0;
    }
    if (ret) {
        error_report_once("%s: KVM_GET_DIRTY_LOG failed with %d",
                          __func__, ret);
    }
    return ret == 0;
}
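/*
 * Worked example for the sizing in kvm_slot_init_dirty_bitmap() above
 * (assuming 4 KiB host pages): a 4 GiB slot covers 1048576 pages, and
 * ALIGN(1048576, 64) / 8 = 131072, so the bitmap costs 128 KiB, i.e.
 * one bit per page rounded up to a multiple of 64 bits.
 */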
/* Should be with all slots_lock held for the address spaces. */
static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
                                     uint32_t slot_id, uint64_t offset)
{
    KVMMemoryListener *kml;
    KVMSlot *mem;

    if (as_id >= s->nr_as) {
        return;
    }

    kml = s->as[as_id].ml;
    mem = &kml->slots[slot_id];

    if (!mem->memory_size || offset >=
        (mem->memory_size / qemu_real_host_page_size())) {
        return;
    }

    set_bit(offset, mem->dirty_bmap);
}

static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
{
    /*
     * Read the flags before the value.  Pairs with barrier in
     * KVM's kvm_dirty_ring_push() function.
     */
    return qatomic_load_acquire(&gfn->flags) == KVM_DIRTY_GFN_F_DIRTY;
}

static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
{
    /*
     * Use a store-release so that the CPU that executes KVM_RESET_DIRTY_RINGS
     * sees the full content of the ring:
     *
     * CPU0                     CPU1                         CPU2
     * ------------------------------------------------------------------------------
     *                                                       fill gfn0
     *                                                       store-rel flags for gfn0
     * load-acq flags for gfn0
     * store-rel RESET for gfn0
     *                          ioctl(RESET_RINGS)
     *                          load-acq flags for gfn0
     *                          check if flags have RESET
     *
     * The synchronization goes from CPU2 to CPU0 to CPU1.
     */
    qatomic_store_release(&gfn->flags, KVM_DIRTY_GFN_F_RESET);
}

/*
 * Should be with all slots_lock held for the address spaces.  It returns the
 * dirty page we've collected on this dirty ring.
 */
static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
{
    struct kvm_dirty_gfn *dirty_gfns = cpu->kvm_dirty_gfns, *cur;
    uint32_t ring_size = s->kvm_dirty_ring_size;
    uint32_t count = 0, fetch = cpu->kvm_fetch_index;

    /*
     * It's possible that we race with vcpu creation code where the vcpu is
     * put onto the vcpus list but not yet initialized the dirty ring
     * structures.  If so, skip it.
     */
    if (!cpu->created) {
        return 0;
    }

    assert(dirty_gfns && ring_size);
    trace_kvm_dirty_ring_reap_vcpu(cpu->cpu_index);

    while (true) {
        cur = &dirty_gfns[fetch % ring_size];
        if (!dirty_gfn_is_dirtied(cur)) {
            break;
        }
        kvm_dirty_ring_mark_page(s, cur->slot >> 16, cur->slot & 0xffff,
                                 cur->offset);
        dirty_gfn_set_collected(cur);
        trace_kvm_dirty_ring_page(cpu->cpu_index, fetch, cur->offset);
        fetch++;
        count++;
    }
    cpu->kvm_fetch_index = fetch;
    cpu->dirty_pages += count;

    return count;
}

/* Must be with slots_lock held */
static uint64_t kvm_dirty_ring_reap_locked(KVMState *s, CPUState *cpu)
{
    int ret;
    uint64_t total = 0;
    int64_t stamp;

    stamp = get_clock();

    if (cpu) {
        total = kvm_dirty_ring_reap_one(s, cpu);
    } else {
        CPU_FOREACH(cpu) {
            total += kvm_dirty_ring_reap_one(s, cpu);
        }
    }

    if (total) {
        ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS);
        assert(ret == total);
    }

    stamp = get_clock() - stamp;

    if (total) {
        trace_kvm_dirty_ring_reap(total, stamp / 1000);
    }

    return total;
}
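/*
 * Note on the indexing in kvm_dirty_ring_reap_one() above (a sketch,
 * assuming ring_size is a power of two, which the dirty-ring-size
 * property validates): kvm_fetch_index is free-running and only reduced
 * modulo ring_size when dereferencing, so with ring_size = 4096 a fetch
 * index of 4097 reads entry 1 on the second lap.  The slot field packs
 * the address-space id in the high 16 bits and the slot id in the low
 * 16 bits, mirroring how kvm_set_user_memory_region() encodes mem.slot.
 */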
/*
 * Currently for simplicity, we must hold BQL before calling this.  We can
 * consider to drop the BQL if we're clear with all the race conditions.
 */
static uint64_t kvm_dirty_ring_reap(KVMState *s, CPUState *cpu)
{
    uint64_t total;

    /*
     * We need to lock all kvm slots for all address spaces here,
     * because:
     *
     * (1) We need to mark dirty for dirty bitmaps in multiple slots
     *     and for tons of pages, so it's better to take the lock here
     *     once rather than once per page.  And more importantly,
     *
     * (2) We must _NOT_ publish dirty bits to the other threads
     *     (e.g., the migration thread) via the kvm memory slot dirty
     *     bitmaps before correctly re-protect those dirtied pages.
     *     Otherwise we can have potential risk of data corruption if
     *     the page data is read in the other thread before we do
     *     reset below.
     */
    kvm_slots_lock();
    total = kvm_dirty_ring_reap_locked(s, cpu);
    kvm_slots_unlock();

    return total;
}

static void do_kvm_cpu_synchronize_kick(CPUState *cpu, run_on_cpu_data arg)
{
    /* No need to do anything */
}

/*
 * Kick all vcpus out in a synchronized way.  When returned, we
 * guarantee that every vcpu has been kicked and at least returned to
 * userspace once.
 */
static void kvm_cpu_synchronize_kick_all(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        run_on_cpu(cpu, do_kvm_cpu_synchronize_kick, RUN_ON_CPU_NULL);
    }
}

/*
 * Flush all the existing dirty pages to the KVM slot buffers.  When
 * this call returns, we guarantee that all the touched dirty pages
 * before calling this function have been put into the per-kvmslot
 * dirty bitmap.
 *
 * This function must be called with BQL held.
 */
static void kvm_dirty_ring_flush(void)
{
    trace_kvm_dirty_ring_flush(0);
    /*
     * The function needs to be serialized.  Since this function
     * should always be with BQL held, serialization is guaranteed.
     * However, let's be sure of it.
     */
    assert(bql_locked());
    /*
     * First make sure to flush the hardware buffers by kicking all
     * vcpus out in a synchronous way.
     */
    kvm_cpu_synchronize_kick_all();
    kvm_dirty_ring_reap(kvm_state, NULL);
    trace_kvm_dirty_ring_flush(1);
}
/**
 * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space
 *
 * This function will first try to fetch dirty bitmap from the kernel,
 * and then updates qemu's dirty bitmap.
 *
 * NOTE: caller must be with kml->slots_lock held.
 *
 * @kml: the KVM memory listener object
 * @section: the memory section to sync the dirty bitmap with
 */
static void kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
                                           MemoryRegionSection *section)
{
    KVMState *s = kvm_state;
    KVMSlot *mem;
    hwaddr start_addr, size;
    hwaddr slot_size;

    size = kvm_align_section(section, &start_addr);
    while (size) {
        slot_size = MIN(kvm_max_slot_size, size);
        mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
        if (!mem) {
            /* We don't have a slot if we want to trap every access. */
            return;
        }
        if (kvm_slot_get_dirty_log(s, mem)) {
            kvm_slot_sync_dirty_pages(mem);
        }
        start_addr += slot_size;
        size -= slot_size;
    }
}

/* Alignment requirement for KVM_CLEAR_DIRTY_LOG - 64 pages */
#define KVM_CLEAR_LOG_SHIFT  6
#define KVM_CLEAR_LOG_ALIGN  (qemu_real_host_page_size() << KVM_CLEAR_LOG_SHIFT)
#define KVM_CLEAR_LOG_MASK   (-KVM_CLEAR_LOG_ALIGN)
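/*
 * Concrete values for the macros above (assuming 4 KiB host pages):
 * KVM_CLEAR_LOG_ALIGN is 4096 << 6 = 256 KiB, so clear requests are
 * rounded down to a 64-page boundary, and KVM_CLEAR_LOG_MASK is the
 * two's-complement mask ~(256 KiB - 1) used for that rounding.
 */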
static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, uint64_t start,
                                  uint64_t size)
{
    KVMState *s = kvm_state;
    uint64_t end, bmap_start, start_delta, bmap_npages;
    struct kvm_clear_dirty_log d;
    unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size();
    int ret;

    /*
     * We need to extend either the start or the size or both to
     * satisfy the KVM interface requirement.  Firstly, do the start
     * page alignment on 64 host pages
     */
    bmap_start = start & KVM_CLEAR_LOG_MASK;
    start_delta = start - bmap_start;
    bmap_start /= psize;

    /*
     * The kernel interface has restriction on the size too, that either:
     *
     * (1) the size is 64 host pages aligned (just like the start), or
     * (2) the size fills up until the end of the KVM memslot.
     */
    bmap_npages = DIV_ROUND_UP(size + start_delta, KVM_CLEAR_LOG_ALIGN)
        << KVM_CLEAR_LOG_SHIFT;
    end = mem->memory_size / psize;
    if (bmap_npages > end - bmap_start) {
        bmap_npages = end - bmap_start;
    }
    start_delta /= psize;

    /*
     * Prepare the bitmap to clear dirty bits.  Here we must guarantee
     * that we won't clear any unknown dirty bits otherwise we might
     * accidentally clear some set bits which are not yet synced from
     * the kernel into QEMU's bitmap, then we'll lose track of the
     * guest modifications upon those pages (which can directly lead
     * to guest data loss or panic after migration).
     *
     * Layout of the KVMSlot.dirty_bmap:
     *
     *                   |<-------- bmap_npages -----------..>|
     *                                                     [1]
     *                     start_delta         size
     *  |----------------|-------------|------------------|------------|
     *  ^                ^             ^                               ^
     *  |                |             |                               |
     * start          bmap_start     (start)                         end
     * of memslot                                             of memslot
     *
     * [1] bmap_npages can be aligned to either 64 pages or the end of slot
     */

    assert(bmap_start % BITS_PER_LONG == 0);
    /* We should never do log_clear before log_sync */
    assert(mem->dirty_bmap);
    if (start_delta || bmap_npages - size / psize) {
        /* Slow path - we need to manipulate a temp bitmap */
        bmap_clear = bitmap_new(bmap_npages);
        bitmap_copy_with_src_offset(bmap_clear, mem->dirty_bmap,
                                    bmap_start, start_delta + size / psize);
        /*
         * We need to fill the holes at start because that was not
         * specified by the caller and we extended the bitmap only for
         * 64 pages alignment
         */
        bitmap_clear(bmap_clear, 0, start_delta);
        d.dirty_bitmap = bmap_clear;
    } else {
        /*
         * Fast path - both start and size align well with BITS_PER_LONG
         * (or the end of memory slot)
         */
        d.dirty_bitmap = mem->dirty_bmap + BIT_WORD(bmap_start);
    }

    d.first_page = bmap_start;
    /* It should never overflow.  If it happens, say something */
    assert(bmap_npages <= UINT32_MAX);
    d.num_pages = bmap_npages;
    d.slot = mem->slot | (as_id << 16);

    ret = kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d);
    if (ret < 0 && ret != -ENOENT) {
        error_report("%s: KVM_CLEAR_DIRTY_LOG failed, slot=%d, "
                     "start=0x%"PRIx64", size=0x%"PRIx32", errno=%d",
                     __func__, d.slot, (uint64_t)d.first_page,
                     (uint32_t)d.num_pages, ret);
    } else {
        ret = 0;
        trace_kvm_clear_dirty_log(d.slot, d.first_page, d.num_pages);
    }

    /*
     * After we have updated the remote dirty bitmap, we update the
     * cached bitmap as well for the memslot, then if another user
     * clears the same region we know we shouldn't clear it again on
     * the remote otherwise it's data loss as well.
     */
    bitmap_clear(mem->dirty_bmap, bmap_start + start_delta,
                 size / psize);
    /* This handles the NULL case well */
    g_free(bmap_clear);
    return ret;
}
/**
 * kvm_physical_log_clear - Clear the kernel's dirty bitmap for range
 *
 * NOTE: this will be a no-op if we haven't enabled manual dirty log
 * protection in the host kernel because in that case this operation
 * will be done within log_sync().
 *
 * @kml:     the kvm memory listener
 * @section: the memory range to clear dirty bitmap
 */
static int kvm_physical_log_clear(KVMMemoryListener *kml,
                                  MemoryRegionSection *section)
{
    KVMState *s = kvm_state;
    uint64_t start, size, offset, count;
    KVMSlot *mem;
    int ret = 0, i;

    if (!s->manual_dirty_log_protect) {
        /* No need to do explicit clear */
        return ret;
    }

    start = section->offset_within_address_space;
    size = int128_get64(section->size);

    if (!size) {
        /* Nothing more we can do... */
        return ret;
    }

    kvm_slots_lock();

    for (i = 0; i < s->nr_slots; i++) {
        mem = &kml->slots[i];
        /* Discard slots that are empty or do not overlap the section */
        if (!mem->memory_size ||
            mem->start_addr > start + size - 1 ||
            start > mem->start_addr + mem->memory_size - 1) {
            continue;
        }

        if (start >= mem->start_addr) {
            /* The slot starts before section or is aligned to it.  */
            offset = start - mem->start_addr;
            count = MIN(mem->memory_size - offset, size);
        } else {
            /* The slot starts after section.  */
            offset = 0;
            count = MIN(mem->memory_size, size - (mem->start_addr - start));
        }
        ret = kvm_log_clear_one_slot(mem, kml->as_id, offset, count);
        if (ret < 0) {
            break;
        }
    }

    kvm_slots_unlock();

    return ret;
}

static void kvm_coalesce_mmio_region(MemoryListener *listener,
                                     MemoryRegionSection *section,
                                     hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pad = 0;

        (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
    }
}

static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
                                       MemoryRegionSection *section,
                                       hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pad = 0;

        (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
    }
}

static void kvm_coalesce_pio_add(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_pio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pio = 1;

        (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
    }
}

static void kvm_coalesce_pio_del(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_pio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pio = 1;

        (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
    }
}

int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        ret = 0;
    }

    return ret;
}

int kvm_vm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        /* VM wide version not implemented, use global one instead */
        ret = kvm_check_extension(s, extension);
    }

    return ret;
}

/*
 * We track the poisoned pages to be able to:
 * - replace them on VM reset
 * - block a migration for a VM with a poisoned page
 */
typedef struct HWPoisonPage {
    ram_addr_t ram_addr;
    QLIST_ENTRY(HWPoisonPage) list;
} HWPoisonPage;

static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
    QLIST_HEAD_INITIALIZER(hwpoison_page_list);

static void kvm_unpoison_all(void *param)
{
    HWPoisonPage *page, *next_page;

    QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
        QLIST_REMOVE(page, list);
        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
        g_free(page);
    }
}

void kvm_hwpoison_page_add(ram_addr_t ram_addr)
{
    HWPoisonPage *page;

    QLIST_FOREACH(page, &hwpoison_page_list, list) {
        if (page->ram_addr == ram_addr) {
            return;
        }
    }
    page = g_new(HWPoisonPage, 1);
    page->ram_addr = ram_addr;
    QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
}
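/*
 * Usage note (a sketch of the flow): kvm_hwpoison_page_add()
 * deduplicates, so a SIGBUS handler may report the same guest page
 * repeatedly and the list still ends up with a single entry;
 * kvm_unpoison_all() is registered as a reset handler during kvm_init()
 * so poisoned pages are remapped on VM reset.
 */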
bool kvm_hwpoisoned_mem(void)
{
    return !QLIST_EMPTY(&hwpoison_page_list);
}

static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
{
#if HOST_BIG_ENDIAN != TARGET_BIG_ENDIAN
    /* The kernel expects ioeventfd values in HOST_BIG_ENDIAN
     * endianness, but the memory core hands them in target endianness.
     * For example, PPC is always treated as big-endian even if running
     * on KVM and on PPC64LE.  Correct here.
     */
    switch (size) {
    case 2:
        val = bswap16(val);
        break;
    case 4:
        val = bswap32(val);
        break;
    }
#endif
    return val;
}

static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
                                  bool assign, uint32_t size, bool datamatch)
{
    int ret;
    struct kvm_ioeventfd iofd = {
        .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
        .addr = addr,
        .len = size,
        .flags = 0,
        .fd = fd,
    };

    trace_kvm_set_ioeventfd_mmio(fd, (uint64_t)addr, val, assign, size,
                                 datamatch);
    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    if (datamatch) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
    }
    if (!assign) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);

    if (ret < 0) {
        return -errno;
    }

    return 0;
}

static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
                                 bool assign, uint32_t size, bool datamatch)
{
    struct kvm_ioeventfd kick = {
        .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
        .addr = addr,
        .flags = KVM_IOEVENTFD_FLAG_PIO,
        .len = size,
        .fd = fd,
    };
    int r;

    trace_kvm_set_ioeventfd_pio(fd, addr, val, assign, size, datamatch);
    if (!kvm_enabled()) {
        return -ENOSYS;
    }
    if (datamatch) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
    }
    if (!assign) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }
    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
    if (r < 0) {
        return r;
    }
    return 0;
}

static const KVMCapabilityInfo *
kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
{
    while (list->name) {
        if (!kvm_check_extension(s, list->value)) {
            return list;
        }
        list++;
    }
    return NULL;
}

void kvm_set_max_memslot_size(hwaddr max_slot_size)
{
    g_assert(
        ROUND_UP(max_slot_size, qemu_real_host_page_size()) == max_slot_size
    );
    kvm_max_slot_size = max_slot_size;
}

static int kvm_set_memory_attributes(hwaddr start, uint64_t size, uint64_t attr)
{
    struct kvm_memory_attributes attrs;
    int r;

    assert((attr & kvm_supported_memory_attributes) == attr);
    attrs.attributes = attr;
    attrs.address = start;
    attrs.size = size;
    attrs.flags = 0;

    r = kvm_vm_ioctl(kvm_state, KVM_SET_MEMORY_ATTRIBUTES, &attrs);
    if (r) {
        error_report("failed to set memory (0x%" HWADDR_PRIx "+0x%" PRIx64 ") "
                     "with attr 0x%" PRIx64 " error '%s'",
                     start, size, attr, strerror(errno));
    }
    return r;
}

int kvm_set_memory_attributes_private(hwaddr start, uint64_t size)
{
    return kvm_set_memory_attributes(start, size, KVM_MEMORY_ATTRIBUTE_PRIVATE);
}

int kvm_set_memory_attributes_shared(hwaddr start, uint64_t size)
{
    return kvm_set_memory_attributes(start, size, 0);
}
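/*
 * Example for the helpers above (a sketch for confidential-guest setups
 * that use guest_memfd; addresses are illustrative): converting one
 * 4 KiB page at guest physical 0x100000 to shared and back would be
 *
 *     kvm_set_memory_attributes_shared(0x100000, 0x1000);
 *     kvm_set_memory_attributes_private(0x100000, 0x1000);
 *
 * The private call asserts that the host kernel advertised
 * KVM_MEMORY_ATTRIBUTE_PRIVATE (cached in
 * kvm_supported_memory_attributes); the shared call passes attr == 0,
 * which is always acceptable.
 */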
/* Called with KVMMemoryListener.slots_lock held */
static void kvm_set_phys_mem(KVMMemoryListener *kml,
                             MemoryRegionSection *section, bool add)
{
    KVMSlot *mem;
    int err;
    MemoryRegion *mr = section->mr;
    bool writable = !mr->readonly && !mr->rom_device;
    hwaddr start_addr, size, slot_size, mr_offset;
    ram_addr_t ram_start_offset;
    void *ram;

    if (!memory_region_is_ram(mr)) {
        if (writable || !kvm_readonly_mem_allowed) {
            return;
        } else if (!mr->romd_mode) {
            /* If the memory device is not in romd_mode, then we actually want
             * to remove the kvm memory slot so all accesses will trap. */
            add = false;
        }
    }

    size = kvm_align_section(section, &start_addr);
    if (!size) {
        return;
    }

    /* The offset of the kvmslot within the memory region */
    mr_offset = section->offset_within_region + start_addr -
        section->offset_within_address_space;

    /* use aligned delta to align the ram address and offset */
    ram = memory_region_get_ram_ptr(mr) + mr_offset;
    ram_start_offset = memory_region_get_ram_addr(mr) + mr_offset;

    if (!add) {
        do {
            slot_size = MIN(kvm_max_slot_size, size);
            mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
            if (!mem) {
                return;
            }
            if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1355 */ 1356 if (kvm_state->kvm_dirty_ring_size) { 1357 kvm_dirty_ring_reap_locked(kvm_state, NULL); 1358 if (kvm_state->kvm_dirty_ring_with_bitmap) { 1359 kvm_slot_sync_dirty_pages(mem); 1360 kvm_slot_get_dirty_log(kvm_state, mem); 1361 } 1362 } else { 1363 kvm_slot_get_dirty_log(kvm_state, mem); 1364 } 1365 kvm_slot_sync_dirty_pages(mem); 1366 } 1367 1368 /* unregister the slot */ 1369 g_free(mem->dirty_bmap); 1370 mem->dirty_bmap = NULL; 1371 mem->memory_size = 0; 1372 mem->flags = 0; 1373 err = kvm_set_user_memory_region(kml, mem, false); 1374 if (err) { 1375 fprintf(stderr, "%s: error unregistering slot: %s\n", 1376 __func__, strerror(-err)); 1377 abort(); 1378 } 1379 start_addr += slot_size; 1380 size -= slot_size; 1381 kml->nr_used_slots--; 1382 } while (size); 1383 return; 1384 } 1385 1386 /* register the new slot */ 1387 do { 1388 slot_size = MIN(kvm_max_slot_size, size); 1389 mem = kvm_alloc_slot(kml); 1390 mem->as_id = kml->as_id; 1391 mem->memory_size = slot_size; 1392 mem->start_addr = start_addr; 1393 mem->ram_start_offset = ram_start_offset; 1394 mem->ram = ram; 1395 mem->flags = kvm_mem_flags(mr); 1396 kvm_slot_init_dirty_bitmap(mem); 1397 err = kvm_set_user_memory_region(kml, mem, true); 1398 if (err) { 1399 fprintf(stderr, "%s: error registering slot: %s\n", __func__, 1400 strerror(-err)); 1401 abort(); 1402 } 1403 start_addr += slot_size; 1404 ram_start_offset += slot_size; 1405 ram += slot_size; 1406 size -= slot_size; 1407 kml->nr_used_slots++; 1408 } while (size); 1409 } 1410 1411 static void *kvm_dirty_ring_reaper_thread(void *data) 1412 { 1413 KVMState *s = data; 1414 struct KVMDirtyRingReaper *r = &s->reaper; 1415 1416 rcu_register_thread(); 1417 1418 trace_kvm_dirty_ring_reaper("init"); 1419 1420 while (true) { 1421 r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT; 1422 trace_kvm_dirty_ring_reaper("wait"); 1423 /* 1424 * TODO: provide a smarter timeout rather than a constant? 1425 */ 1426 sleep(1); 1427 1428 /* keep sleeping so that dirtylimit not be interfered by reaper */ 1429 if (dirtylimit_in_service()) { 1430 continue; 1431 } 1432 1433 trace_kvm_dirty_ring_reaper("wakeup"); 1434 r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING; 1435 1436 bql_lock(); 1437 kvm_dirty_ring_reap(s, NULL); 1438 bql_unlock(); 1439 1440 r->reaper_iteration++; 1441 } 1442 1443 trace_kvm_dirty_ring_reaper("exit"); 1444 1445 rcu_unregister_thread(); 1446 1447 return NULL; 1448 } 1449 1450 static void kvm_dirty_ring_reaper_init(KVMState *s) 1451 { 1452 struct KVMDirtyRingReaper *r = &s->reaper; 1453 1454 qemu_thread_create(&r->reaper_thr, "kvm-reaper", 1455 kvm_dirty_ring_reaper_thread, 1456 s, QEMU_THREAD_JOINABLE); 1457 } 1458 1459 static int kvm_dirty_ring_init(KVMState *s) 1460 { 1461 uint32_t ring_size = s->kvm_dirty_ring_size; 1462 uint64_t ring_bytes = ring_size * sizeof(struct kvm_dirty_gfn); 1463 unsigned int capability = KVM_CAP_DIRTY_LOG_RING; 1464 int ret; 1465 1466 s->kvm_dirty_ring_size = 0; 1467 s->kvm_dirty_ring_bytes = 0; 1468 1469 /* Bail if the dirty ring size isn't specified */ 1470 if (!ring_size) { 1471 return 0; 1472 } 1473 1474 /* 1475 * Read the max supported pages. Fall back to dirty logging mode 1476 * if the dirty ring isn't supported. 
static int kvm_dirty_ring_init(KVMState *s)
{
    uint32_t ring_size = s->kvm_dirty_ring_size;
    uint64_t ring_bytes = ring_size * sizeof(struct kvm_dirty_gfn);
    unsigned int capability = KVM_CAP_DIRTY_LOG_RING;
    int ret;

    s->kvm_dirty_ring_size = 0;
    s->kvm_dirty_ring_bytes = 0;

    /* Bail if the dirty ring size isn't specified */
    if (!ring_size) {
        return 0;
    }

    /*
     * Read the max supported pages.  Fall back to dirty logging mode
     * if the dirty ring isn't supported.
     */
    ret = kvm_vm_check_extension(s, capability);
    if (ret <= 0) {
        capability = KVM_CAP_DIRTY_LOG_RING_ACQ_REL;
        ret = kvm_vm_check_extension(s, capability);
    }

    if (ret <= 0) {
        warn_report("KVM dirty ring not available, using bitmap method");
        return 0;
    }

    if (ring_bytes > ret) {
        error_report("KVM dirty ring size %" PRIu32 " too big "
                     "(maximum is %ld).  Please use a smaller value.",
                     ring_size, (long)ret / sizeof(struct kvm_dirty_gfn));
        return -EINVAL;
    }

    ret = kvm_vm_enable_cap(s, capability, 0, ring_bytes);
    if (ret) {
        error_report("Enabling of KVM dirty ring failed: %s. "
                     "Suggested minimum value is 1024.", strerror(-ret));
        return -EIO;
    }

    /* Enable the backup bitmap if it is supported */
    ret = kvm_vm_check_extension(s, KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP);
    if (ret > 0) {
        ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP, 0);
        if (ret) {
            error_report("Enabling of KVM dirty ring's backup bitmap failed: "
                         "%s. ", strerror(-ret));
            return -EIO;
        }

        s->kvm_dirty_ring_with_bitmap = true;
    }

    s->kvm_dirty_ring_size = ring_size;
    s->kvm_dirty_ring_bytes = ring_bytes;

    return 0;
}

static void kvm_region_add(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    KVMMemoryUpdate *update;

    update = g_new0(KVMMemoryUpdate, 1);
    update->section = *section;

    QSIMPLEQ_INSERT_TAIL(&kml->transaction_add, update, next);
}

static void kvm_region_del(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    KVMMemoryUpdate *update;

    update = g_new0(KVMMemoryUpdate, 1);
    update->section = *section;

    QSIMPLEQ_INSERT_TAIL(&kml->transaction_del, update, next);
}
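/*
 * Note on the flow (a sketch): kvm_region_add()/kvm_region_del() above
 * only queue KVMMemoryUpdate entries; nothing is sent to KVM until the
 * memory core invokes the commit hook below at the end of a
 * transaction.  Deferring lets overlapping delete+add pairs be applied
 * while ioctls are inhibited, which simulates an atomic memslot update.
 */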
static void kvm_region_commit(MemoryListener *listener)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener,
                                          listener);
    KVMMemoryUpdate *u1, *u2;
    bool need_inhibit = false;

    if (QSIMPLEQ_EMPTY(&kml->transaction_add) &&
        QSIMPLEQ_EMPTY(&kml->transaction_del)) {
        return;
    }

    /*
     * We have to be careful when regions to add overlap with ranges to remove.
     * We have to simulate atomic KVM memslot updates by making sure no ioctl()
     * is currently active.
     *
     * The lists are ordered by addresses, so it's easy to find overlaps.
     */
    u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
    u2 = QSIMPLEQ_FIRST(&kml->transaction_add);
    while (u1 && u2) {
        Range r1, r2;

        range_init_nofail(&r1, u1->section.offset_within_address_space,
                          int128_get64(u1->section.size));
        range_init_nofail(&r2, u2->section.offset_within_address_space,
                          int128_get64(u2->section.size));

        if (range_overlaps_range(&r1, &r2)) {
            need_inhibit = true;
            break;
        }
        if (range_lob(&r1) < range_lob(&r2)) {
            u1 = QSIMPLEQ_NEXT(u1, next);
        } else {
            u2 = QSIMPLEQ_NEXT(u2, next);
        }
    }

    kvm_slots_lock();
    if (need_inhibit) {
        accel_ioctl_inhibit_begin();
    }

    /* Remove all memslots before adding the new ones. */
    while (!QSIMPLEQ_EMPTY(&kml->transaction_del)) {
        u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
        QSIMPLEQ_REMOVE_HEAD(&kml->transaction_del, next);

        kvm_set_phys_mem(kml, &u1->section, false);
        memory_region_unref(u1->section.mr);

        g_free(u1);
    }
    while (!QSIMPLEQ_EMPTY(&kml->transaction_add)) {
        u1 = QSIMPLEQ_FIRST(&kml->transaction_add);
        QSIMPLEQ_REMOVE_HEAD(&kml->transaction_add, next);

        memory_region_ref(u1->section.mr);
        kvm_set_phys_mem(kml, &u1->section, true);

        g_free(u1);
    }

    if (need_inhibit) {
        accel_ioctl_inhibit_end();
    }
    kvm_slots_unlock();
}

static void kvm_log_sync(MemoryListener *listener,
                         MemoryRegionSection *section)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);

    kvm_slots_lock();
    kvm_physical_sync_dirty_bitmap(kml, section);
    kvm_slots_unlock();
}
1656 */ 1657 kvm_slot_reset_dirty_pages(mem); 1658 } 1659 } 1660 kvm_slots_unlock(); 1661 } 1662 1663 static void kvm_log_clear(MemoryListener *listener, 1664 MemoryRegionSection *section) 1665 { 1666 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 1667 int r; 1668 1669 r = kvm_physical_log_clear(kml, section); 1670 if (r < 0) { 1671 error_report_once("%s: kvm log clear failed: mr=%s " 1672 "offset=%"HWADDR_PRIx" size=%"PRIx64, __func__, 1673 section->mr->name, section->offset_within_region, 1674 int128_get64(section->size)); 1675 abort(); 1676 } 1677 } 1678 1679 static void kvm_mem_ioeventfd_add(MemoryListener *listener, 1680 MemoryRegionSection *section, 1681 bool match_data, uint64_t data, 1682 EventNotifier *e) 1683 { 1684 int fd = event_notifier_get_fd(e); 1685 int r; 1686 1687 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space, 1688 data, true, int128_get64(section->size), 1689 match_data); 1690 if (r < 0) { 1691 fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n", 1692 __func__, strerror(-r), -r); 1693 abort(); 1694 } 1695 } 1696 1697 static void kvm_mem_ioeventfd_del(MemoryListener *listener, 1698 MemoryRegionSection *section, 1699 bool match_data, uint64_t data, 1700 EventNotifier *e) 1701 { 1702 int fd = event_notifier_get_fd(e); 1703 int r; 1704 1705 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space, 1706 data, false, int128_get64(section->size), 1707 match_data); 1708 if (r < 0) { 1709 fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n", 1710 __func__, strerror(-r), -r); 1711 abort(); 1712 } 1713 } 1714 1715 static void kvm_io_ioeventfd_add(MemoryListener *listener, 1716 MemoryRegionSection *section, 1717 bool match_data, uint64_t data, 1718 EventNotifier *e) 1719 { 1720 int fd = event_notifier_get_fd(e); 1721 int r; 1722 1723 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space, 1724 data, true, int128_get64(section->size), 1725 match_data); 1726 if (r < 0) { 1727 fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n", 1728 __func__, strerror(-r), -r); 1729 abort(); 1730 } 1731 } 1732 1733 static void kvm_io_ioeventfd_del(MemoryListener *listener, 1734 MemoryRegionSection *section, 1735 bool match_data, uint64_t data, 1736 EventNotifier *e) 1737 1738 { 1739 int fd = event_notifier_get_fd(e); 1740 int r; 1741 1742 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space, 1743 data, false, int128_get64(section->size), 1744 match_data); 1745 if (r < 0) { 1746 fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n", 1747 __func__, strerror(-r), -r); 1748 abort(); 1749 } 1750 } 1751 1752 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, 1753 AddressSpace *as, int as_id, const char *name) 1754 { 1755 int i; 1756 1757 kml->slots = g_new0(KVMSlot, s->nr_slots); 1758 kml->as_id = as_id; 1759 1760 for (i = 0; i < s->nr_slots; i++) { 1761 kml->slots[i].slot = i; 1762 } 1763 1764 QSIMPLEQ_INIT(&kml->transaction_add); 1765 QSIMPLEQ_INIT(&kml->transaction_del); 1766 1767 kml->listener.region_add = kvm_region_add; 1768 kml->listener.region_del = kvm_region_del; 1769 kml->listener.commit = kvm_region_commit; 1770 kml->listener.log_start = kvm_log_start; 1771 kml->listener.log_stop = kvm_log_stop; 1772 kml->listener.priority = MEMORY_LISTENER_PRIORITY_ACCEL; 1773 kml->listener.name = name; 1774 1775 if (s->kvm_dirty_ring_size) { 1776 kml->listener.log_sync_global = kvm_log_sync_global; 1777 } else { 1778 kml->listener.log_sync = kvm_log_sync; 1779 kml->listener.log_clear = 
void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
                                  AddressSpace *as, int as_id, const char *name)
{
    int i;

    kml->slots = g_new0(KVMSlot, s->nr_slots);
    kml->as_id = as_id;

    for (i = 0; i < s->nr_slots; i++) {
        kml->slots[i].slot = i;
    }

    QSIMPLEQ_INIT(&kml->transaction_add);
    QSIMPLEQ_INIT(&kml->transaction_del);

    kml->listener.region_add = kvm_region_add;
    kml->listener.region_del = kvm_region_del;
    kml->listener.commit = kvm_region_commit;
    kml->listener.log_start = kvm_log_start;
    kml->listener.log_stop = kvm_log_stop;
    kml->listener.priority = MEMORY_LISTENER_PRIORITY_ACCEL;
    kml->listener.name = name;

    if (s->kvm_dirty_ring_size) {
        kml->listener.log_sync_global = kvm_log_sync_global;
    } else {
        kml->listener.log_sync = kvm_log_sync;
        kml->listener.log_clear = kvm_log_clear;
    }

    memory_listener_register(&kml->listener, as);

    for (i = 0; i < s->nr_as; ++i) {
        if (!s->as[i].as) {
            s->as[i].as = as;
            s->as[i].ml = kml;
            break;
        }
    }
}

static MemoryListener kvm_io_listener = {
    .name = "kvm-io",
    .coalesced_io_add = kvm_coalesce_pio_add,
    .coalesced_io_del = kvm_coalesce_pio_del,
    .eventfd_add = kvm_io_ioeventfd_add,
    .eventfd_del = kvm_io_ioeventfd_del,
    .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND,
};

int kvm_set_irq(KVMState *s, int irq, int level)
{
    struct kvm_irq_level event;
    int ret;

    assert(kvm_async_interrupts_enabled());

    event.level = level;
    event.irq = irq;
    ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
    if (ret < 0) {
        perror("kvm_set_irq");
        abort();
    }

    return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
}

#ifdef KVM_CAP_IRQ_ROUTING
typedef struct KVMMSIRoute {
    struct kvm_irq_routing_entry kroute;
    QTAILQ_ENTRY(KVMMSIRoute) entry;
} KVMMSIRoute;

static void set_gsi(KVMState *s, unsigned int gsi)
{
    set_bit(gsi, s->used_gsi_bitmap);
}

static void clear_gsi(KVMState *s, unsigned int gsi)
{
    clear_bit(gsi, s->used_gsi_bitmap);
}

void kvm_init_irq_routing(KVMState *s)
{
    int gsi_count;

    gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1;
    if (gsi_count > 0) {
        /* Round up so we can search ints using ffs */
        s->used_gsi_bitmap = bitmap_new(gsi_count);
        s->gsi_count = gsi_count;
    }

    s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
    s->nr_allocated_irq_routes = 0;

    kvm_arch_init_irq_routing(s);
}

void kvm_irqchip_commit_routes(KVMState *s)
{
    int ret;

    if (kvm_gsi_direct_mapping()) {
        return;
    }

    if (!kvm_gsi_routing_enabled()) {
        return;
    }

    s->irq_routes->flags = 0;
    trace_kvm_irqchip_commit_routes();
    ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
    assert(ret == 0);
}

static void kvm_add_routing_entry(KVMState *s,
                                  struct kvm_irq_routing_entry *entry)
{
    struct kvm_irq_routing_entry *new;
    int n, size;

    if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
        n = s->nr_allocated_irq_routes * 2;
        if (n < 64) {
            n = 64;
        }
        size = sizeof(struct kvm_irq_routing);
        size += n * sizeof(*new);
        s->irq_routes = g_realloc(s->irq_routes, size);
        s->nr_allocated_irq_routes = n;
    }
    n = s->irq_routes->nr++;
    new = &s->irq_routes->entries[n];

    *new = *entry;

    set_gsi(s, entry->gsi);
}

static int kvm_update_routing_entry(KVMState *s,
                                    struct kvm_irq_routing_entry *new_entry)
{
    struct kvm_irq_routing_entry *entry;
    int n;

    for (n = 0; n < s->irq_routes->nr; n++) {
        entry = &s->irq_routes->entries[n];
        if (entry->gsi != new_entry->gsi) {
            continue;
        }

        if (!memcmp(entry, new_entry, sizeof *entry)) {
            return 0;
        }

        *entry = *new_entry;

        return 0;
    }

    return -ESRCH;
}
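/*
 * Growth example for kvm_add_routing_entry() above (arithmetic only):
 * the allocation starts at 0 entries, jumps to 64 on first use, then
 * doubles (128, 256, ...), so inserting the 65th route triggers exactly
 * one g_realloc() to 128 entries plus the struct kvm_irq_routing
 * header.
 */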
void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
{
    struct kvm_irq_routing_entry e = {};

    assert(pin < s->gsi_count);

    e.gsi = irq;
    e.type = KVM_IRQ_ROUTING_IRQCHIP;
    e.flags = 0;
    e.u.irqchip.irqchip = irqchip;
    e.u.irqchip.pin = pin;
    kvm_add_routing_entry(s, &e);
}

void kvm_irqchip_release_virq(KVMState *s, int virq)
{
    struct kvm_irq_routing_entry *e;
    int i;

    if (kvm_gsi_direct_mapping()) {
        return;
    }

    for (i = 0; i < s->irq_routes->nr; i++) {
        e = &s->irq_routes->entries[i];
        if (e->gsi == virq) {
            s->irq_routes->nr--;
            *e = s->irq_routes->entries[s->irq_routes->nr];
        }
    }
    clear_gsi(s, virq);
    kvm_arch_release_virq_post(virq);
    trace_kvm_irqchip_release_virq(virq);
}

void kvm_irqchip_add_change_notifier(Notifier *n)
{
    notifier_list_add(&kvm_irqchip_change_notifiers, n);
}

void kvm_irqchip_remove_change_notifier(Notifier *n)
{
    notifier_remove(n);
}

void kvm_irqchip_change_notify(void)
{
    notifier_list_notify(&kvm_irqchip_change_notifiers, NULL);
}

static int kvm_irqchip_get_virq(KVMState *s)
{
    int next_virq;

    /* Return the lowest unused GSI in the bitmap */
    next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count);
    if (next_virq >= s->gsi_count) {
        return -ENOSPC;
    } else {
        return next_virq;
    }
}

int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
{
    struct kvm_msi msi;

    msi.address_lo = (uint32_t)msg.address;
    msi.address_hi = msg.address >> 32;
    msi.data = le32_to_cpu(msg.data);
    msi.flags = 0;
    memset(msi.pad, 0, sizeof(msi.pad));

    return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
}
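/*
 * Packing example for kvm_irqchip_send_msi() (values are illustrative):
 * an x86 MSI message with address 0xfee01004 and data 0x4021 is split
 * into address_lo = 0xfee01004, address_hi = 0 and data = 0x4021; only
 * messages targeting addresses above 4 GiB populate address_hi.
 */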
int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev)
{
    struct kvm_irq_routing_entry kroute = {};
    int virq;
    KVMState *s = c->s;
    MSIMessage msg = {0, 0};

    if (pci_available && dev) {
        msg = pci_get_msi_message(dev, vector);
    }

    if (kvm_gsi_direct_mapping()) {
        return kvm_arch_msi_data_to_gsi(msg.data);
    }

    if (!kvm_gsi_routing_enabled()) {
        return -ENOSYS;
    }

    virq = kvm_irqchip_get_virq(s);
    if (virq < 0) {
        return virq;
    }

    kroute.gsi = virq;
    kroute.type = KVM_IRQ_ROUTING_MSI;
    kroute.flags = 0;
    kroute.u.msi.address_lo = (uint32_t)msg.address;
    kroute.u.msi.address_hi = msg.address >> 32;
    kroute.u.msi.data = le32_to_cpu(msg.data);
    if (pci_available && kvm_msi_devid_required()) {
        kroute.flags = KVM_MSI_VALID_DEVID;
        kroute.u.msi.devid = pci_requester_id(dev);
    }
    if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
        kvm_irqchip_release_virq(s, virq);
        return -EINVAL;
    }

    if (s->irq_routes->nr < s->gsi_count) {
        trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A",
                                        vector, virq);

        kvm_add_routing_entry(s, &kroute);
        kvm_arch_add_msi_route_post(&kroute, vector, dev);
        c->changes++;
    } else {
        kvm_irqchip_release_virq(s, virq);
        return -ENOSPC;
    }

    return virq;
}

int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
                                 PCIDevice *dev)
{
    struct kvm_irq_routing_entry kroute = {};

    if (kvm_gsi_direct_mapping()) {
        return 0;
    }

    if (!kvm_irqchip_in_kernel()) {
        return -ENOSYS;
    }

    kroute.gsi = virq;
    kroute.type = KVM_IRQ_ROUTING_MSI;
    kroute.flags = 0;
    kroute.u.msi.address_lo = (uint32_t)msg.address;
    kroute.u.msi.address_hi = msg.address >> 32;
    kroute.u.msi.data = le32_to_cpu(msg.data);
    if (pci_available && kvm_msi_devid_required()) {
        kroute.flags = KVM_MSI_VALID_DEVID;
        kroute.u.msi.devid = pci_requester_id(dev);
    }
    if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
        return -EINVAL;
    }

    trace_kvm_irqchip_update_msi_route(virq);

    return kvm_update_routing_entry(s, &kroute);
}
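/*
 * Typical call pattern (a sketch; kvm_irqchip_begin_route_changes() and
 * kvm_irqchip_commit_route_changes() are the batching helpers declared
 * in sysemu/kvm.h):
 *
 *     KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state);
 *     int virq = kvm_irqchip_add_msi_route(&c, vector, pci_dev);
 *     kvm_irqchip_commit_route_changes(&c);
 *
 * The commit helper only issues KVM_SET_GSI_ROUTING when c.changes is
 * non-zero, so several route additions collapse into a single ioctl.
 */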
2112 */ 2113 kvm_resample_fd_insert(virq, resample); 2114 } else { 2115 irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE; 2116 irqfd.resamplefd = rfd; 2117 } 2118 } else if (!assign) { 2119 if (kvm_irqchip_is_split()) { 2120 kvm_resample_fd_remove(virq); 2121 } 2122 } 2123 2124 return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd); 2125 } 2126 2127 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) 2128 { 2129 struct kvm_irq_routing_entry kroute = {}; 2130 int virq; 2131 2132 if (!kvm_gsi_routing_enabled()) { 2133 return -ENOSYS; 2134 } 2135 2136 virq = kvm_irqchip_get_virq(s); 2137 if (virq < 0) { 2138 return virq; 2139 } 2140 2141 kroute.gsi = virq; 2142 kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER; 2143 kroute.flags = 0; 2144 kroute.u.adapter.summary_addr = adapter->summary_addr; 2145 kroute.u.adapter.ind_addr = adapter->ind_addr; 2146 kroute.u.adapter.summary_offset = adapter->summary_offset; 2147 kroute.u.adapter.ind_offset = adapter->ind_offset; 2148 kroute.u.adapter.adapter_id = adapter->adapter_id; 2149 2150 kvm_add_routing_entry(s, &kroute); 2151 2152 return virq; 2153 } 2154 2155 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) 2156 { 2157 struct kvm_irq_routing_entry kroute = {}; 2158 int virq; 2159 2160 if (!kvm_gsi_routing_enabled()) { 2161 return -ENOSYS; 2162 } 2163 if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) { 2164 return -ENOSYS; 2165 } 2166 virq = kvm_irqchip_get_virq(s); 2167 if (virq < 0) { 2168 return virq; 2169 } 2170 2171 kroute.gsi = virq; 2172 kroute.type = KVM_IRQ_ROUTING_HV_SINT; 2173 kroute.flags = 0; 2174 kroute.u.hv_sint.vcpu = vcpu; 2175 kroute.u.hv_sint.sint = sint; 2176 2177 kvm_add_routing_entry(s, &kroute); 2178 kvm_irqchip_commit_routes(s); 2179 2180 return virq; 2181 } 2182 2183 #else /* !KVM_CAP_IRQ_ROUTING */ 2184 2185 void kvm_init_irq_routing(KVMState *s) 2186 { 2187 } 2188 2189 void kvm_irqchip_release_virq(KVMState *s, int virq) 2190 { 2191 } 2192 2193 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) 2194 { 2195 abort(); 2196 } 2197 2198 int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev) 2199 { 2200 return -ENOSYS; 2201 } 2202 2203 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) 2204 { 2205 return -ENOSYS; 2206 } 2207 2208 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) 2209 { 2210 return -ENOSYS; 2211 } 2212 2213 static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event, 2214 EventNotifier *resample, int virq, 2215 bool assign) 2216 { 2217 abort(); 2218 } 2219 2220 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg) 2221 { 2222 return -ENOSYS; 2223 } 2224 #endif /* !KVM_CAP_IRQ_ROUTING */ 2225 2226 int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, 2227 EventNotifier *rn, int virq) 2228 { 2229 return kvm_irqchip_assign_irqfd(s, n, rn, virq, true); 2230 } 2231 2232 int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, 2233 int virq) 2234 { 2235 return kvm_irqchip_assign_irqfd(s, n, NULL, virq, false); 2236 } 2237 2238 int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, 2239 EventNotifier *rn, qemu_irq irq) 2240 { 2241 gpointer key, gsi; 2242 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); 2243 2244 if (!found) { 2245 return -ENXIO; 2246 } 2247 return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi)); 2248 } 2249 2250 int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, 2251 qemu_irq irq) 2252 { 
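    /*
     * Editor's note (illustrative sketch, assumes the usual board
     * wiring): like the _add_ variant above, the qemu_irq is resolved
     * to its GSI through the gsimap hash table, which is populated via
     *
     *     kvm_irqchip_set_qemuirq_gsi(s, irq, gsi);
     *
     * so add and remove must be passed the same qemu_irq object.
     */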
2253 gpointer key, gsi; 2254 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); 2255 2256 if (!found) { 2257 return -ENXIO; 2258 } 2259 return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi)); 2260 } 2261 2262 void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi) 2263 { 2264 g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi)); 2265 } 2266 2267 static void kvm_irqchip_create(KVMState *s) 2268 { 2269 int ret; 2270 2271 assert(s->kernel_irqchip_split != ON_OFF_AUTO_AUTO); 2272 if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) { 2273 ; 2274 } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) { 2275 ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0); 2276 if (ret < 0) { 2277 fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret)); 2278 exit(1); 2279 } 2280 } else { 2281 return; 2282 } 2283 2284 if (kvm_check_extension(s, KVM_CAP_IRQFD) <= 0) { 2285 fprintf(stderr, "kvm: irqfd not implemented\n"); 2286 exit(1); 2287 } 2288 2289 /* First probe and see if there's a arch-specific hook to create the 2290 * in-kernel irqchip for us */ 2291 ret = kvm_arch_irqchip_create(s); 2292 if (ret == 0) { 2293 if (s->kernel_irqchip_split == ON_OFF_AUTO_ON) { 2294 error_report("Split IRQ chip mode not supported."); 2295 exit(1); 2296 } else { 2297 ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP); 2298 } 2299 } 2300 if (ret < 0) { 2301 fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret)); 2302 exit(1); 2303 } 2304 2305 kvm_kernel_irqchip = true; 2306 /* If we have an in-kernel IRQ chip then we must have asynchronous 2307 * interrupt delivery (though the reverse is not necessarily true) 2308 */ 2309 kvm_async_interrupts_allowed = true; 2310 kvm_halt_in_kernel_allowed = true; 2311 2312 kvm_init_irq_routing(s); 2313 2314 s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal); 2315 } 2316 2317 /* Find number of supported CPUs using the recommended 2318 * procedure from the kernel API documentation to cope with 2319 * older kernels that may be missing capabilities. 2320 */ 2321 static int kvm_recommended_vcpus(KVMState *s) 2322 { 2323 int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS); 2324 return (ret) ? ret : 4; 2325 } 2326 2327 static int kvm_max_vcpus(KVMState *s) 2328 { 2329 int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS); 2330 return (ret) ? ret : kvm_recommended_vcpus(s); 2331 } 2332 2333 static int kvm_max_vcpu_id(KVMState *s) 2334 { 2335 int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID); 2336 return (ret) ? ret : kvm_max_vcpus(s); 2337 } 2338 2339 bool kvm_vcpu_id_is_valid(int vcpu_id) 2340 { 2341 KVMState *s = KVM_STATE(current_accel()); 2342 return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s); 2343 } 2344 2345 bool kvm_dirty_ring_enabled(void) 2346 { 2347 return kvm_state->kvm_dirty_ring_size ? 
true : false; 2348 } 2349 2350 static void query_stats_cb(StatsResultList **result, StatsTarget target, 2351 strList *names, strList *targets, Error **errp); 2352 static void query_stats_schemas_cb(StatsSchemaList **result, Error **errp); 2353 2354 uint32_t kvm_dirty_ring_size(void) 2355 { 2356 return kvm_state->kvm_dirty_ring_size; 2357 } 2358 2359 static int kvm_init(MachineState *ms) 2360 { 2361 MachineClass *mc = MACHINE_GET_CLASS(ms); 2362 static const char upgrade_note[] = 2363 "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n" 2364 "(see http://sourceforge.net/projects/kvm).\n"; 2365 const struct { 2366 const char *name; 2367 int num; 2368 } num_cpus[] = { 2369 { "SMP", ms->smp.cpus }, 2370 { "hotpluggable", ms->smp.max_cpus }, 2371 { /* end of list */ } 2372 }, *nc = num_cpus; 2373 int soft_vcpus_limit, hard_vcpus_limit; 2374 KVMState *s; 2375 const KVMCapabilityInfo *missing_cap; 2376 int ret; 2377 int type; 2378 uint64_t dirty_log_manual_caps; 2379 2380 qemu_mutex_init(&kml_slots_lock); 2381 2382 s = KVM_STATE(ms->accelerator); 2383 2384 /* 2385 * On systems where the kernel can support different base page 2386 * sizes, host page size may be different from TARGET_PAGE_SIZE, 2387 * even with KVM. TARGET_PAGE_SIZE is assumed to be the minimum 2388 * page size for the system though. 2389 */ 2390 assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size()); 2391 2392 s->sigmask_len = 8; 2393 accel_blocker_init(); 2394 2395 #ifdef TARGET_KVM_HAVE_GUEST_DEBUG 2396 QTAILQ_INIT(&s->kvm_sw_breakpoints); 2397 #endif 2398 QLIST_INIT(&s->kvm_parked_vcpus); 2399 s->fd = qemu_open_old(s->device ?: "/dev/kvm", O_RDWR); 2400 if (s->fd == -1) { 2401 fprintf(stderr, "Could not access KVM kernel module: %m\n"); 2402 ret = -errno; 2403 goto err; 2404 } 2405 2406 ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0); 2407 if (ret < KVM_API_VERSION) { 2408 if (ret >= 0) { 2409 ret = -EINVAL; 2410 } 2411 fprintf(stderr, "kvm version too old\n"); 2412 goto err; 2413 } 2414 2415 if (ret > KVM_API_VERSION) { 2416 ret = -EINVAL; 2417 fprintf(stderr, "kvm version not supported\n"); 2418 goto err; 2419 } 2420 2421 kvm_supported_memory_attributes = kvm_check_extension(s, KVM_CAP_MEMORY_ATTRIBUTES); 2422 kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT); 2423 s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS); 2424 2425 /* If unspecified, use the default value */ 2426 if (!s->nr_slots) { 2427 s->nr_slots = 32; 2428 } 2429 2430 s->nr_as = kvm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE); 2431 if (s->nr_as <= 1) { 2432 s->nr_as = 1; 2433 } 2434 s->as = g_new0(struct KVMAs, s->nr_as); 2435 2436 if (object_property_find(OBJECT(current_machine), "kvm-type")) { 2437 g_autofree char *kvm_type = object_property_get_str(OBJECT(current_machine), 2438 "kvm-type", 2439 &error_abort); 2440 type = mc->kvm_type(ms, kvm_type); 2441 } else if (mc->kvm_type) { 2442 type = mc->kvm_type(ms, NULL); 2443 } else { 2444 type = kvm_arch_get_default_type(ms); 2445 } 2446 2447 if (type < 0) { 2448 ret = -EINVAL; 2449 goto err; 2450 } 2451 2452 do { 2453 ret = kvm_ioctl(s, KVM_CREATE_VM, type); 2454 } while (ret == -EINTR); 2455 2456 if (ret < 0) { 2457 fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret, 2458 strerror(-ret)); 2459 2460 #ifdef TARGET_S390X 2461 if (ret == -EINVAL) { 2462 fprintf(stderr, 2463 "Host kernel setup problem detected. 
Please verify:\n"); 2464 fprintf(stderr, "- for kernels supporting the switch_amode or" 2465 " user_mode parameters, whether\n"); 2466 fprintf(stderr, 2467 " user space is running in primary address space\n"); 2468 fprintf(stderr, 2469 "- for kernels supporting the vm.allocate_pgste sysctl, " 2470 "whether it is enabled\n"); 2471 } 2472 #elif defined(TARGET_PPC) 2473 if (ret == -EINVAL) { 2474 fprintf(stderr, 2475 "PPC KVM module is not loaded. Try modprobe kvm_%s.\n", 2476 (type == 2) ? "pr" : "hv"); 2477 } 2478 #endif 2479 goto err; 2480 } 2481 2482 s->vmfd = ret; 2483 2484 /* check the vcpu limits */ 2485 soft_vcpus_limit = kvm_recommended_vcpus(s); 2486 hard_vcpus_limit = kvm_max_vcpus(s); 2487 2488 while (nc->name) { 2489 if (nc->num > soft_vcpus_limit) { 2490 warn_report("Number of %s cpus requested (%d) exceeds " 2491 "the recommended cpus supported by KVM (%d)", 2492 nc->name, nc->num, soft_vcpus_limit); 2493 2494 if (nc->num > hard_vcpus_limit) { 2495 fprintf(stderr, "Number of %s cpus requested (%d) exceeds " 2496 "the maximum cpus supported by KVM (%d)\n", 2497 nc->name, nc->num, hard_vcpus_limit); 2498 exit(1); 2499 } 2500 } 2501 nc++; 2502 } 2503 2504 missing_cap = kvm_check_extension_list(s, kvm_required_capabilites); 2505 if (!missing_cap) { 2506 missing_cap = 2507 kvm_check_extension_list(s, kvm_arch_required_capabilities); 2508 } 2509 if (missing_cap) { 2510 ret = -EINVAL; 2511 fprintf(stderr, "kvm does not support %s\n%s", 2512 missing_cap->name, upgrade_note); 2513 goto err; 2514 } 2515 2516 s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO); 2517 s->coalesced_pio = s->coalesced_mmio && 2518 kvm_check_extension(s, KVM_CAP_COALESCED_PIO); 2519 2520 /* 2521 * Enable KVM dirty ring if supported, otherwise fall back to 2522 * dirty logging mode 2523 */ 2524 ret = kvm_dirty_ring_init(s); 2525 if (ret < 0) { 2526 goto err; 2527 } 2528 2529 /* 2530 * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is 2531 * enabled. More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no 2532 * page is wr-protected initially, which is against how kvm dirty ring is 2533 * usage - kvm dirty ring requires all pages are wr-protected at the very 2534 * beginning. Enabling this feature for dirty ring causes data corruption. 2535 * 2536 * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log, 2537 * we may expect a higher stall time when starting the migration. In the 2538 * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too: 2539 * instead of clearing dirty bit, it can be a way to explicitly wr-protect 2540 * guest pages. 2541 */ 2542 if (!s->kvm_dirty_ring_size) { 2543 dirty_log_manual_caps = 2544 kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); 2545 dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | 2546 KVM_DIRTY_LOG_INITIALLY_SET); 2547 s->manual_dirty_log_protect = dirty_log_manual_caps; 2548 if (dirty_log_manual_caps) { 2549 ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0, 2550 dirty_log_manual_caps); 2551 if (ret) { 2552 warn_report("Trying to enable capability %"PRIu64" of " 2553 "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. " 2554 "Falling back to the legacy mode. 
", 2555 dirty_log_manual_caps); 2556 s->manual_dirty_log_protect = 0; 2557 } 2558 } 2559 } 2560 2561 #ifdef KVM_CAP_VCPU_EVENTS 2562 s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS); 2563 #endif 2564 s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE); 2565 2566 s->irq_set_ioctl = KVM_IRQ_LINE; 2567 if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) { 2568 s->irq_set_ioctl = KVM_IRQ_LINE_STATUS; 2569 } 2570 2571 kvm_readonly_mem_allowed = 2572 (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0); 2573 2574 kvm_resamplefds_allowed = 2575 (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0); 2576 2577 kvm_vm_attributes_allowed = 2578 (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0); 2579 2580 #ifdef TARGET_KVM_HAVE_GUEST_DEBUG 2581 kvm_has_guest_debug = 2582 (kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG) > 0); 2583 #endif 2584 2585 kvm_sstep_flags = 0; 2586 if (kvm_has_guest_debug) { 2587 kvm_sstep_flags = SSTEP_ENABLE; 2588 2589 #if defined TARGET_KVM_HAVE_GUEST_DEBUG 2590 int guest_debug_flags = 2591 kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG2); 2592 2593 if (guest_debug_flags & KVM_GUESTDBG_BLOCKIRQ) { 2594 kvm_sstep_flags |= SSTEP_NOIRQ; 2595 } 2596 #endif 2597 } 2598 2599 kvm_state = s; 2600 2601 ret = kvm_arch_init(ms, s); 2602 if (ret < 0) { 2603 goto err; 2604 } 2605 2606 if (s->kernel_irqchip_split == ON_OFF_AUTO_AUTO) { 2607 s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; 2608 } 2609 2610 qemu_register_reset(kvm_unpoison_all, NULL); 2611 2612 if (s->kernel_irqchip_allowed) { 2613 kvm_irqchip_create(s); 2614 } 2615 2616 s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add; 2617 s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del; 2618 s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region; 2619 s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region; 2620 2621 kvm_memory_listener_register(s, &s->memory_listener, 2622 &address_space_memory, 0, "kvm-memory"); 2623 memory_listener_register(&kvm_io_listener, 2624 &address_space_io); 2625 2626 s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU); 2627 if (!s->sync_mmu) { 2628 ret = ram_block_discard_disable(true); 2629 assert(!ret); 2630 } 2631 2632 if (s->kvm_dirty_ring_size) { 2633 kvm_dirty_ring_reaper_init(s); 2634 } 2635 2636 if (kvm_check_extension(kvm_state, KVM_CAP_BINARY_STATS_FD)) { 2637 add_stats_callbacks(STATS_PROVIDER_KVM, query_stats_cb, 2638 query_stats_schemas_cb); 2639 } 2640 2641 return 0; 2642 2643 err: 2644 assert(ret < 0); 2645 if (s->vmfd >= 0) { 2646 close(s->vmfd); 2647 } 2648 if (s->fd != -1) { 2649 close(s->fd); 2650 } 2651 g_free(s->as); 2652 g_free(s->memory_listener.slots); 2653 2654 return ret; 2655 } 2656 2657 void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len) 2658 { 2659 s->sigmask_len = sigmask_len; 2660 } 2661 2662 static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction, 2663 int size, uint32_t count) 2664 { 2665 int i; 2666 uint8_t *ptr = data; 2667 2668 for (i = 0; i < count; i++) { 2669 address_space_rw(&address_space_io, port, attrs, 2670 ptr, size, 2671 direction == KVM_EXIT_IO_OUT); 2672 ptr += size; 2673 } 2674 } 2675 2676 static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run) 2677 { 2678 int i; 2679 2680 fprintf(stderr, "KVM internal error. 
Suberror: %d\n", 2681 run->internal.suberror); 2682 2683 for (i = 0; i < run->internal.ndata; ++i) { 2684 fprintf(stderr, "extra data[%d]: 0x%016"PRIx64"\n", 2685 i, (uint64_t)run->internal.data[i]); 2686 } 2687 if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) { 2688 fprintf(stderr, "emulation failure\n"); 2689 if (!kvm_arch_stop_on_emulation_error(cpu)) { 2690 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); 2691 return EXCP_INTERRUPT; 2692 } 2693 } 2694 /* FIXME: Should trigger a qmp message to let management know 2695 * something went wrong. 2696 */ 2697 return -1; 2698 } 2699 2700 void kvm_flush_coalesced_mmio_buffer(void) 2701 { 2702 KVMState *s = kvm_state; 2703 2704 if (!s || s->coalesced_flush_in_progress) { 2705 return; 2706 } 2707 2708 s->coalesced_flush_in_progress = true; 2709 2710 if (s->coalesced_mmio_ring) { 2711 struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring; 2712 while (ring->first != ring->last) { 2713 struct kvm_coalesced_mmio *ent; 2714 2715 ent = &ring->coalesced_mmio[ring->first]; 2716 2717 if (ent->pio == 1) { 2718 address_space_write(&address_space_io, ent->phys_addr, 2719 MEMTXATTRS_UNSPECIFIED, ent->data, 2720 ent->len); 2721 } else { 2722 cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len); 2723 } 2724 smp_wmb(); 2725 ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX; 2726 } 2727 } 2728 2729 s->coalesced_flush_in_progress = false; 2730 } 2731 2732 static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg) 2733 { 2734 if (!cpu->vcpu_dirty && !kvm_state->guest_state_protected) { 2735 int ret = kvm_arch_get_registers(cpu); 2736 if (ret) { 2737 error_report("Failed to get registers: %s", strerror(-ret)); 2738 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); 2739 vm_stop(RUN_STATE_INTERNAL_ERROR); 2740 } 2741 2742 cpu->vcpu_dirty = true; 2743 } 2744 } 2745 2746 void kvm_cpu_synchronize_state(CPUState *cpu) 2747 { 2748 if (!cpu->vcpu_dirty && !kvm_state->guest_state_protected) { 2749 run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL); 2750 } 2751 } 2752 2753 static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg) 2754 { 2755 int ret = kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE); 2756 if (ret) { 2757 error_report("Failed to put registers after reset: %s", strerror(-ret)); 2758 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); 2759 vm_stop(RUN_STATE_INTERNAL_ERROR); 2760 } 2761 2762 cpu->vcpu_dirty = false; 2763 } 2764 2765 void kvm_cpu_synchronize_post_reset(CPUState *cpu) 2766 { 2767 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL); 2768 } 2769 2770 static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg) 2771 { 2772 int ret = kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE); 2773 if (ret) { 2774 error_report("Failed to put registers after init: %s", strerror(-ret)); 2775 exit(1); 2776 } 2777 2778 cpu->vcpu_dirty = false; 2779 } 2780 2781 void kvm_cpu_synchronize_post_init(CPUState *cpu) 2782 { 2783 if (!kvm_state->guest_state_protected) { 2784 /* 2785 * This runs before the machine_init_done notifiers, and is the last 2786 * opportunity to synchronize the state of confidential guests. 
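 *
 * (Editor's illustration of the intended ordering, roughly)
 *
 *     kvm_cpu_synchronize_post_init(cpu);   // KVM_PUT_FULL_STATE
 *     ... machine_init_done notifiers ...   // may seal guest state
 *     // beyond this point, a protected guest's registers can no
 *     // longer be pushed from QEMU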
2787 */ 2788 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL); 2789 } 2790 } 2791 2792 static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg) 2793 { 2794 cpu->vcpu_dirty = true; 2795 } 2796 2797 void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu) 2798 { 2799 run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); 2800 } 2801 2802 #ifdef KVM_HAVE_MCE_INJECTION 2803 static __thread void *pending_sigbus_addr; 2804 static __thread int pending_sigbus_code; 2805 static __thread bool have_sigbus_pending; 2806 #endif 2807 2808 static void kvm_cpu_kick(CPUState *cpu) 2809 { 2810 qatomic_set(&cpu->kvm_run->immediate_exit, 1); 2811 } 2812 2813 static void kvm_cpu_kick_self(void) 2814 { 2815 if (kvm_immediate_exit) { 2816 kvm_cpu_kick(current_cpu); 2817 } else { 2818 qemu_cpu_kick_self(); 2819 } 2820 } 2821 2822 static void kvm_eat_signals(CPUState *cpu) 2823 { 2824 struct timespec ts = { 0, 0 }; 2825 siginfo_t siginfo; 2826 sigset_t waitset; 2827 sigset_t chkset; 2828 int r; 2829 2830 if (kvm_immediate_exit) { 2831 qatomic_set(&cpu->kvm_run->immediate_exit, 0); 2832 /* Write kvm_run->immediate_exit before the cpu->exit_request 2833 * write in kvm_cpu_exec. 2834 */ 2835 smp_wmb(); 2836 return; 2837 } 2838 2839 sigemptyset(&waitset); 2840 sigaddset(&waitset, SIG_IPI); 2841 2842 do { 2843 r = sigtimedwait(&waitset, &siginfo, &ts); 2844 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) { 2845 perror("sigtimedwait"); 2846 exit(1); 2847 } 2848 2849 r = sigpending(&chkset); 2850 if (r == -1) { 2851 perror("sigpending"); 2852 exit(1); 2853 } 2854 } while (sigismember(&chkset, SIG_IPI)); 2855 } 2856 2857 int kvm_cpu_exec(CPUState *cpu) 2858 { 2859 struct kvm_run *run = cpu->kvm_run; 2860 int ret, run_ret; 2861 2862 trace_kvm_cpu_exec(); 2863 2864 if (kvm_arch_process_async_events(cpu)) { 2865 qatomic_set(&cpu->exit_request, 0); 2866 return EXCP_HLT; 2867 } 2868 2869 bql_unlock(); 2870 cpu_exec_start(cpu); 2871 2872 do { 2873 MemTxAttrs attrs; 2874 2875 if (cpu->vcpu_dirty) { 2876 ret = kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE); 2877 if (ret) { 2878 error_report("Failed to put registers after init: %s", 2879 strerror(-ret)); 2880 ret = -1; 2881 break; 2882 } 2883 2884 cpu->vcpu_dirty = false; 2885 } 2886 2887 kvm_arch_pre_run(cpu, run); 2888 if (qatomic_read(&cpu->exit_request)) { 2889 trace_kvm_interrupt_exit_request(); 2890 /* 2891 * KVM requires us to reenter the kernel after IO exits to complete 2892 * instruction emulation. This self-signal will ensure that we 2893 * leave ASAP again. 2894 */ 2895 kvm_cpu_kick_self(); 2896 } 2897 2898 /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit. 2899 * Matching barrier in kvm_eat_signals. 
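 *
 * (Editor's sketch of the pairing, illustrative only)
 *
 *     kvm_eat_signals()                    this thread
 *     -----------------                    -----------
 *     run->immediate_exit = 0              read cpu->exit_request
 *     smp_wmb()                            smp_rmb()
 *     (caller clears exit_request)         KVM_RUN reads immediate_exit
 *
 * so a vCPU that already observes the cleared exit_request cannot
 * enter KVM_RUN with a stale immediate_exit still set.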
2900 */ 2901 smp_rmb(); 2902 2903 run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0); 2904 2905 attrs = kvm_arch_post_run(cpu, run); 2906 2907 #ifdef KVM_HAVE_MCE_INJECTION 2908 if (unlikely(have_sigbus_pending)) { 2909 bql_lock(); 2910 kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code, 2911 pending_sigbus_addr); 2912 have_sigbus_pending = false; 2913 bql_unlock(); 2914 } 2915 #endif 2916 2917 if (run_ret < 0) { 2918 if (run_ret == -EINTR || run_ret == -EAGAIN) { 2919 trace_kvm_io_window_exit(); 2920 kvm_eat_signals(cpu); 2921 ret = EXCP_INTERRUPT; 2922 break; 2923 } 2924 fprintf(stderr, "error: kvm run failed %s\n", 2925 strerror(-run_ret)); 2926 #ifdef TARGET_PPC 2927 if (run_ret == -EBUSY) { 2928 fprintf(stderr, 2929 "This is probably because your SMT is enabled.\n" 2930 "VCPU can only run on primary threads with all " 2931 "secondary threads offline.\n"); 2932 } 2933 #endif 2934 ret = -1; 2935 break; 2936 } 2937 2938 trace_kvm_run_exit(cpu->cpu_index, run->exit_reason); 2939 switch (run->exit_reason) { 2940 case KVM_EXIT_IO: 2941 /* Called outside BQL */ 2942 kvm_handle_io(run->io.port, attrs, 2943 (uint8_t *)run + run->io.data_offset, 2944 run->io.direction, 2945 run->io.size, 2946 run->io.count); 2947 ret = 0; 2948 break; 2949 case KVM_EXIT_MMIO: 2950 /* Called outside BQL */ 2951 address_space_rw(&address_space_memory, 2952 run->mmio.phys_addr, attrs, 2953 run->mmio.data, 2954 run->mmio.len, 2955 run->mmio.is_write); 2956 ret = 0; 2957 break; 2958 case KVM_EXIT_IRQ_WINDOW_OPEN: 2959 ret = EXCP_INTERRUPT; 2960 break; 2961 case KVM_EXIT_SHUTDOWN: 2962 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); 2963 ret = EXCP_INTERRUPT; 2964 break; 2965 case KVM_EXIT_UNKNOWN: 2966 fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n", 2967 (uint64_t)run->hw.hardware_exit_reason); 2968 ret = -1; 2969 break; 2970 case KVM_EXIT_INTERNAL_ERROR: 2971 ret = kvm_handle_internal_error(cpu, run); 2972 break; 2973 case KVM_EXIT_DIRTY_RING_FULL: 2974 /* 2975 * We shouldn't continue if the dirty ring of this vCPU is 2976 * still full; reap it (which issues KVM_RESET_DIRTY_RINGS) before re-entering the guest. 2977 */ 2978 trace_kvm_dirty_ring_full(cpu->cpu_index); 2979 bql_lock(); 2980 /* 2981 * We throttle the vCPU by making it sleep once it exits the kernel 2982 * because its dirty ring is full. In the dirtylimit scenario, reaping 2983 * all vCPUs when a single vCPU's ring fills would also empty the other 2984 * rings and let those vCPUs skip their throttle sleep, so reap only the vCPU whose ring is full. 
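 *
 * (Editor's illustration)
 *
 *     kvm_dirty_ring_reap(kvm_state, cpu);   // dirtylimit: this vCPU only
 *     kvm_dirty_ring_reap(kvm_state, NULL);  // otherwise: all vCPUs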
2985 */ 2986 if (dirtylimit_in_service()) { 2987 kvm_dirty_ring_reap(kvm_state, cpu); 2988 } else { 2989 kvm_dirty_ring_reap(kvm_state, NULL); 2990 } 2991 bql_unlock(); 2992 dirtylimit_vcpu_execute(cpu); 2993 ret = 0; 2994 break; 2995 case KVM_EXIT_SYSTEM_EVENT: 2996 trace_kvm_run_exit_system_event(cpu->cpu_index, run->system_event.type); 2997 switch (run->system_event.type) { 2998 case KVM_SYSTEM_EVENT_SHUTDOWN: 2999 qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); 3000 ret = EXCP_INTERRUPT; 3001 break; 3002 case KVM_SYSTEM_EVENT_RESET: 3003 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); 3004 ret = EXCP_INTERRUPT; 3005 break; 3006 case KVM_SYSTEM_EVENT_CRASH: 3007 kvm_cpu_synchronize_state(cpu); 3008 bql_lock(); 3009 qemu_system_guest_panicked(cpu_get_crash_info(cpu)); 3010 bql_unlock(); 3011 ret = 0; 3012 break; 3013 default: 3014 ret = kvm_arch_handle_exit(cpu, run); 3015 break; 3016 } 3017 break; 3018 default: 3019 ret = kvm_arch_handle_exit(cpu, run); 3020 break; 3021 } 3022 } while (ret == 0); 3023 3024 cpu_exec_end(cpu); 3025 bql_lock(); 3026 3027 if (ret < 0) { 3028 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); 3029 vm_stop(RUN_STATE_INTERNAL_ERROR); 3030 } 3031 3032 qatomic_set(&cpu->exit_request, 0); 3033 return ret; 3034 } 3035 3036 int kvm_ioctl(KVMState *s, int type, ...) 3037 { 3038 int ret; 3039 void *arg; 3040 va_list ap; 3041 3042 va_start(ap, type); 3043 arg = va_arg(ap, void *); 3044 va_end(ap); 3045 3046 trace_kvm_ioctl(type, arg); 3047 ret = ioctl(s->fd, type, arg); 3048 if (ret == -1) { 3049 ret = -errno; 3050 } 3051 return ret; 3052 } 3053 3054 int kvm_vm_ioctl(KVMState *s, int type, ...) 3055 { 3056 int ret; 3057 void *arg; 3058 va_list ap; 3059 3060 va_start(ap, type); 3061 arg = va_arg(ap, void *); 3062 va_end(ap); 3063 3064 trace_kvm_vm_ioctl(type, arg); 3065 accel_ioctl_begin(); 3066 ret = ioctl(s->vmfd, type, arg); 3067 accel_ioctl_end(); 3068 if (ret == -1) { 3069 ret = -errno; 3070 } 3071 return ret; 3072 } 3073 3074 int kvm_vcpu_ioctl(CPUState *cpu, int type, ...) 3075 { 3076 int ret; 3077 void *arg; 3078 va_list ap; 3079 3080 va_start(ap, type); 3081 arg = va_arg(ap, void *); 3082 va_end(ap); 3083 3084 trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg); 3085 accel_cpu_ioctl_begin(cpu); 3086 ret = ioctl(cpu->kvm_fd, type, arg); 3087 accel_cpu_ioctl_end(cpu); 3088 if (ret == -1) { 3089 ret = -errno; 3090 } 3091 return ret; 3092 } 3093 3094 int kvm_device_ioctl(int fd, int type, ...) 3095 { 3096 int ret; 3097 void *arg; 3098 va_list ap; 3099 3100 va_start(ap, type); 3101 arg = va_arg(ap, void *); 3102 va_end(ap); 3103 3104 trace_kvm_device_ioctl(fd, type, arg); 3105 accel_ioctl_begin(); 3106 ret = ioctl(fd, type, arg); 3107 accel_ioctl_end(); 3108 if (ret == -1) { 3109 ret = -errno; 3110 } 3111 return ret; 3112 } 3113 3114 int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr) 3115 { 3116 int ret; 3117 struct kvm_device_attr attribute = { 3118 .group = group, 3119 .attr = attr, 3120 }; 3121 3122 if (!kvm_vm_attributes_allowed) { 3123 return 0; 3124 } 3125 3126 ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute); 3127 /* kvm returns 0 on success for HAS_DEVICE_ATTR */ 3128 return ret ? 0 : 1; 3129 } 3130 3131 int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr) 3132 { 3133 struct kvm_device_attr attribute = { 3134 .group = group, 3135 .attr = attr, 3136 .flags = 0, 3137 }; 3138 3139 return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 
0 : 1; 3140 } 3141 3142 int kvm_device_access(int fd, int group, uint64_t attr, 3143 void *val, bool write, Error **errp) 3144 { 3145 struct kvm_device_attr kvmattr; 3146 int err; 3147 3148 kvmattr.flags = 0; 3149 kvmattr.group = group; 3150 kvmattr.attr = attr; 3151 kvmattr.addr = (uintptr_t)val; 3152 3153 err = kvm_device_ioctl(fd, 3154 write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR, 3155 &kvmattr); 3156 if (err < 0) { 3157 error_setg_errno(errp, -err, 3158 "KVM_%s_DEVICE_ATTR failed: Group %d " 3159 "attr 0x%016" PRIx64, 3160 write ? "SET" : "GET", group, attr); 3161 } 3162 return err; 3163 } 3164 3165 bool kvm_has_sync_mmu(void) 3166 { 3167 return kvm_state->sync_mmu; 3168 } 3169 3170 int kvm_has_vcpu_events(void) 3171 { 3172 return kvm_state->vcpu_events; 3173 } 3174 3175 int kvm_max_nested_state_length(void) 3176 { 3177 return kvm_state->max_nested_state_len; 3178 } 3179 3180 int kvm_has_gsi_routing(void) 3181 { 3182 #ifdef KVM_CAP_IRQ_ROUTING 3183 return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING); 3184 #else 3185 return false; 3186 #endif 3187 } 3188 3189 bool kvm_arm_supports_user_irq(void) 3190 { 3191 return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ); 3192 } 3193 3194 #ifdef TARGET_KVM_HAVE_GUEST_DEBUG 3195 struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu, vaddr pc) 3196 { 3197 struct kvm_sw_breakpoint *bp; 3198 3199 QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) { 3200 if (bp->pc == pc) { 3201 return bp; 3202 } 3203 } 3204 return NULL; 3205 } 3206 3207 int kvm_sw_breakpoints_active(CPUState *cpu) 3208 { 3209 return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints); 3210 } 3211 3212 struct kvm_set_guest_debug_data { 3213 struct kvm_guest_debug dbg; 3214 int err; 3215 }; 3216 3217 static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data) 3218 { 3219 struct kvm_set_guest_debug_data *dbg_data = 3220 (struct kvm_set_guest_debug_data *) data.host_ptr; 3221 3222 dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG, 3223 &dbg_data->dbg); 3224 } 3225 3226 int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap) 3227 { 3228 struct kvm_set_guest_debug_data data; 3229 3230 data.dbg.control = reinject_trap; 3231 3232 if (cpu->singlestep_enabled) { 3233 data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP; 3234 3235 if (cpu->singlestep_enabled & SSTEP_NOIRQ) { 3236 data.dbg.control |= KVM_GUESTDBG_BLOCKIRQ; 3237 } 3238 } 3239 kvm_arch_update_guest_debug(cpu, &data.dbg); 3240 3241 run_on_cpu(cpu, kvm_invoke_set_guest_debug, 3242 RUN_ON_CPU_HOST_PTR(&data)); 3243 return data.err; 3244 } 3245 3246 bool kvm_supports_guest_debug(void) 3247 { 3248 /* probed during kvm_init() */ 3249 return kvm_has_guest_debug; 3250 } 3251 3252 int kvm_insert_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len) 3253 { 3254 struct kvm_sw_breakpoint *bp; 3255 int err; 3256 3257 if (type == GDB_BREAKPOINT_SW) { 3258 bp = kvm_find_sw_breakpoint(cpu, addr); 3259 if (bp) { 3260 bp->use_count++; 3261 return 0; 3262 } 3263 3264 bp = g_new(struct kvm_sw_breakpoint, 1); 3265 bp->pc = addr; 3266 bp->use_count = 1; 3267 err = kvm_arch_insert_sw_breakpoint(cpu, bp); 3268 if (err) { 3269 g_free(bp); 3270 return err; 3271 } 3272 3273 QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry); 3274 } else { 3275 err = kvm_arch_insert_hw_breakpoint(addr, len, type); 3276 if (err) { 3277 return err; 3278 } 3279 } 3280 3281 CPU_FOREACH(cpu) { 3282 err = kvm_update_guest_debug(cpu, 0); 3283 if (err) { 3284 return err; 3285 
} 3286 } 3287 return 0; 3288 } 3289 3290 int kvm_remove_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len) 3291 { 3292 struct kvm_sw_breakpoint *bp; 3293 int err; 3294 3295 if (type == GDB_BREAKPOINT_SW) { 3296 bp = kvm_find_sw_breakpoint(cpu, addr); 3297 if (!bp) { 3298 return -ENOENT; 3299 } 3300 3301 if (bp->use_count > 1) { 3302 bp->use_count--; 3303 return 0; 3304 } 3305 3306 err = kvm_arch_remove_sw_breakpoint(cpu, bp); 3307 if (err) { 3308 return err; 3309 } 3310 3311 QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry); 3312 g_free(bp); 3313 } else { 3314 err = kvm_arch_remove_hw_breakpoint(addr, len, type); 3315 if (err) { 3316 return err; 3317 } 3318 } 3319 3320 CPU_FOREACH(cpu) { 3321 err = kvm_update_guest_debug(cpu, 0); 3322 if (err) { 3323 return err; 3324 } 3325 } 3326 return 0; 3327 } 3328 3329 void kvm_remove_all_breakpoints(CPUState *cpu) 3330 { 3331 struct kvm_sw_breakpoint *bp, *next; 3332 KVMState *s = cpu->kvm_state; 3333 CPUState *tmpcpu; 3334 3335 QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) { 3336 if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) { 3337 /* Try harder to find a CPU that currently sees the breakpoint. */ 3338 CPU_FOREACH(tmpcpu) { 3339 if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) { 3340 break; 3341 } 3342 } 3343 } 3344 QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry); 3345 g_free(bp); 3346 } 3347 kvm_arch_remove_all_hw_breakpoints(); 3348 3349 CPU_FOREACH(cpu) { 3350 kvm_update_guest_debug(cpu, 0); 3351 } 3352 } 3353 3354 #endif /* !TARGET_KVM_HAVE_GUEST_DEBUG */ 3355 3356 static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset) 3357 { 3358 KVMState *s = kvm_state; 3359 struct kvm_signal_mask *sigmask; 3360 int r; 3361 3362 sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset)); 3363 3364 sigmask->len = s->sigmask_len; 3365 memcpy(sigmask->sigset, sigset, sizeof(*sigset)); 3366 r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask); 3367 g_free(sigmask); 3368 3369 return r; 3370 } 3371 3372 static void kvm_ipi_signal(int sig) 3373 { 3374 if (current_cpu) { 3375 assert(kvm_immediate_exit); 3376 kvm_cpu_kick(current_cpu); 3377 } 3378 } 3379 3380 void kvm_init_cpu_signals(CPUState *cpu) 3381 { 3382 int r; 3383 sigset_t set; 3384 struct sigaction sigact; 3385 3386 memset(&sigact, 0, sizeof(sigact)); 3387 sigact.sa_handler = kvm_ipi_signal; 3388 sigaction(SIG_IPI, &sigact, NULL); 3389 3390 pthread_sigmask(SIG_BLOCK, NULL, &set); 3391 #if defined KVM_HAVE_MCE_INJECTION 3392 sigdelset(&set, SIGBUS); 3393 pthread_sigmask(SIG_SETMASK, &set, NULL); 3394 #endif 3395 sigdelset(&set, SIG_IPI); 3396 if (kvm_immediate_exit) { 3397 r = pthread_sigmask(SIG_SETMASK, &set, NULL); 3398 } else { 3399 r = kvm_set_signal_mask(cpu, &set); 3400 } 3401 if (r) { 3402 fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r)); 3403 exit(1); 3404 } 3405 } 3406 3407 /* Called asynchronously in VCPU thread. */ 3408 int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr) 3409 { 3410 #ifdef KVM_HAVE_MCE_INJECTION 3411 if (have_sigbus_pending) { 3412 return 1; 3413 } 3414 have_sigbus_pending = true; 3415 pending_sigbus_addr = addr; 3416 pending_sigbus_code = code; 3417 qatomic_set(&cpu->exit_request, 1); 3418 return 0; 3419 #else 3420 return 1; 3421 #endif 3422 } 3423 3424 /* Called synchronously (via signalfd) in main thread. */ 3425 int kvm_on_sigbus(int code, void *addr) 3426 { 3427 #ifdef KVM_HAVE_MCE_INJECTION 3428 /* Action required MCE kills the process if SIGBUS is blocked. 
Because 3429 * that's what happens in the I/O thread, where we handle MCE via signalfd, 3430 * we can only get action optional here. 3431 */ 3432 assert(code != BUS_MCEERR_AR); 3433 kvm_arch_on_sigbus_vcpu(first_cpu, code, addr); 3434 return 0; 3435 #else 3436 return 1; 3437 #endif 3438 } 3439 3440 int kvm_create_device(KVMState *s, uint64_t type, bool test) 3441 { 3442 int ret; 3443 struct kvm_create_device create_dev; 3444 3445 create_dev.type = type; 3446 create_dev.fd = -1; 3447 create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0; 3448 3449 if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) { 3450 return -ENOTSUP; 3451 } 3452 3453 ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev); 3454 if (ret) { 3455 return ret; 3456 } 3457 3458 return test ? 0 : create_dev.fd; 3459 } 3460 3461 bool kvm_device_supported(int vmfd, uint64_t type) 3462 { 3463 struct kvm_create_device create_dev = { 3464 .type = type, 3465 .fd = -1, 3466 .flags = KVM_CREATE_DEVICE_TEST, 3467 }; 3468 3469 if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) { 3470 return false; 3471 } 3472 3473 return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0); 3474 } 3475 3476 int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source) 3477 { 3478 struct kvm_one_reg reg; 3479 int r; 3480 3481 reg.id = id; 3482 reg.addr = (uintptr_t) source; 3483 r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, ®); 3484 if (r) { 3485 trace_kvm_failed_reg_set(id, strerror(-r)); 3486 } 3487 return r; 3488 } 3489 3490 int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target) 3491 { 3492 struct kvm_one_reg reg; 3493 int r; 3494 3495 reg.id = id; 3496 reg.addr = (uintptr_t) target; 3497 r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, ®); 3498 if (r) { 3499 trace_kvm_failed_reg_get(id, strerror(-r)); 3500 } 3501 return r; 3502 } 3503 3504 static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as, 3505 hwaddr start_addr, hwaddr size) 3506 { 3507 KVMState *kvm = KVM_STATE(ms->accelerator); 3508 int i; 3509 3510 for (i = 0; i < kvm->nr_as; ++i) { 3511 if (kvm->as[i].as == as && kvm->as[i].ml) { 3512 size = MIN(kvm_max_slot_size, size); 3513 return NULL != kvm_lookup_matching_slot(kvm->as[i].ml, 3514 start_addr, size); 3515 } 3516 } 3517 3518 return false; 3519 } 3520 3521 static void kvm_get_kvm_shadow_mem(Object *obj, Visitor *v, 3522 const char *name, void *opaque, 3523 Error **errp) 3524 { 3525 KVMState *s = KVM_STATE(obj); 3526 int64_t value = s->kvm_shadow_mem; 3527 3528 visit_type_int(v, name, &value, errp); 3529 } 3530 3531 static void kvm_set_kvm_shadow_mem(Object *obj, Visitor *v, 3532 const char *name, void *opaque, 3533 Error **errp) 3534 { 3535 KVMState *s = KVM_STATE(obj); 3536 int64_t value; 3537 3538 if (s->fd != -1) { 3539 error_setg(errp, "Cannot set properties after the accelerator has been initialized"); 3540 return; 3541 } 3542 3543 if (!visit_type_int(v, name, &value, errp)) { 3544 return; 3545 } 3546 3547 s->kvm_shadow_mem = value; 3548 } 3549 3550 static void kvm_set_kernel_irqchip(Object *obj, Visitor *v, 3551 const char *name, void *opaque, 3552 Error **errp) 3553 { 3554 KVMState *s = KVM_STATE(obj); 3555 OnOffSplit mode; 3556 3557 if (s->fd != -1) { 3558 error_setg(errp, "Cannot set properties after the accelerator has been initialized"); 3559 return; 3560 } 3561 3562 if (!visit_type_OnOffSplit(v, name, &mode, errp)) { 3563 return; 3564 } 3565 switch (mode) { 3566 case ON_OFF_SPLIT_ON: 3567 s->kernel_irqchip_allowed = true; 3568 s->kernel_irqchip_required = true; 3569 s->kernel_irqchip_split = ON_OFF_AUTO_OFF; 3570 
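        /*
         * (Editor's summary of the three modes handled by this switch)
         *   on    -> full in-kernel irqchip, required
         *   off   -> no in-kernel irqchip at all
         *   split -> in-kernel core (e.g. the LAPIC on x86) with the
         *            slower IOAPIC/PIC parts emulated in userspace
         */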
break; 3571 case ON_OFF_SPLIT_OFF: 3572 s->kernel_irqchip_allowed = false; 3573 s->kernel_irqchip_required = false; 3574 s->kernel_irqchip_split = ON_OFF_AUTO_OFF; 3575 break; 3576 case ON_OFF_SPLIT_SPLIT: 3577 s->kernel_irqchip_allowed = true; 3578 s->kernel_irqchip_required = true; 3579 s->kernel_irqchip_split = ON_OFF_AUTO_ON; 3580 break; 3581 default: 3582 /* The value was checked in visit_type_OnOffSplit() above. If 3583 * we get here, then something is wrong in QEMU. 3584 */ 3585 abort(); 3586 } 3587 } 3588 3589 bool kvm_kernel_irqchip_allowed(void) 3590 { 3591 return kvm_state->kernel_irqchip_allowed; 3592 } 3593 3594 bool kvm_kernel_irqchip_required(void) 3595 { 3596 return kvm_state->kernel_irqchip_required; 3597 } 3598 3599 bool kvm_kernel_irqchip_split(void) 3600 { 3601 return kvm_state->kernel_irqchip_split == ON_OFF_AUTO_ON; 3602 } 3603 3604 static void kvm_get_dirty_ring_size(Object *obj, Visitor *v, 3605 const char *name, void *opaque, 3606 Error **errp) 3607 { 3608 KVMState *s = KVM_STATE(obj); 3609 uint32_t value = s->kvm_dirty_ring_size; 3610 3611 visit_type_uint32(v, name, &value, errp); 3612 } 3613 3614 static void kvm_set_dirty_ring_size(Object *obj, Visitor *v, 3615 const char *name, void *opaque, 3616 Error **errp) 3617 { 3618 KVMState *s = KVM_STATE(obj); 3619 uint32_t value; 3620 3621 if (s->fd != -1) { 3622 error_setg(errp, "Cannot set properties after the accelerator has been initialized"); 3623 return; 3624 } 3625 3626 if (!visit_type_uint32(v, name, &value, errp)) { 3627 return; 3628 } 3629 if (value & (value - 1)) { 3630 error_setg(errp, "dirty-ring-size must be a power of two."); 3631 return; 3632 } 3633 3634 s->kvm_dirty_ring_size = value; 3635 } 3636 3637 static char *kvm_get_device(Object *obj, 3638 Error **errp G_GNUC_UNUSED) 3639 { 3640 KVMState *s = KVM_STATE(obj); 3641 3642 return g_strdup(s->device); 3643 } 3644 3645 static void kvm_set_device(Object *obj, 3646 const char *value, 3647 Error **errp G_GNUC_UNUSED) 3648 { 3649 KVMState *s = KVM_STATE(obj); 3650 3651 g_free(s->device); 3652 s->device = g_strdup(value); 3653 } 3654 3655 static void kvm_accel_instance_init(Object *obj) 3656 { 3657 KVMState *s = KVM_STATE(obj); 3658 3659 s->fd = -1; 3660 s->vmfd = -1; 3661 s->kvm_shadow_mem = -1; 3662 s->kernel_irqchip_allowed = true; 3663 s->kernel_irqchip_split = ON_OFF_AUTO_AUTO; 3664 /* KVM dirty ring is by default off */ 3665 s->kvm_dirty_ring_size = 0; 3666 s->kvm_dirty_ring_with_bitmap = false; 3667 s->kvm_eager_split_size = 0; 3668 s->notify_vmexit = NOTIFY_VMEXIT_OPTION_RUN; 3669 s->notify_window = 0; 3670 s->xen_version = 0; 3671 s->xen_gnttab_max_frames = 64; 3672 s->xen_evtchn_max_pirq = 256; 3673 s->device = NULL; 3674 } 3675 3676 /** 3677 * kvm_gdbstub_sstep_flags(): 3678 * 3679 * Returns: SSTEP_* flags that KVM supports for guest debug. 
The 3680 * support is probed during kvm_init() 3681 */ 3682 static int kvm_gdbstub_sstep_flags(void) 3683 { 3684 return kvm_sstep_flags; 3685 } 3686 3687 static void kvm_accel_class_init(ObjectClass *oc, void *data) 3688 { 3689 AccelClass *ac = ACCEL_CLASS(oc); 3690 ac->name = "KVM"; 3691 ac->init_machine = kvm_init; 3692 ac->has_memory = kvm_accel_has_memory; 3693 ac->allowed = &kvm_allowed; 3694 ac->gdbstub_supported_sstep_flags = kvm_gdbstub_sstep_flags; 3695 3696 object_class_property_add(oc, "kernel-irqchip", "on|off|split", 3697 NULL, kvm_set_kernel_irqchip, 3698 NULL, NULL); 3699 object_class_property_set_description(oc, "kernel-irqchip", 3700 "Configure KVM in-kernel irqchip"); 3701 3702 object_class_property_add(oc, "kvm-shadow-mem", "int", 3703 kvm_get_kvm_shadow_mem, kvm_set_kvm_shadow_mem, 3704 NULL, NULL); 3705 object_class_property_set_description(oc, "kvm-shadow-mem", 3706 "KVM shadow MMU size"); 3707 3708 object_class_property_add(oc, "dirty-ring-size", "uint32", 3709 kvm_get_dirty_ring_size, kvm_set_dirty_ring_size, 3710 NULL, NULL); 3711 object_class_property_set_description(oc, "dirty-ring-size", 3712 "Size of KVM dirty page ring buffer (default: 0, i.e. use bitmap)"); 3713 3714 object_class_property_add_str(oc, "device", kvm_get_device, kvm_set_device); 3715 object_class_property_set_description(oc, "device", 3716 "Path to the device node to use (default: /dev/kvm)"); 3717 3718 kvm_arch_accel_class_init(oc); 3719 } 3720 3721 static const TypeInfo kvm_accel_type = { 3722 .name = TYPE_KVM_ACCEL, 3723 .parent = TYPE_ACCEL, 3724 .instance_init = kvm_accel_instance_init, 3725 .class_init = kvm_accel_class_init, 3726 .instance_size = sizeof(KVMState), 3727 }; 3728 3729 static void kvm_type_init(void) 3730 { 3731 type_register_static(&kvm_accel_type); 3732 } 3733 3734 type_init(kvm_type_init); 3735 3736 typedef struct StatsArgs { 3737 union StatsResultsType { 3738 StatsResultList **stats; 3739 StatsSchemaList **schema; 3740 } result; 3741 strList *names; 3742 Error **errp; 3743 } StatsArgs; 3744 3745 static StatsList *add_kvmstat_entry(struct kvm_stats_desc *pdesc, 3746 uint64_t *stats_data, 3747 StatsList *stats_list, 3748 Error **errp) 3749 { 3750 3751 Stats *stats; 3752 uint64List *val_list = NULL; 3753 3754 /* Only add stats that we understand. 
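       (Editor's note) i.e. descriptors whose type, unit, or base flags
       fall outside the KVM_STATS_* sets known here - typically ones
       introduced by a kernel newer than these headers - are silently
       skipped by the switches below.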
*/ 3755 switch (pdesc->flags & KVM_STATS_TYPE_MASK) { 3756 case KVM_STATS_TYPE_CUMULATIVE: 3757 case KVM_STATS_TYPE_INSTANT: 3758 case KVM_STATS_TYPE_PEAK: 3759 case KVM_STATS_TYPE_LINEAR_HIST: 3760 case KVM_STATS_TYPE_LOG_HIST: 3761 break; 3762 default: 3763 return stats_list; 3764 } 3765 3766 switch (pdesc->flags & KVM_STATS_UNIT_MASK) { 3767 case KVM_STATS_UNIT_NONE: 3768 case KVM_STATS_UNIT_BYTES: 3769 case KVM_STATS_UNIT_CYCLES: 3770 case KVM_STATS_UNIT_SECONDS: 3771 case KVM_STATS_UNIT_BOOLEAN: 3772 break; 3773 default: 3774 return stats_list; 3775 } 3776 3777 switch (pdesc->flags & KVM_STATS_BASE_MASK) { 3778 case KVM_STATS_BASE_POW10: 3779 case KVM_STATS_BASE_POW2: 3780 break; 3781 default: 3782 return stats_list; 3783 } 3784 3785 /* Alloc and populate data list */ 3786 stats = g_new0(Stats, 1); 3787 stats->name = g_strdup(pdesc->name); 3788 stats->value = g_new0(StatsValue, 1);; 3789 3790 if ((pdesc->flags & KVM_STATS_UNIT_MASK) == KVM_STATS_UNIT_BOOLEAN) { 3791 stats->value->u.boolean = *stats_data; 3792 stats->value->type = QTYPE_QBOOL; 3793 } else if (pdesc->size == 1) { 3794 stats->value->u.scalar = *stats_data; 3795 stats->value->type = QTYPE_QNUM; 3796 } else { 3797 int i; 3798 for (i = 0; i < pdesc->size; i++) { 3799 QAPI_LIST_PREPEND(val_list, stats_data[i]); 3800 } 3801 stats->value->u.list = val_list; 3802 stats->value->type = QTYPE_QLIST; 3803 } 3804 3805 QAPI_LIST_PREPEND(stats_list, stats); 3806 return stats_list; 3807 } 3808 3809 static StatsSchemaValueList *add_kvmschema_entry(struct kvm_stats_desc *pdesc, 3810 StatsSchemaValueList *list, 3811 Error **errp) 3812 { 3813 StatsSchemaValueList *schema_entry = g_new0(StatsSchemaValueList, 1); 3814 schema_entry->value = g_new0(StatsSchemaValue, 1); 3815 3816 switch (pdesc->flags & KVM_STATS_TYPE_MASK) { 3817 case KVM_STATS_TYPE_CUMULATIVE: 3818 schema_entry->value->type = STATS_TYPE_CUMULATIVE; 3819 break; 3820 case KVM_STATS_TYPE_INSTANT: 3821 schema_entry->value->type = STATS_TYPE_INSTANT; 3822 break; 3823 case KVM_STATS_TYPE_PEAK: 3824 schema_entry->value->type = STATS_TYPE_PEAK; 3825 break; 3826 case KVM_STATS_TYPE_LINEAR_HIST: 3827 schema_entry->value->type = STATS_TYPE_LINEAR_HISTOGRAM; 3828 schema_entry->value->bucket_size = pdesc->bucket_size; 3829 schema_entry->value->has_bucket_size = true; 3830 break; 3831 case KVM_STATS_TYPE_LOG_HIST: 3832 schema_entry->value->type = STATS_TYPE_LOG2_HISTOGRAM; 3833 break; 3834 default: 3835 goto exit; 3836 } 3837 3838 switch (pdesc->flags & KVM_STATS_UNIT_MASK) { 3839 case KVM_STATS_UNIT_NONE: 3840 break; 3841 case KVM_STATS_UNIT_BOOLEAN: 3842 schema_entry->value->has_unit = true; 3843 schema_entry->value->unit = STATS_UNIT_BOOLEAN; 3844 break; 3845 case KVM_STATS_UNIT_BYTES: 3846 schema_entry->value->has_unit = true; 3847 schema_entry->value->unit = STATS_UNIT_BYTES; 3848 break; 3849 case KVM_STATS_UNIT_CYCLES: 3850 schema_entry->value->has_unit = true; 3851 schema_entry->value->unit = STATS_UNIT_CYCLES; 3852 break; 3853 case KVM_STATS_UNIT_SECONDS: 3854 schema_entry->value->has_unit = true; 3855 schema_entry->value->unit = STATS_UNIT_SECONDS; 3856 break; 3857 default: 3858 goto exit; 3859 } 3860 3861 schema_entry->value->exponent = pdesc->exponent; 3862 if (pdesc->exponent) { 3863 switch (pdesc->flags & KVM_STATS_BASE_MASK) { 3864 case KVM_STATS_BASE_POW10: 3865 schema_entry->value->has_base = true; 3866 schema_entry->value->base = 10; 3867 break; 3868 case KVM_STATS_BASE_POW2: 3869 schema_entry->value->has_base = true; 3870 schema_entry->value->base = 2; 3871 break; 3872 
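        /*
         * (Editor's note) Any other base marks a descriptor we do not
         * understand; the schema entry is dropped via "goto exit".
         */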
default: 3873 goto exit; 3874 } 3875 } 3876 3877 schema_entry->value->name = g_strdup(pdesc->name); 3878 schema_entry->next = list; 3879 return schema_entry; 3880 exit: 3881 g_free(schema_entry->value); 3882 g_free(schema_entry); 3883 return list; 3884 } 3885 3886 /* Cached stats descriptors */ 3887 typedef struct StatsDescriptors { 3888 const char *ident; /* cache key, currently the StatsTarget */ 3889 struct kvm_stats_desc *kvm_stats_desc; 3890 struct kvm_stats_header kvm_stats_header; 3891 QTAILQ_ENTRY(StatsDescriptors) next; 3892 } StatsDescriptors; 3893 3894 static QTAILQ_HEAD(, StatsDescriptors) stats_descriptors = 3895 QTAILQ_HEAD_INITIALIZER(stats_descriptors); 3896 3897 /* 3898 * Return the descriptors for 'target', that either have already been read 3899 * or are retrieved from 'stats_fd'. 3900 */ 3901 static StatsDescriptors *find_stats_descriptors(StatsTarget target, int stats_fd, 3902 Error **errp) 3903 { 3904 StatsDescriptors *descriptors; 3905 const char *ident; 3906 struct kvm_stats_desc *kvm_stats_desc; 3907 struct kvm_stats_header *kvm_stats_header; 3908 size_t size_desc; 3909 ssize_t ret; 3910 3911 ident = StatsTarget_str(target); 3912 QTAILQ_FOREACH(descriptors, &stats_descriptors, next) { 3913 if (g_str_equal(descriptors->ident, ident)) { 3914 return descriptors; 3915 } 3916 } 3917 3918 descriptors = g_new0(StatsDescriptors, 1); 3919 3920 /* Read stats header */ 3921 kvm_stats_header = &descriptors->kvm_stats_header; 3922 ret = pread(stats_fd, kvm_stats_header, sizeof(*kvm_stats_header), 0); 3923 if (ret != sizeof(*kvm_stats_header)) { 3924 error_setg(errp, "KVM stats: failed to read stats header: " 3925 "expected %zu actual %zu", 3926 sizeof(*kvm_stats_header), ret); 3927 g_free(descriptors); 3928 return NULL; 3929 } 3930 size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size; 3931 3932 /* Read stats descriptors */ 3933 kvm_stats_desc = g_malloc0_n(kvm_stats_header->num_desc, size_desc); 3934 ret = pread(stats_fd, kvm_stats_desc, 3935 size_desc * kvm_stats_header->num_desc, 3936 kvm_stats_header->desc_offset); 3937 3938 if (ret != size_desc * kvm_stats_header->num_desc) { 3939 error_setg(errp, "KVM stats: failed to read stats descriptors: " 3940 "expected %zu actual %zu", 3941 size_desc * kvm_stats_header->num_desc, ret); 3942 g_free(descriptors); 3943 g_free(kvm_stats_desc); 3944 return NULL; 3945 } 3946 descriptors->kvm_stats_desc = kvm_stats_desc; 3947 descriptors->ident = ident; 3948 QTAILQ_INSERT_TAIL(&stats_descriptors, descriptors, next); 3949 return descriptors; 3950 } 3951 3952 static void query_stats(StatsResultList **result, StatsTarget target, 3953 strList *names, int stats_fd, CPUState *cpu, 3954 Error **errp) 3955 { 3956 struct kvm_stats_desc *kvm_stats_desc; 3957 struct kvm_stats_header *kvm_stats_header; 3958 StatsDescriptors *descriptors; 3959 g_autofree uint64_t *stats_data = NULL; 3960 struct kvm_stats_desc *pdesc; 3961 StatsList *stats_list = NULL; 3962 size_t size_desc, size_data = 0; 3963 ssize_t ret; 3964 int i; 3965 3966 descriptors = find_stats_descriptors(target, stats_fd, errp); 3967 if (!descriptors) { 3968 return; 3969 } 3970 3971 kvm_stats_header = &descriptors->kvm_stats_header; 3972 kvm_stats_desc = descriptors->kvm_stats_desc; 3973 size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size; 3974 3975 /* Tally the total data size; read schema data */ 3976 for (i = 0; i < kvm_stats_header->num_desc; ++i) { 3977 pdesc = (void *)kvm_stats_desc + i * size_desc; 3978 size_data += pdesc->size * sizeof(*stats_data); 3979 } 
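    /*
     * (Editor's note) size_data now covers every descriptor's samples
     * laid out back to back, so the single pread() below fetches all
     * statistics at kvm_stats_header->data_offset in one call; each
     * value is then located via pdesc->offset.
     */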
3980 3981 stats_data = g_malloc0(size_data); 3982 ret = pread(stats_fd, stats_data, size_data, kvm_stats_header->data_offset); 3983 3984 if (ret != size_data) { 3985 error_setg(errp, "KVM stats: failed to read data: " 3986 "expected %zu actual %zu", size_data, ret); 3987 return; 3988 } 3989 3990 for (i = 0; i < kvm_stats_header->num_desc; ++i) { 3991 uint64_t *stats; 3992 pdesc = (void *)kvm_stats_desc + i * size_desc; 3993 3994 /* Add entry to the list */ 3995 stats = (void *)stats_data + pdesc->offset; 3996 if (!apply_str_list_filter(pdesc->name, names)) { 3997 continue; 3998 } 3999 stats_list = add_kvmstat_entry(pdesc, stats, stats_list, errp); 4000 } 4001 4002 if (!stats_list) { 4003 return; 4004 } 4005 4006 switch (target) { 4007 case STATS_TARGET_VM: 4008 add_stats_entry(result, STATS_PROVIDER_KVM, NULL, stats_list); 4009 break; 4010 case STATS_TARGET_VCPU: 4011 add_stats_entry(result, STATS_PROVIDER_KVM, 4012 cpu->parent_obj.canonical_path, 4013 stats_list); 4014 break; 4015 default: 4016 g_assert_not_reached(); 4017 } 4018 } 4019 4020 static void query_stats_schema(StatsSchemaList **result, StatsTarget target, 4021 int stats_fd, Error **errp) 4022 { 4023 struct kvm_stats_desc *kvm_stats_desc; 4024 struct kvm_stats_header *kvm_stats_header; 4025 StatsDescriptors *descriptors; 4026 struct kvm_stats_desc *pdesc; 4027 StatsSchemaValueList *stats_list = NULL; 4028 size_t size_desc; 4029 int i; 4030 4031 descriptors = find_stats_descriptors(target, stats_fd, errp); 4032 if (!descriptors) { 4033 return; 4034 } 4035 4036 kvm_stats_header = &descriptors->kvm_stats_header; 4037 kvm_stats_desc = descriptors->kvm_stats_desc; 4038 size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size; 4039 4040 /* Tally the total data size; read schema data */ 4041 for (i = 0; i < kvm_stats_header->num_desc; ++i) { 4042 pdesc = (void *)kvm_stats_desc + i * size_desc; 4043 stats_list = add_kvmschema_entry(pdesc, stats_list, errp); 4044 } 4045 4046 add_stats_schema(result, STATS_PROVIDER_KVM, target, stats_list); 4047 } 4048 4049 static void query_stats_vcpu(CPUState *cpu, StatsArgs *kvm_stats_args) 4050 { 4051 int stats_fd = cpu->kvm_vcpu_stats_fd; 4052 Error *local_err = NULL; 4053 4054 if (stats_fd == -1) { 4055 error_setg_errno(&local_err, errno, "KVM stats: ioctl failed"); 4056 error_propagate(kvm_stats_args->errp, local_err); 4057 return; 4058 } 4059 query_stats(kvm_stats_args->result.stats, STATS_TARGET_VCPU, 4060 kvm_stats_args->names, stats_fd, cpu, 4061 kvm_stats_args->errp); 4062 } 4063 4064 static void query_stats_schema_vcpu(CPUState *cpu, StatsArgs *kvm_stats_args) 4065 { 4066 int stats_fd = cpu->kvm_vcpu_stats_fd; 4067 Error *local_err = NULL; 4068 4069 if (stats_fd == -1) { 4070 error_setg_errno(&local_err, errno, "KVM stats: ioctl failed"); 4071 error_propagate(kvm_stats_args->errp, local_err); 4072 return; 4073 } 4074 query_stats_schema(kvm_stats_args->result.schema, STATS_TARGET_VCPU, stats_fd, 4075 kvm_stats_args->errp); 4076 } 4077 4078 static void query_stats_cb(StatsResultList **result, StatsTarget target, 4079 strList *names, strList *targets, Error **errp) 4080 { 4081 KVMState *s = kvm_state; 4082 CPUState *cpu; 4083 int stats_fd; 4084 4085 switch (target) { 4086 case STATS_TARGET_VM: 4087 { 4088 stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL); 4089 if (stats_fd == -1) { 4090 error_setg_errno(errp, errno, "KVM stats: ioctl failed"); 4091 return; 4092 } 4093 query_stats(result, target, names, stats_fd, NULL, errp); 4094 close(stats_fd); 4095 break; 4096 } 4097 case 
STATS_TARGET_VCPU: 4098 { 4099 StatsArgs stats_args; 4100 stats_args.result.stats = result; 4101 stats_args.names = names; 4102 stats_args.errp = errp; 4103 CPU_FOREACH(cpu) { 4104 if (!apply_str_list_filter(cpu->parent_obj.canonical_path, targets)) { 4105 continue; 4106 } 4107 query_stats_vcpu(cpu, &stats_args); 4108 } 4109 break; 4110 } 4111 default: 4112 break; 4113 } 4114 } 4115 4116 static void query_stats_schemas_cb(StatsSchemaList **result, Error **errp) 4117 { 4118 StatsArgs stats_args; 4119 KVMState *s = kvm_state; 4120 int stats_fd; 4121 4122 stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL); 4123 if (stats_fd < 0) { /* kvm_vm_ioctl() returns -errno on failure */ 4124 error_setg_errno(errp, -stats_fd, "KVM stats: ioctl failed"); 4125 return; 4126 } 4127 query_stats_schema(result, STATS_TARGET_VM, stats_fd, errp); 4128 close(stats_fd); 4129 4130 if (first_cpu) { 4131 stats_args.result.schema = result; 4132 stats_args.errp = errp; 4133 query_stats_schema_vcpu(first_cpu, &stats_args); 4134 } 4135 } 4136 4137 void kvm_mark_guest_state_protected(void) 4138 { 4139 kvm_state->guest_state_protected = true; 4140 } 4141
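
/*
 * Editor's appendix - an illustrative sketch, not part of the original
 * file and deliberately not compiled (#if 0). It shows how a device
 * model is typically expected to combine the MSI routing and irqfd
 * helpers above, assuming the KVMRouteChange helpers declared in
 * sysemu/kvm.h; the function name and error-handling policy are
 * invented for the example.
 */
#if 0
static int example_wire_msi_irqfd(KVMState *s, PCIDevice *dev, int vector,
                                  EventNotifier *n)
{
    /* Allocate a GSI and install an MSI route for this vector */
    KVMRouteChange c = kvm_irqchip_begin_route_changes(s);
    int virq = kvm_irqchip_add_msi_route(&c, vector, dev);

    if (virq < 0) {
        return virq;   /* -ENOSPC: no free GSI; -ENOSYS: no routing */
    }
    kvm_irqchip_commit_route_changes(&c);

    /* Deliver events signalled on @n straight into the new route */
    return kvm_irqchip_add_irqfd_notifier_gsi(s, n, NULL, virq);
}
#endif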