1 /*
2 * QEMU KVM support
3 *
4 * Copyright IBM, Corp. 2008
5 * Red Hat, Inc. 2008
6 *
7 * Authors:
8 * Anthony Liguori <aliguori@us.ibm.com>
9 * Glauber Costa <gcosta@redhat.com>
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2 or later.
12 * See the COPYING file in the top-level directory.
13 *
14 */
15
16 #include "qemu/osdep.h"
17 #include <sys/ioctl.h>
18
19 #include <linux/kvm.h>
20
21 #include "qemu/atomic.h"
22 #include "qemu/option.h"
23 #include "qemu/config-file.h"
24 #include "qemu/error-report.h"
25 #include "qapi/error.h"
26 #include "hw/pci/msi.h"
27 #include "hw/pci/msix.h"
28 #include "hw/s390x/adapter.h"
29 #include "exec/gdbstub.h"
30 #include "sysemu/kvm_int.h"
31 #include "sysemu/runstate.h"
32 #include "sysemu/cpus.h"
33 #include "sysemu/sysemu.h"
34 #include "qemu/bswap.h"
35 #include "exec/memory.h"
36 #include "exec/ram_addr.h"
37 #include "exec/address-spaces.h"
38 #include "qemu/event_notifier.h"
39 #include "qemu/main-loop.h"
40 #include "trace.h"
41 #include "hw/irq.h"
42 #include "sysemu/sev.h"
43 #include "sysemu/balloon.h"
44 #include "qapi/visitor.h"
45 #include "qapi/qapi-types-common.h"
46 #include "qapi/qapi-visit-common.h"
47
48 #include "hw/boards.h"
49
50 /* This check must be after config-host.h is included */
51 #ifdef CONFIG_EVENTFD
52 #include <sys/eventfd.h>
53 #endif
54
/* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
 * need to use the real host PAGE_SIZE, as that's what KVM will use.
 */
#define PAGE_SIZE qemu_real_host_page_size

/* Uncomment the next line to make DPRINTF() print to stderr. */
//#define DEBUG_KVM

/*
 * DPRINTF(): printf-style debug output to stderr when DEBUG_KVM is
 * defined; expands to an empty statement otherwise.
 */
#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

/* Number of list heads in KVMState.msi_hashtab. */
#define KVM_MSI_HASHTAB_SIZE 256
71
/*
 * A vcpu file descriptor that kvm_destroy_vcpu() has put aside on
 * KVMState.kvm_parked_vcpus.  kvm_get_vcpu() hands it out again for the
 * same vcpu_id instead of issuing KVM_CREATE_VCPU.
 */
struct KVMParkedVcpu {
    unsigned long vcpu_id;  /* kvm_arch_vcpu_id() of the parked vcpu */
    int kvm_fd;             /* copy of CPUState.kvm_fd at park time */
    QLIST_ENTRY(KVMParkedVcpu) node;
};
77
/*
 * Accelerator-wide KVM state.  The global kvm_state pointer refers to an
 * instance of this structure.
 */
struct KVMState
{
    AccelState parent_obj;

    /* Number of entries scanned in a KVMMemoryListener's slots array */
    int nr_slots;
    int fd;
    int vmfd;
    /*
     * Non-zero when coalesced MMIO is used; kvm_init_vcpu() also uses the
     * value as the page index of the ring inside the kvm_run mapping.
     */
    int coalesced_mmio;
    /* Non-zero when coalesced PIO zones may be (un)registered */
    int coalesced_pio;
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
    bool coalesced_flush_in_progress;
    int vcpu_events;
    int robust_singlestep;
    int debugregs;
#ifdef KVM_CAP_SET_GUEST_DEBUG
    QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints;
#endif
    int max_nested_state_len;
    int many_ioeventfds;
    int intx_set_mask;
    int kvm_shadow_mem;
    bool kernel_irqchip_allowed;
    bool kernel_irqchip_required;
    OnOffAuto kernel_irqchip_split;
    bool sync_mmu;
    /* When false, kvm_physical_log_clear() returns without doing anything */
    bool manual_dirty_log_protect;
    /* The man page (and posix) say ioctl numbers are signed int, but
     * they're not.  Linux, glibc and *BSD all treat ioctl numbers as
     * unsigned, and treating them as signed here can break things */
    unsigned irq_set_ioctl;
    unsigned int sigmask_len;
    GHashTable *gsimap;
#ifdef KVM_CAP_IRQ_ROUTING
    struct kvm_irq_routing *irq_routes;
    int nr_allocated_irq_routes;
    unsigned long *used_gsi_bitmap;
    unsigned int gsi_count;
    QTAILQ_HEAD(, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
#endif
    /* Listener whose slots kvm_has_free_slot() and
     * kvm_physical_memory_addr_from_host() inspect */
    KVMMemoryListener memory_listener;
    /* vcpu fds parked by kvm_destroy_vcpu(), reused by kvm_get_vcpu() */
    QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;

    /* memory encryption */
    void *memcrypt_handle;
    int (*memcrypt_encrypt_data)(void *handle, uint8_t *ptr, uint64_t len);

    /* For "info mtree -f" to tell if an MR is registered in KVM */
    int nr_as;
    struct KVMAs {
        KVMMemoryListener *ml;
        AddressSpace *as;
    } *as;
};
131
/* The KVMState instance used by the functions in this file */
KVMState *kvm_state;

/* Global capability/feature flags */
bool kvm_kernel_irqchip;
bool kvm_split_irqchip;
bool kvm_async_interrupts_allowed;
bool kvm_halt_in_kernel_allowed;
bool kvm_eventfds_allowed;
bool kvm_irqfds_allowed;
bool kvm_resamplefds_allowed;
bool kvm_msi_via_irqfd_allowed;
bool kvm_gsi_routing_allowed;
bool kvm_gsi_direct_mapping;
bool kvm_allowed;
/* Gates the use of KVM_MEM_READONLY (see kvm_mem_flags()) */
bool kvm_readonly_mem_allowed;
bool kvm_vm_attributes_allowed;
bool kvm_direct_msi_allowed;
bool kvm_ioeventfd_any_length_allowed;
bool kvm_msi_use_devid;
static bool kvm_immediate_exit;
/*
 * Largest size given to a single memory slot; larger sections are split
 * into several slots.  Starts out as the all-ones hwaddr value and is
 * changed by kvm_set_max_memslot_size().
 */
static hwaddr kvm_max_slot_size = ~0;
151
/*
 * Capabilities this file treats as mandatory.  The list ends with
 * KVM_CAP_LAST_INFO, presumably the NULL-name terminator that
 * kvm_check_extension_list() stops on (the macro is defined elsewhere).
 */
static const KVMCapabilityInfo kvm_required_capabilites[] = {
    KVM_CAP_INFO(USER_MEMORY),
    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
    KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS),
    KVM_CAP_LAST_INFO
};
158
/* Notifier list for irqchip changes, statically initialized to empty */
static NotifierList kvm_irqchip_change_notifiers =
    NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);
161
/* Take / release the slots_lock mutex of a KVMMemoryListener (@kml) */
#define kvm_slots_lock(kml)      qemu_mutex_lock(&(kml)->slots_lock)
#define kvm_slots_unlock(kml)    qemu_mutex_unlock(&(kml)->slots_lock)
164
kvm_get_max_memslots(void)165 int kvm_get_max_memslots(void)
166 {
167 KVMState *s = KVM_STATE(current_accel());
168
169 return s->nr_slots;
170 }
171
kvm_memcrypt_enabled(void)172 bool kvm_memcrypt_enabled(void)
173 {
174 if (kvm_state && kvm_state->memcrypt_handle) {
175 return true;
176 }
177
178 return false;
179 }
180
kvm_memcrypt_encrypt_data(uint8_t * ptr,uint64_t len)181 int kvm_memcrypt_encrypt_data(uint8_t *ptr, uint64_t len)
182 {
183 if (kvm_state->memcrypt_handle &&
184 kvm_state->memcrypt_encrypt_data) {
185 return kvm_state->memcrypt_encrypt_data(kvm_state->memcrypt_handle,
186 ptr, len);
187 }
188
189 return 1;
190 }
191
192 /* Called with KVMMemoryListener.slots_lock held */
kvm_get_free_slot(KVMMemoryListener * kml)193 static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
194 {
195 KVMState *s = kvm_state;
196 int i;
197
198 for (i = 0; i < s->nr_slots; i++) {
199 if (kml->slots[i].memory_size == 0) {
200 return &kml->slots[i];
201 }
202 }
203
204 return NULL;
205 }
206
/*
 * Report whether the memory listener of @ms's accelerator still has an
 * unused slot.  Takes and releases the listener's slots_lock.
 */
bool kvm_has_free_slot(MachineState *ms)
{
    KVMMemoryListener *kml = &KVM_STATE(ms->accelerator)->memory_listener;
    bool available;

    kvm_slots_lock(kml);
    available = kvm_get_free_slot(kml) != NULL;
    kvm_slots_unlock(kml);

    return available;
}
219
220 /* Called with KVMMemoryListener.slots_lock held */
kvm_alloc_slot(KVMMemoryListener * kml)221 static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
222 {
223 KVMSlot *slot = kvm_get_free_slot(kml);
224
225 if (slot) {
226 return slot;
227 }
228
229 fprintf(stderr, "%s: no free slot available\n", __func__);
230 abort();
231 }
232
kvm_lookup_matching_slot(KVMMemoryListener * kml,hwaddr start_addr,hwaddr size)233 static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
234 hwaddr start_addr,
235 hwaddr size)
236 {
237 KVMState *s = kvm_state;
238 int i;
239
240 for (i = 0; i < s->nr_slots; i++) {
241 KVMSlot *mem = &kml->slots[i];
242
243 if (start_addr == mem->start_addr && size == mem->memory_size) {
244 return mem;
245 }
246 }
247
248 return NULL;
249 }
250
251 /*
252 * Calculate and align the start address and the size of the section.
253 * Return the size. If the size is 0, the aligned section is empty.
254 */
kvm_align_section(MemoryRegionSection * section,hwaddr * start)255 static hwaddr kvm_align_section(MemoryRegionSection *section,
256 hwaddr *start)
257 {
258 hwaddr size = int128_get64(section->size);
259 hwaddr delta, aligned;
260
261 /* kvm works in page size chunks, but the function may be called
262 with sub-page size and unaligned start address. Pad the start
263 address to next and truncate size to previous page boundary. */
264 aligned = ROUND_UP(section->offset_within_address_space,
265 qemu_real_host_page_size);
266 delta = aligned - section->offset_within_address_space;
267 *start = aligned;
268 if (delta > size) {
269 return 0;
270 }
271
272 return (size - delta) & qemu_real_host_page_mask;
273 }
274
/*
 * Translate the host pointer @ram into a guest physical address.
 *
 * The slots of @s's memory listener are searched (under its slots_lock)
 * for one whose host range contains @ram.  On a hit the guest address is
 * stored in @phys_addr and 1 is returned; otherwise 0 is returned and
 * @phys_addr is left alone.
 */
int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
                                       hwaddr *phys_addr)
{
    KVMMemoryListener *kml = &s->memory_listener;
    int found = 0;
    int idx;

    kvm_slots_lock(kml);
    for (idx = 0; idx < s->nr_slots; idx++) {
        KVMSlot *slot = &kml->slots[idx];

        if (ram < slot->ram || ram >= slot->ram + slot->memory_size) {
            continue;
        }

        *phys_addr = slot->start_addr + (ram - slot->ram);
        found = 1;
        break;
    }
    kvm_slots_unlock(kml);

    return found;
}
295
/*
 * Submit @slot's current description to the kernel with
 * KVM_SET_USER_MEMORY_REGION.
 *
 * The kernel slot id is slot->slot combined with the listener's address
 * space id shifted left by 16.  When an existing, non-empty slot (@new is
 * false) toggles KVM_MEM_READONLY, the slot is first submitted with size
 * 0 and then again with its real size.
 *
 * slot->old_flags is set to the submitted flags once the final ioctl has
 * been issued, whether or not it succeeded.
 *
 * Returns the kvm_vm_ioctl() result; a negative result is also reported
 * through error_report().
 */
static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new)
{
    KVMState *s = kvm_state;
    struct kvm_userspace_memory_region mem;
    int ret;

    mem.slot = slot->slot | (kml->as_id << 16);
    mem.guest_phys_addr = slot->start_addr;
    mem.userspace_addr = (unsigned long)slot->ram;
    mem.flags = slot->flags;

    if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) {
        /* Set the slot size to 0 before setting the slot to the desired
         * value. This is needed based on KVM commit 75d61fbc. */
        mem.memory_size = 0;
        ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
        if (ret < 0) {
            goto err;
        }
    }
    mem.memory_size = slot->memory_size;
    ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
    slot->old_flags = mem.flags;
err:
    trace_kvm_set_user_memory(mem.slot, mem.flags, mem.guest_phys_addr,
                              mem.memory_size, mem.userspace_addr, ret);
    if (ret < 0) {
        /*
         * Decode the failure from the return value, which is a negative
         * errno (kvm_set_phys_mem() decodes it the same way), rather than
         * from errno: the trace call above may have overwritten errno.
         */
        error_report("%s: KVM_SET_USER_MEMORY_REGION failed, slot=%d,"
                     " start=0x%" PRIx64 ", size=0x%" PRIx64 ": %s",
                     __func__, mem.slot, slot->start_addr,
                     (uint64_t)mem.memory_size, strerror(-ret));
    }
    return ret;
}
330
/*
 * Tear down the KVM side of @cpu.
 *
 * Runs the arch-specific teardown, unmaps cpu->kvm_run and parks the vcpu
 * file descriptor on kvm_state->kvm_parked_vcpus so that kvm_get_vcpu()
 * can reuse it later.
 *
 * Returns 0 on success or the negative value of the step that failed; in
 * that case the fd is not parked.
 */
int kvm_destroy_vcpu(CPUState *cpu)
{
    KVMState *s = kvm_state;
    struct KVMParkedVcpu *parked;
    long map_len;
    int ret;

    DPRINTF("kvm_destroy_vcpu\n");

    ret = kvm_arch_destroy_vcpu(cpu);
    if (ret < 0) {
        return ret;
    }

    map_len = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (map_len < 0) {
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
        return map_len;
    }

    ret = munmap(cpu->kvm_run, map_len);
    if (ret < 0) {
        return ret;
    }

    /* Keep the fd around for a later vcpu with the same id */
    parked = g_malloc0(sizeof(*parked));
    parked->vcpu_id = kvm_arch_vcpu_id(cpu);
    parked->kvm_fd = cpu->kvm_fd;
    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, parked, node);

    return ret;
}
364
/*
 * Obtain a vcpu file descriptor for @vcpu_id.
 *
 * A descriptor parked for the same id is taken off @s->kvm_parked_vcpus
 * and returned; if there is none, the result of KVM_CREATE_VCPU is
 * returned.
 */
static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
{
    struct KVMParkedVcpu *parked;
    int fd;

    QLIST_FOREACH(parked, &s->kvm_parked_vcpus, node) {
        if (parked->vcpu_id != vcpu_id) {
            continue;
        }

        /* Unpark: drop the list entry, keep only the fd */
        QLIST_REMOVE(parked, node);
        fd = parked->kvm_fd;
        g_free(parked);
        return fd;
    }

    return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
}
382
/*
 * Set up the KVM side of @cpu.
 *
 * Gets a vcpu fd (parked or newly created), records it together with the
 * KVM state in @cpu, maps the shared kvm_run area and, on the first vcpu
 * when coalesced MMIO is in use, locates the coalesced MMIO ring inside
 * that mapping.  Finishes with the arch-specific initialization.
 *
 * Returns the result of kvm_arch_init_vcpu(), or a negative value from
 * the first step that failed.
 */
int kvm_init_vcpu(CPUState *cpu)
{
    KVMState *s = kvm_state;
    long map_len;
    int fd;

    DPRINTF("kvm_init_vcpu\n");

    fd = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
    if (fd < 0) {
        DPRINTF("kvm_create_vcpu failed\n");
        return fd;
    }

    cpu->kvm_fd = fd;
    cpu->kvm_state = s;
    cpu->vcpu_dirty = true;

    map_len = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (map_len < 0) {
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
        return map_len;
    }

    cpu->kvm_run = mmap(NULL, map_len, PROT_READ | PROT_WRITE, MAP_SHARED,
                        cpu->kvm_fd, 0);
    if (cpu->kvm_run == MAP_FAILED) {
        /* Latch errno before anything else can change it */
        int map_err = -errno;

        DPRINTF("mmap'ing vcpu state failed\n");
        return map_err;
    }

    /* The ring lives coalesced_mmio pages into the kvm_run mapping */
    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
        s->coalesced_mmio_ring =
            (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
    }

    return kvm_arch_init_vcpu(cpu);
}
425
426 /*
427 * dirty pages logging control
428 */
429
/*
 * Compute the KVM memory slot flags for @mr:
 *  - KVM_MEM_LOG_DIRTY_PAGES when the region has a non-zero dirty log mask;
 *  - KVM_MEM_READONLY when the region is read-only or ROMD and
 *    kvm_readonly_mem_allowed is set.
 */
static int kvm_mem_flags(MemoryRegion *mr)
{
    int flags = 0;

    if (memory_region_get_dirty_log_mask(mr) != 0) {
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }

    if ((mr->readonly || memory_region_is_romd(mr)) &&
        kvm_readonly_mem_allowed) {
        flags |= KVM_MEM_READONLY;
    }

    return flags;
}
443
444 /* Called with KVMMemoryListener.slots_lock held */
kvm_slot_update_flags(KVMMemoryListener * kml,KVMSlot * mem,MemoryRegion * mr)445 static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
446 MemoryRegion *mr)
447 {
448 mem->flags = kvm_mem_flags(mr);
449
450 /* If nothing changed effectively, no need to issue ioctl */
451 if (mem->flags == mem->old_flags) {
452 return 0;
453 }
454
455 return kvm_set_user_memory_region(kml, mem, false);
456 }
457
/*
 * Refresh the slot flags of every slot that backs @section.
 *
 * The page-aligned section is walked in pieces of at most
 * kvm_max_slot_size.  The walk ends at the first piece that has no
 * matching slot or whose update fails.  Returns 0 or the failing
 * update's result.
 */
static int kvm_section_update_flags(KVMMemoryListener *kml,
                                    MemoryRegionSection *section)
{
    hwaddr start_addr;
    hwaddr remaining = kvm_align_section(section, &start_addr);
    int ret = 0;

    if (!remaining) {
        return 0;
    }

    kvm_slots_lock(kml);

    while (remaining && !ret) {
        hwaddr chunk = MIN(kvm_max_slot_size, remaining);
        KVMSlot *slot = kvm_lookup_matching_slot(kml, start_addr, chunk);

        if (!slot) {
            /* We don't have a slot if we want to trap every access. */
            break;
        }

        ret = kvm_slot_update_flags(kml, slot, section->mr);
        start_addr += chunk;
        remaining -= chunk;
    }

    kvm_slots_unlock(kml);
    return ret;
}
489
/*
 * MemoryListener log_start hook.  Only the transition from "no logging
 * client" (@old == 0) matters: the slot flags of @section are refreshed
 * then, and a failure to do so is fatal.
 */
static void kvm_log_start(MemoryListener *listener,
                          MemoryRegionSection *section,
                          int old, int new)
{
    KVMMemoryListener *kml;

    if (old != 0) {
        return;
    }

    kml = container_of(listener, KVMMemoryListener, listener);
    if (kvm_section_update_flags(kml, section) < 0) {
        abort();
    }
}
506
/*
 * MemoryListener log_stop hook.  Only the transition to "no logging
 * client" (@new == 0) matters: the slot flags of @section are refreshed
 * then, and a failure to do so is fatal.
 */
static void kvm_log_stop(MemoryListener *listener,
                         MemoryRegionSection *section,
                         int old, int new)
{
    KVMMemoryListener *kml;

    if (new != 0) {
        return;
    }

    kml = container_of(listener, KVMMemoryListener, listener);
    if (kvm_section_update_flags(kml, section) < 0) {
        abort();
    }
}
523
524 /* get kvm's dirty pages bitmap and update qemu's */
kvm_get_dirty_pages_log_range(MemoryRegionSection * section,unsigned long * bitmap)525 static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section,
526 unsigned long *bitmap)
527 {
528 ram_addr_t start = section->offset_within_region +
529 memory_region_get_ram_addr(section->mr);
530 ram_addr_t pages = int128_get64(section->size) / qemu_real_host_page_size;
531
532 cpu_physical_memory_set_dirty_lebitmap(bitmap, start, pages);
533 return 0;
534 }
535
/* Round x up to the next multiple of y; y must be a power of two. */
#define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1))
537
538 /* Allocate the dirty bitmap for a slot */
kvm_memslot_init_dirty_bitmap(KVMSlot * mem)539 static void kvm_memslot_init_dirty_bitmap(KVMSlot *mem)
540 {
541 /*
542 * XXX bad kernel interface alert
543 * For dirty bitmap, kernel allocates array of size aligned to
544 * bits-per-long. But for case when the kernel is 64bits and
545 * the userspace is 32bits, userspace can't align to the same
546 * bits-per-long, since sizeof(long) is different between kernel
547 * and user space. This way, userspace will provide buffer which
548 * may be 4 bytes less than the kernel will use, resulting in
549 * userspace memory corruption (which is not detectable by valgrind
550 * too, in most cases).
551 * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
552 * a hope that sizeof(long) won't become >8 any time soon.
553 */
554 hwaddr bitmap_size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
555 /*HOST_LONG_BITS*/ 64) / 8;
556 mem->dirty_bmap = g_malloc0(bitmap_size);
557 }
558
559 /**
560 * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space
561 *
562 * This function will first try to fetch dirty bitmap from the kernel,
563 * and then updates qemu's dirty bitmap.
564 *
565 * NOTE: caller must be with kml->slots_lock held.
566 *
567 * @kml: the KVM memory listener object
568 * @section: the memory section to sync the dirty bitmap with
569 */
kvm_physical_sync_dirty_bitmap(KVMMemoryListener * kml,MemoryRegionSection * section)570 static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
571 MemoryRegionSection *section)
572 {
573 KVMState *s = kvm_state;
574 struct kvm_dirty_log d = {};
575 KVMSlot *mem;
576 hwaddr start_addr, size;
577 hwaddr slot_size, slot_offset = 0;
578 int ret = 0;
579
580 size = kvm_align_section(section, &start_addr);
581 while (size) {
582 MemoryRegionSection subsection = *section;
583
584 slot_size = MIN(kvm_max_slot_size, size);
585 mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
586 if (!mem) {
587 /* We don't have a slot if we want to trap every access. */
588 goto out;
589 }
590
591 if (!mem->dirty_bmap) {
592 /* Allocate on the first log_sync, once and for all */
593 kvm_memslot_init_dirty_bitmap(mem);
594 }
595
596 d.dirty_bitmap = mem->dirty_bmap;
597 d.slot = mem->slot | (kml->as_id << 16);
598 if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
599 DPRINTF("ioctl failed %d\n", errno);
600 ret = -1;
601 goto out;
602 }
603
604 subsection.offset_within_region += slot_offset;
605 subsection.size = int128_make64(slot_size);
606 kvm_get_dirty_pages_log_range(&subsection, d.dirty_bitmap);
607
608 slot_offset += slot_size;
609 start_addr += slot_size;
610 size -= slot_size;
611 }
612 out:
613 return ret;
614 }
615
/* Alignment requirement for KVM_CLEAR_DIRTY_LOG - 64 pages */
#define KVM_CLEAR_LOG_SHIFT  6
/* The same alignment expressed in bytes: 64 host pages */
#define KVM_CLEAR_LOG_ALIGN  (qemu_real_host_page_size << KVM_CLEAR_LOG_SHIFT)
/* Mask that rounds a byte offset down to a KVM_CLEAR_LOG_ALIGN boundary */
#define KVM_CLEAR_LOG_MASK   (-KVM_CLEAR_LOG_ALIGN)
620
/*
 * Clear the kernel's dirty bits for part of one memory slot.
 *
 * @mem:   the slot; its dirty_bmap must already exist
 * @as_id: address space id, combined into the kernel slot id
 * @start: byte offset of the range from the start of the slot
 * @size:  length of the range in bytes
 *
 * The range is widened to what KVM_CLEAR_DIRTY_LOG accepts (start aligned
 * to 64 host pages, length a multiple of 64 pages or reaching the end of
 * the slot), but only bits inside the requested range are cleared.  The
 * cached mem->dirty_bmap bits of the requested range are cleared as well,
 * regardless of the ioctl result.
 *
 * Returns 0 on success (a -ENOENT result from the ioctl is treated as
 * success), or the negative value returned by the ioctl.
 */
static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, uint64_t start,
                                  uint64_t size)
{
    KVMState *s = kvm_state;
    uint64_t end, bmap_start, start_delta, bmap_npages;
    struct kvm_clear_dirty_log d;
    unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size;
    int ret;

    /*
     * We need to extend either the start or the size or both to
     * satisfy the KVM interface requirement.  Firstly, do the start
     * page alignment on 64 host pages
     */
    bmap_start = start & KVM_CLEAR_LOG_MASK;
    start_delta = start - bmap_start;
    bmap_start /= psize;

    /*
     * The kernel interface has restriction on the size too, that either:
     *
     * (1) the size is 64 host pages aligned (just like the start), or
     * (2) the size fills up until the end of the KVM memslot.
     */
    bmap_npages = DIV_ROUND_UP(size + start_delta, KVM_CLEAR_LOG_ALIGN)
                        << KVM_CLEAR_LOG_SHIFT;
    end = mem->memory_size / psize;
    if (bmap_npages > end - bmap_start) {
        bmap_npages = end - bmap_start;
    }
    start_delta /= psize;

    /*
     * Prepare the bitmap to clear dirty bits.  Here we must guarantee
     * that we won't clear any unknown dirty bits otherwise we might
     * accidentally clear some set bits which are not yet synced from
     * the kernel into QEMU's bitmap, then we'll lose track of the
     * guest modifications upon those pages (which can directly lead
     * to guest data loss or panic after migration).
     *
     * Layout of the KVMSlot.dirty_bmap:
     *
     *                   |<-------- bmap_npages -----------..>|
     *                                                     [1]
     *                     start_delta         size
     *  |----------------|-------------|------------------|------------|
     *  ^                ^             ^                               ^
     *  |                |             |                               |
     * start          bmap_start     (start)                         end
     * of memslot                                             of memslot
     *
     * [1] bmap_npages can be aligned to either 64 pages or the end of slot
     */

    assert(bmap_start % BITS_PER_LONG == 0);
    /* We should never do log_clear before log_sync */
    assert(mem->dirty_bmap);
    if (start_delta) {
        /* Slow path - we need to manipulate a temp bitmap */
        bmap_clear = bitmap_new(bmap_npages);
        bitmap_copy_with_src_offset(bmap_clear, mem->dirty_bmap,
                                    bmap_start, start_delta + size / psize);
        /*
         * We need to fill the holes at start because that was not
         * specified by the caller and we extended the bitmap only for
         * 64 pages alignment
         */
        bitmap_clear(bmap_clear, 0, start_delta);
        d.dirty_bitmap = bmap_clear;
    } else {
        /* Fast path - start address aligns well with BITS_PER_LONG */
        d.dirty_bitmap = mem->dirty_bmap + BIT_WORD(bmap_start);
    }

    d.first_page = bmap_start;
    /* It should never overflow.  If it happens, say something */
    assert(bmap_npages <= UINT32_MAX);
    d.num_pages = bmap_npages;
    d.slot = mem->slot | (as_id << 16);

    /*
     * The ioctl result is a negative errno on failure (the other callers
     * in this file treat it that way), so test it with "< 0"; comparing
     * against -1 would only catch EPERM and hide every other failure.
     * -ENOENT is tolerated rather than reported as an error.
     */
    ret = kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d);
    if (ret < 0 && ret != -ENOENT) {
        error_report("%s: KVM_CLEAR_DIRTY_LOG failed, slot=%d, "
                     "start=0x%"PRIx64", size=0x%"PRIx32", errno=%d",
                     __func__, d.slot, (uint64_t)d.first_page,
                     (uint32_t)d.num_pages, ret);
    } else {
        ret = 0;
        trace_kvm_clear_dirty_log(d.slot, d.first_page, d.num_pages);
    }

    /*
     * After we have updated the remote dirty bitmap, we update the
     * cached bitmap as well for the memslot, then if another user
     * clears the same region we know we shouldn't clear it again on
     * the remote otherwise it's data loss as well.
     */
    bitmap_clear(mem->dirty_bmap, bmap_start + start_delta,
                 size / psize);
    /* This handles the NULL case well */
    g_free(bmap_clear);
    return ret;
}
724
725
726 /**
727 * kvm_physical_log_clear - Clear the kernel's dirty bitmap for range
728 *
729 * NOTE: this will be a no-op if we haven't enabled manual dirty log
730 * protection in the host kernel because in that case this operation
731 * will be done within log_sync().
732 *
733 * @kml: the kvm memory listener
734 * @section: the memory range to clear dirty bitmap
735 */
kvm_physical_log_clear(KVMMemoryListener * kml,MemoryRegionSection * section)736 static int kvm_physical_log_clear(KVMMemoryListener *kml,
737 MemoryRegionSection *section)
738 {
739 KVMState *s = kvm_state;
740 uint64_t start, size, offset, count;
741 KVMSlot *mem;
742 int ret = 0, i;
743
744 if (!s->manual_dirty_log_protect) {
745 /* No need to do explicit clear */
746 return ret;
747 }
748
749 start = section->offset_within_address_space;
750 size = int128_get64(section->size);
751
752 if (!size) {
753 /* Nothing more we can do... */
754 return ret;
755 }
756
757 kvm_slots_lock(kml);
758
759 for (i = 0; i < s->nr_slots; i++) {
760 mem = &kml->slots[i];
761 /* Discard slots that are empty or do not overlap the section */
762 if (!mem->memory_size ||
763 mem->start_addr > start + size - 1 ||
764 start > mem->start_addr + mem->memory_size - 1) {
765 continue;
766 }
767
768 if (start >= mem->start_addr) {
769 /* The slot starts before section or is aligned to it. */
770 offset = start - mem->start_addr;
771 count = MIN(mem->memory_size - offset, size);
772 } else {
773 /* The slot starts after section. */
774 offset = 0;
775 count = MIN(mem->memory_size, size - (mem->start_addr - start));
776 }
777 ret = kvm_log_clear_one_slot(mem, kml->as_id, offset, count);
778 if (ret < 0) {
779 break;
780 }
781 }
782
783 kvm_slots_unlock(kml);
784
785 return ret;
786 }
787
/*
 * Register [@start, @start + @size) as a coalesced MMIO zone with the
 * kernel.  Does nothing when coalesced MMIO is not in use; the ioctl
 * result is deliberately ignored.
 */
static void kvm_coalesce_mmio_region(MemoryListener *listener,
                                     MemoryRegionSection *secion,
                                     hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;
    struct kvm_coalesced_mmio_zone zone;

    if (!s->coalesced_mmio) {
        return;
    }

    zone.addr = start;
    zone.size = size;
    zone.pad = 0;

    (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
}
804
/*
 * Unregister the coalesced MMIO zone [@start, @start + @size) from the
 * kernel.  Does nothing when coalesced MMIO is not in use; the ioctl
 * result is deliberately ignored.
 */
static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
                                       MemoryRegionSection *secion,
                                       hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;
    struct kvm_coalesced_mmio_zone zone;

    if (!s->coalesced_mmio) {
        return;
    }

    zone.addr = start;
    zone.size = size;
    zone.pad = 0;

    (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
}
821
/*
 * Register [@start, @start + @size) as a coalesced port I/O zone (pio
 * field set to 1).  Does nothing when coalesced PIO is not in use; the
 * ioctl result is deliberately ignored.
 */
static void kvm_coalesce_pio_add(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;
    struct kvm_coalesced_mmio_zone zone;

    if (!s->coalesced_pio) {
        return;
    }

    zone.addr = start;
    zone.size = size;
    zone.pio = 1;

    (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
}
838
/*
 * Unregister the coalesced port I/O zone [@start, @start + @size) (pio
 * field set to 1).  Does nothing when coalesced PIO is not in use; the
 * ioctl result is deliberately ignored.
 */
static void kvm_coalesce_pio_del(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;
    struct kvm_coalesced_mmio_zone zone;

    if (!s->coalesced_pio) {
        return;
    }

    zone.addr = start;
    zone.size = size;
    zone.pio = 1;

    (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
}
855
/*
 * Memory listener that only implements the coalesced I/O hooks, routing
 * them to the port I/O variants above.
 */
static MemoryListener kvm_coalesced_pio_listener = {
    .coalesced_io_add = kvm_coalesce_pio_add,
    .coalesced_io_del = kvm_coalesce_pio_del,
};
860
/*
 * Query KVM_CHECK_EXTENSION for @extension through kvm_ioctl().
 * Returns the ioctl's value, with any negative result reported as 0.
 */
int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int value = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);

    return value < 0 ? 0 : value;
}
872
/*
 * Query KVM_CHECK_EXTENSION for @extension through kvm_vm_ioctl().  If
 * that fails, fall back to kvm_check_extension().
 */
int kvm_vm_check_extension(KVMState *s, unsigned int extension)
{
    int value = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);

    if (value >= 0) {
        return value;
    }

    /* VM wide version not implemented, use global one instead */
    return kvm_check_extension(s, extension);
}
885
/*
 * Convert an ioeventfd match value from target to host endianness.
 *
 * Only when exactly one of HOST_WORDS_BIGENDIAN / TARGET_WORDS_BIGENDIAN
 * is defined are 2- and 4-byte values byte-swapped; every other case
 * returns @val unchanged.
 */
static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
{
#if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN)
    /* The kernel expects ioeventfd values in HOST_WORDS_BIGENDIAN
     * endianness, but the memory core hands them in target endianness.
     * For example, PPC is always treated as big-endian even if running
     * on KVM and on PPC64LE.  Correct here.
     */
    if (size == 2) {
        val = bswap16(val);
    } else if (size == 4) {
        val = bswap32(val);
    }
#endif
    return val;
}
905
/*
 * Assign (@assign true) or deassign an ioeventfd for the MMIO address
 * @addr of width @size.  With @datamatch set, the eventfd only fires for
 * writes of @val (converted to host endianness).
 *
 * Returns 0 on success, -ENOSYS when KVM is not enabled, or -errno when
 * the KVM_IOEVENTFD ioctl fails.
 */
static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
                                  bool assign, uint32_t size, bool datamatch)
{
    struct kvm_ioeventfd iofd = {
        .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
        .addr = addr,
        .len = size,
        .flags = 0,
        .fd = fd,
    };

    trace_kvm_set_ioeventfd_mmio(fd, (uint64_t)addr, val, assign, size,
                                 datamatch);
    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    iofd.flags |= datamatch ? KVM_IOEVENTFD_FLAG_DATAMATCH : 0;
    iofd.flags |= assign ? 0 : KVM_IOEVENTFD_FLAG_DEASSIGN;

    if (kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd) < 0) {
        return -errno;
    }

    return 0;
}
939
kvm_set_ioeventfd_pio(int fd,uint16_t addr,uint16_t val,bool assign,uint32_t size,bool datamatch)940 static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
941 bool assign, uint32_t size, bool datamatch)
942 {
943 struct kvm_ioeventfd kick = {
944 .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
945 .addr = addr,
946 .flags = KVM_IOEVENTFD_FLAG_PIO,
947 .len = size,
948 .fd = fd,
949 };
950 int r;
951 trace_kvm_set_ioeventfd_pio(fd, addr, val, assign, size, datamatch);
952 if (!kvm_enabled()) {
953 return -ENOSYS;
954 }
955 if (datamatch) {
956 kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
957 }
958 if (!assign) {
959 kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
960 }
961 r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
962 if (r < 0) {
963 return r;
964 }
965 return 0;
966 }
967
968
/*
 * Probe whether the host accepts more than six ioeventfds.
 *
 * Returns 1 when seven PIO ioeventfds could be created and assigned, 0
 * otherwise (always 0 without CONFIG_EVENTFD).  Everything set up for
 * the probe is torn down again before returning.
 */
static int kvm_check_many_ioeventfds(void)
{
    /* Userspace can use ioeventfd for io notification.  This requires a host
     * that supports eventfd(2) and an I/O thread; since eventfd does not
     * support SIGIO it cannot interrupt the vcpu.
     *
     * Older kernels have a 6 device limit on the KVM io bus.  Find out so we
     * can avoid creating too many ioeventfds.
     */
#if defined(CONFIG_EVENTFD)
    int probe_fds[7];
    int nr_ok = 0;
    int many;

    while (nr_ok < ARRAY_SIZE(probe_fds)) {
        int fd = eventfd(0, EFD_CLOEXEC);

        probe_fds[nr_ok] = fd;
        if (fd < 0) {
            break;
        }
        if (kvm_set_ioeventfd_pio(fd, 0, nr_ok, true, 2, true) < 0) {
            close(fd);
            break;
        }
        nr_ok++;
    }

    /* Decide whether many devices are supported or not */
    many = nr_ok == ARRAY_SIZE(probe_fds);

    /* Undo the successful assignments, newest first */
    while (nr_ok-- > 0) {
        kvm_set_ioeventfd_pio(probe_fds[nr_ok], 0, nr_ok, false, 2, true);
        close(probe_fds[nr_ok]);
    }
    return many;
#else
    return 0;
#endif
}
1005
1006 static const KVMCapabilityInfo *
kvm_check_extension_list(KVMState * s,const KVMCapabilityInfo * list)1007 kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
1008 {
1009 while (list->name) {
1010 if (!kvm_check_extension(s, list->value)) {
1011 return list;
1012 }
1013 list++;
1014 }
1015 return NULL;
1016 }
1017
/*
 * Set the largest size a single memory slot may have.  @max_slot_size
 * must be a multiple of the host page size; this is asserted.
 */
void kvm_set_max_memslot_size(hwaddr max_slot_size)
{
    hwaddr page_rounded = ROUND_UP(max_slot_size, qemu_real_host_page_size);

    g_assert(page_rounded == max_slot_size);
    kvm_max_slot_size = max_slot_size;
}
1025
/*
 * Register (@add true) or unregister the KVM memory slots that back
 * @section.
 *
 * Non-RAM regions are ignored unless they are read-only and read-only
 * slots are allowed; such a region that is not in romd_mode is always
 * treated as a removal.  The section is page-aligned with
 * kvm_align_section() and then processed in pieces of at most
 * kvm_max_slot_size, one slot per piece.  The listener's slots_lock is
 * held for the whole update.
 *
 * A failure of the kernel update is fatal (message on stderr, abort()).
 */
static void kvm_set_phys_mem(KVMMemoryListener *kml,
                             MemoryRegionSection *section, bool add)
{
    KVMSlot *mem;
    int err;
    MemoryRegion *mr = section->mr;
    bool writeable = !mr->readonly && !mr->rom_device;
    hwaddr start_addr, size, slot_size;
    void *ram;

    if (!memory_region_is_ram(mr)) {
        if (writeable || !kvm_readonly_mem_allowed) {
            return;
        } else if (!mr->romd_mode) {
            /* If the memory device is not in romd_mode, then we actually want
             * to remove the kvm memory slot so all accesses will trap. */
            add = false;
        }
    }

    size = kvm_align_section(section, &start_addr);
    if (!size) {
        return;
    }

    /* use aligned delta to align the ram address */
    ram = memory_region_get_ram_ptr(mr) + section->offset_within_region +
          (start_addr - section->offset_within_address_space);

    kvm_slots_lock(kml);

    if (!add) {
        /* Removal: stop at the first piece without a matching slot */
        do {
            slot_size = MIN(kvm_max_slot_size, size);
            mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
            if (!mem) {
                goto out;
            }
            /* Pull the last dirty bits before the slot goes away */
            if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
                kvm_physical_sync_dirty_bitmap(kml, section);
            }

            /* unregister the slot */
            g_free(mem->dirty_bmap);
            mem->dirty_bmap = NULL;
            /* Size 0 marks the slot as free and tells the kernel to drop it */
            mem->memory_size = 0;
            mem->flags = 0;
            err = kvm_set_user_memory_region(kml, mem, false);
            if (err) {
                fprintf(stderr, "%s: error unregistering slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
            start_addr += slot_size;
            size -= slot_size;
        } while (size);
        goto out;
    }

    /* register the new slot */
    do {
        slot_size = MIN(kvm_max_slot_size, size);
        /* kvm_alloc_slot() aborts when no free slot is left */
        mem = kvm_alloc_slot(kml);
        mem->memory_size = slot_size;
        mem->start_addr = start_addr;
        mem->ram = ram;
        mem->flags = kvm_mem_flags(mr);

        if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
            /*
             * Reallocate the bmap; it means it doesn't disappear in
             * middle of a migrate.
             */
            kvm_memslot_init_dirty_bitmap(mem);
        }
        err = kvm_set_user_memory_region(kml, mem, true);
        if (err) {
            fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                    strerror(-err));
            abort();
        }
        start_addr += slot_size;
        ram += slot_size;
        size -= slot_size;
    } while (size);

out:
    kvm_slots_unlock(kml);
}
1115
/*
 * region_add hook: take a reference on the section's memory region and
 * register the section with KVM.
 */
static void kvm_region_add(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    /* The reference is taken before the section is handed to KVM. */
    memory_region_ref(section->mr);
    kvm_set_phys_mem(container_of(listener, KVMMemoryListener, listener),
                     section, true);
}
1124
/*
 * region_del hook: unregister the section from KVM, then drop one reference
 * on its memory region.
 */
static void kvm_region_del(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    kvm_set_phys_mem(container_of(listener, KVMMemoryListener, listener),
                     section, false);
    /* Only unref once the section is no longer registered. */
    memory_region_unref(section->mr);
}
1133
/*
 * log_sync hook: synchronise the dirty bitmap for @section while holding the
 * listener's slots lock.  A negative result is fatal.
 */
static void kvm_log_sync(MemoryListener *listener,
                         MemoryRegionSection *section)
{
    KVMMemoryListener *owner =
        container_of(listener, KVMMemoryListener, listener);
    int rc;

    kvm_slots_lock(owner);
    rc = kvm_physical_sync_dirty_bitmap(owner, section);
    kvm_slots_unlock(owner);

    if (rc >= 0) {
        return;
    }
    abort();
}
1147
/*
 * log_clear hook: clear the dirty log for @section.  On failure the error is
 * reported (once) and the process aborts.
 */
static void kvm_log_clear(MemoryListener *listener,
                          MemoryRegionSection *section)
{
    KVMMemoryListener *owner =
        container_of(listener, KVMMemoryListener, listener);

    if (kvm_physical_log_clear(owner, section) >= 0) {
        return;
    }

    error_report_once("%s: kvm log clear failed: mr=%s "
                      "offset=%"HWADDR_PRIx" size=%"PRIx64, __func__,
                      section->mr->name, section->offset_within_region,
                      int128_get64(section->size));
    abort();
}
1163
/*
 * eventfd_add hook for MMIO: bind notifier @e to the section's address and
 * size, optionally matching on @data.  Failure is fatal.
 */
static void kvm_mem_ioeventfd_add(MemoryListener *listener,
                                  MemoryRegionSection *section,
                                  bool match_data, uint64_t data,
                                  EventNotifier *e)
{
    const int notifier_fd = event_notifier_get_fd(e);
    const int rc = kvm_set_ioeventfd_mmio(notifier_fd,
                                          section->offset_within_address_space,
                                          data, true,
                                          int128_get64(section->size),
                                          match_data);

    if (rc >= 0) {
        return;
    }

    fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
            __func__, strerror(-rc), -rc);
    abort();
}
1181
/*
 * eventfd_del hook for MMIO: remove the binding of notifier @e from the
 * section's address and size.  Failure is fatal.
 */
static void kvm_mem_ioeventfd_del(MemoryListener *listener,
                                  MemoryRegionSection *section,
                                  bool match_data, uint64_t data,
                                  EventNotifier *e)
{
    const int notifier_fd = event_notifier_get_fd(e);
    const int rc = kvm_set_ioeventfd_mmio(notifier_fd,
                                          section->offset_within_address_space,
                                          data, false,
                                          int128_get64(section->size),
                                          match_data);

    if (rc >= 0) {
        return;
    }

    fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
            __func__, strerror(-rc), -rc);
    abort();
}
1199
/*
 * eventfd_add hook for port I/O: bind notifier @e to the section's port and
 * size, optionally matching on @data.  Failure is fatal.
 */
static void kvm_io_ioeventfd_add(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 bool match_data, uint64_t data,
                                 EventNotifier *e)
{
    const int notifier_fd = event_notifier_get_fd(e);
    const int rc = kvm_set_ioeventfd_pio(notifier_fd,
                                         section->offset_within_address_space,
                                         data, true,
                                         int128_get64(section->size),
                                         match_data);

    if (rc >= 0) {
        return;
    }

    fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
            __func__, strerror(-rc), -rc);
    abort();
}
1217
/*
 * eventfd_del hook for port I/O: remove the binding of notifier @e from the
 * section's port and size.  Failure is fatal.
 */
static void kvm_io_ioeventfd_del(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 bool match_data, uint64_t data,
                                 EventNotifier *e)
{
    const int notifier_fd = event_notifier_get_fd(e);
    const int rc = kvm_set_ioeventfd_pio(notifier_fd,
                                         section->offset_within_address_space,
                                         data, false,
                                         int128_get64(section->size),
                                         match_data);

    if (rc >= 0) {
        return;
    }

    fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
            __func__, strerror(-rc), -rc);
    abort();
}
1236
/*
 * Initialise @kml (lock, slot table, address-space id, callbacks), register
 * it as a memory listener on @as, and record the (as, kml) pair in the first
 * free entry of s->as[].
 *
 * @s:     KVM state providing nr_slots and the as[] table
 * @kml:   listener to initialise and register
 * @as:    address space to listen on
 * @as_id: identifier stored in kml->as_id
 */
void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
                                  AddressSpace *as, int as_id)
{
    int i;

    qemu_mutex_init(&kml->slots_lock);
    /* g_new0() checks the count * element-size product for overflow */
    kml->slots = g_new0(KVMSlot, s->nr_slots);
    kml->as_id = as_id;

    /* each slot remembers its own index in the table */
    for (i = 0; i < s->nr_slots; i++) {
        kml->slots[i].slot = i;
    }

    kml->listener.region_add = kvm_region_add;
    kml->listener.region_del = kvm_region_del;
    kml->listener.log_start = kvm_log_start;
    kml->listener.log_stop = kvm_log_stop;
    kml->listener.log_sync = kvm_log_sync;
    kml->listener.log_clear = kvm_log_clear;
    kml->listener.priority = 10;

    memory_listener_register(&kml->listener, as);

    /* claim the first unused s->as[] entry; silently skipped if all are used */
    for (i = 0; i < s->nr_as; ++i) {
        if (!s->as[i].as) {
            s->as[i].as = as;
            s->as[i].ml = kml;
            break;
        }
    }
}
1268
1269 static MemoryListener kvm_io_listener = {
1270 .eventfd_add = kvm_io_ioeventfd_add,
1271 .eventfd_del = kvm_io_ioeventfd_del,
1272 .priority = 10,
1273 };
1274
/*
 * Set interrupt line @irq to @level through the VM ioctl selected in
 * s->irq_set_ioctl.  An ioctl failure is fatal.
 *
 * Returns 1 when the plain KVM_IRQ_LINE ioctl is in use, otherwise the
 * status value filled in by the ioctl.
 */
int kvm_set_irq(KVMState *s, int irq, int level)
{
    struct kvm_irq_level req;
    int rc;

    assert(kvm_async_interrupts_enabled());

    req.level = level;
    req.irq = irq;

    rc = kvm_vm_ioctl(s, s->irq_set_ioctl, &req);
    if (rc < 0) {
        perror("kvm_set_irq");
        abort();
    }

    if (s->irq_set_ioctl == KVM_IRQ_LINE) {
        return 1;
    }
    return req.status;
}
1292
1293 #ifdef KVM_CAP_IRQ_ROUTING
/* A dynamically created MSI route, linked into an s->msi_hashtab[] bucket. */
typedef struct KVMMSIRoute {
    struct kvm_irq_routing_entry kroute;    /* routing entry given to KVM */
    QTAILQ_ENTRY(KVMMSIRoute) entry;        /* hash-bucket list linkage */
} KVMMSIRoute;
1298
/* Mark @gsi as in use in s->used_gsi_bitmap. */
static void set_gsi(KVMState *s, unsigned int gsi)
{
    set_bit(gsi, s->used_gsi_bitmap);
}
1303
/* Mark @gsi as free in s->used_gsi_bitmap. */
static void clear_gsi(KVMState *s, unsigned int gsi)
{
    clear_bit(gsi, s->used_gsi_bitmap);
}
1308
/*
 * Set up the IRQ routing state in @s: the GSI allocation bitmap (when the
 * KVM_CAP_IRQ_ROUTING value minus one is positive), an empty routing table,
 * and - if direct MSI injection is not allowed - the MSI route hash table.
 * Finishes by calling the architecture hook.
 */
void kvm_init_irq_routing(KVMState *s)
{
    const int nr_gsis = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1;
    int bucket;

    if (nr_gsis > 0) {
        /* one bit per allocatable GSI */
        s->used_gsi_bitmap = bitmap_new(nr_gsis);
        s->gsi_count = nr_gsis;
    }

    /* header only; entries are added on demand */
    s->nr_allocated_irq_routes = 0;
    s->irq_routes = g_malloc0(sizeof(*s->irq_routes));

    if (!kvm_direct_msi_allowed) {
        for (bucket = 0; bucket < KVM_MSI_HASHTAB_SIZE; bucket++) {
            QTAILQ_INIT(&s->msi_hashtab[bucket]);
        }
    }

    kvm_arch_init_irq_routing(s);
}
1331
/*
 * Push the routing table in s->irq_routes to the kernel with
 * KVM_SET_GSI_ROUTING.  Does nothing when GSIs are direct-mapped or GSI
 * routing is not enabled.  The ioctl is asserted to succeed.
 */
void kvm_irqchip_commit_routes(KVMState *s)
{
    int rc;

    if (kvm_gsi_direct_mapping() || !kvm_gsi_routing_enabled()) {
        return;
    }

    s->irq_routes->flags = 0;
    trace_kvm_irqchip_commit_routes();
    rc = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
    assert(rc == 0);
}
1349
/*
 * Append a copy of @entry to s->irq_routes, growing the table when it is
 * full (capacity doubles, with a minimum of 64 entries), and mark the
 * entry's GSI as used.
 */
static void kvm_add_routing_entry(KVMState *s,
                                  struct kvm_irq_routing_entry *entry)
{
    struct kvm_irq_routing_entry *slot;
    int capacity, bytes, idx;

    if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
        capacity = s->nr_allocated_irq_routes * 2;
        capacity = capacity < 64 ? 64 : capacity;

        /* table header followed by 'capacity' entries */
        bytes = sizeof(struct kvm_irq_routing);
        bytes += capacity * sizeof(*slot);

        s->irq_routes = g_realloc(s->irq_routes, bytes);
        s->nr_allocated_irq_routes = capacity;
    }

    idx = s->irq_routes->nr++;
    slot = &s->irq_routes->entries[idx];
    *slot = *entry;

    set_gsi(s, entry->gsi);
}
1373
/*
 * Overwrite the first routing entry whose GSI equals @new_entry->gsi with
 * @new_entry (skipping the copy when they are already byte-identical).
 *
 * Returns 0 if an entry with that GSI exists, -ESRCH otherwise.
 */
static int kvm_update_routing_entry(KVMState *s,
                                    struct kvm_irq_routing_entry *new_entry)
{
    struct kvm_irq_routing_entry *cur;
    int idx;

    for (idx = 0; idx < s->irq_routes->nr; idx++) {
        cur = &s->irq_routes->entries[idx];
        if (cur->gsi == new_entry->gsi) {
            if (memcmp(cur, new_entry, sizeof(*cur)) != 0) {
                *cur = *new_entry;
            }
            return 0;
        }
    }

    return -ESRCH;
}
1397
/*
 * Add an irqchip-type routing entry mapping GSI @irq to @pin of @irqchip.
 * @pin is asserted to be below s->gsi_count.  The table is not committed
 * to the kernel here.
 */
void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
{
    struct kvm_irq_routing_entry route = {};

    assert(pin < s->gsi_count);

    route.type = KVM_IRQ_ROUTING_IRQCHIP;
    route.flags = 0;
    route.gsi = irq;
    route.u.irqchip.pin = pin;
    route.u.irqchip.irqchip = irqchip;

    kvm_add_routing_entry(s, &route);
}
1411
/*
 * Remove every routing entry whose GSI equals @virq, mark the GSI as free
 * and notify the architecture code.  No-op when GSIs are direct-mapped.
 * The updated table is not committed to the kernel here.
 */
void kvm_irqchip_release_virq(KVMState *s, int virq)
{
    struct kvm_irq_routing_entry *e;
    int i;

    if (kvm_gsi_direct_mapping()) {
        return;
    }

    i = 0;
    while (i < s->irq_routes->nr) {
        e = &s->irq_routes->entries[i];
        if (e->gsi == virq) {
            /*
             * Remove by moving the last entry into this position.  Do not
             * advance: the entry just moved here has not been examined yet
             * and may carry the same GSI.
             */
            s->irq_routes->nr--;
            *e = s->irq_routes->entries[s->irq_routes->nr];
        } else {
            i++;
        }
    }
    clear_gsi(s, virq);
    kvm_arch_release_virq_post(virq);
    trace_kvm_irqchip_release_virq(virq);
}
1432
/* Add @n to the kvm_irqchip_change_notifiers list. */
void kvm_irqchip_add_change_notifier(Notifier *n)
{
    notifier_list_add(&kvm_irqchip_change_notifiers, n);
}
1437
/* Remove @n from the notifier list it is currently on. */
void kvm_irqchip_remove_change_notifier(Notifier *n)
{
    notifier_remove(n);
}
1442
/* Invoke every notifier on kvm_irqchip_change_notifiers with NULL data. */
void kvm_irqchip_change_notify(void)
{
    notifier_list_notify(&kvm_irqchip_change_notifiers, NULL);
}
1447
/*
 * Hash an MSI data word to a bucket index: the low 8 bits of @data.
 *
 * This is optimized for the IA32 MSI layout.  However, no other arch shall
 * repeat the mistake of not providing a direct MSI injection API.
 */
static unsigned int kvm_hash_msi(uint32_t data)
{
    const uint32_t low_byte_mask = 0xff;

    return data & low_byte_mask;
}
1454
/*
 * Drop every route stored in the MSI hash table: release its virq, unlink
 * it from its bucket and free it.
 */
static void kvm_flush_dynamic_msi_routes(KVMState *s)
{
    KVMMSIRoute *cur, *following;
    unsigned int bucket;

    for (bucket = 0; bucket < KVM_MSI_HASHTAB_SIZE; bucket++) {
        /* _SAFE variant: the current element is freed inside the loop */
        QTAILQ_FOREACH_SAFE(cur, &s->msi_hashtab[bucket], entry, following) {
            kvm_irqchip_release_virq(s, cur->kroute.gsi);
            QTAILQ_REMOVE(&s->msi_hashtab[bucket], cur, entry);
            g_free(cur);
        }
    }
}
1468
/*
 * Pick the lowest GSI not marked in s->used_gsi_bitmap.
 *
 * Returns the GSI number, or -ENOSPC when all s->gsi_count GSIs are in use.
 * The GSI is not marked as used here.
 */
static int kvm_irqchip_get_virq(KVMState *s)
{
    int candidate;

    /*
     * PIC and IOAPIC share the first 16 GSI numbers, so there can be more
     * free GSI numbers than free IRQ route entries: a GSI may be handed out
     * even though no new route entry can be added.  When the route table is
     * full, flush the dynamic MSI entries to make room.
     */
    if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) {
        kvm_flush_dynamic_msi_routes(s);
    }

    candidate = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count);

    return candidate < s->gsi_count ? candidate : -ENOSPC;
}
1491
kvm_lookup_msi_route(KVMState * s,MSIMessage msg)1492 static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg)
1493 {
1494 unsigned int hash = kvm_hash_msi(msg.data);
1495 KVMMSIRoute *route;
1496
1497 QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) {
1498 if (route->kroute.u.msi.address_lo == (uint32_t)msg.address &&
1499 route->kroute.u.msi.address_hi == (msg.address >> 32) &&
1500 route->kroute.u.msi.data == le32_to_cpu(msg.data)) {
1501 return route;
1502 }
1503 }
1504 return NULL;
1505 }
1506
/*
 * Inject the MSI described by @msg.
 *
 * When direct MSI injection is allowed the message is signalled with
 * KVM_SIGNAL_MSI.  Otherwise a matching cached route is looked up (or a new
 * one is created, committed and cached) and its GSI is raised.
 *
 * Returns the ioctl / kvm_set_irq() result, or a negative errno when no virq
 * could be allocated.
 */
int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
{
    KVMMSIRoute *route;

    if (kvm_direct_msi_allowed) {
        struct kvm_msi msi;

        msi.flags = 0;
        msi.data = le32_to_cpu(msg.data);
        msi.address_lo = (uint32_t)msg.address;
        msi.address_hi = msg.address >> 32;
        memset(msi.pad, 0, sizeof(msi.pad));

        return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
    }

    route = kvm_lookup_msi_route(s, msg);
    if (route == NULL) {
        const int virq = kvm_irqchip_get_virq(s);

        if (virq < 0) {
            return virq;
        }

        /* build, install and cache a fresh MSI route for this message */
        route = g_malloc0(sizeof(*route));
        route->kroute.type = KVM_IRQ_ROUTING_MSI;
        route->kroute.gsi = virq;
        route->kroute.flags = 0;
        route->kroute.u.msi.data = le32_to_cpu(msg.data);
        route->kroute.u.msi.address_lo = (uint32_t)msg.address;
        route->kroute.u.msi.address_hi = msg.address >> 32;

        kvm_add_routing_entry(s, &route->kroute);
        kvm_irqchip_commit_routes(s);

        QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route,
                           entry);
    }

    assert(route->kroute.type == KVM_IRQ_ROUTING_MSI);

    return kvm_set_irq(s, route->kroute.gsi, 1);
}
1550
/*
 * Allocate a virq and install an MSI route for @vector of @dev.
 *
 * The MSI message is read from @dev when PCI is available and @dev is
 * non-NULL; otherwise a zeroed message is used.
 *
 * Returns the virq on success; with direct GSI mapping, the value from
 * kvm_arch_msi_data_to_gsi(); -ENOSYS when GSI routing is not enabled;
 * -EINVAL when the architecture fixup rejects the route; or the negative
 * value from the virq allocator.
 */
int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
{
    MSIMessage msg = {0, 0};
    struct kvm_irq_routing_entry route = {};
    int gsi;

    if (pci_available && dev) {
        msg = pci_get_msi_message(dev, vector);
    }

    if (kvm_gsi_direct_mapping()) {
        return kvm_arch_msi_data_to_gsi(msg.data);
    }
    if (!kvm_gsi_routing_enabled()) {
        return -ENOSYS;
    }

    gsi = kvm_irqchip_get_virq(s);
    if (gsi < 0) {
        return gsi;
    }

    route.type = KVM_IRQ_ROUTING_MSI;
    route.gsi = gsi;
    route.flags = 0;
    route.u.msi.data = le32_to_cpu(msg.data);
    route.u.msi.address_lo = (uint32_t)msg.address;
    route.u.msi.address_hi = msg.address >> 32;
    if (pci_available && kvm_msi_devid_required()) {
        route.flags = KVM_MSI_VALID_DEVID;
        route.u.msi.devid = pci_requester_id(dev);
    }

    if (kvm_arch_fixup_msi_route(&route, msg.address, msg.data, dev) != 0) {
        kvm_irqchip_release_virq(s, gsi);
        return -EINVAL;
    }

    trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A",
                                    vector, gsi);

    kvm_add_routing_entry(s, &route);
    kvm_arch_add_msi_route_post(&route, vector, dev);
    kvm_irqchip_commit_routes(s);

    return gsi;
}
1598
/*
 * Rewrite the routing entry for @virq so that it delivers @msg.
 *
 * Returns 0 on success or when GSIs are direct-mapped, -ENOSYS without an
 * in-kernel irqchip, -EINVAL when the architecture fixup rejects the route,
 * or the result of kvm_update_routing_entry().  The table is not committed
 * to the kernel here.
 */
int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
                                 PCIDevice *dev)
{
    struct kvm_irq_routing_entry route = {};

    if (kvm_gsi_direct_mapping()) {
        return 0;
    }
    if (!kvm_irqchip_in_kernel()) {
        return -ENOSYS;
    }

    route.type = KVM_IRQ_ROUTING_MSI;
    route.gsi = virq;
    route.flags = 0;
    route.u.msi.data = le32_to_cpu(msg.data);
    route.u.msi.address_lo = (uint32_t)msg.address;
    route.u.msi.address_hi = msg.address >> 32;
    if (pci_available && kvm_msi_devid_required()) {
        route.flags = KVM_MSI_VALID_DEVID;
        route.u.msi.devid = pci_requester_id(dev);
    }

    if (kvm_arch_fixup_msi_route(&route, msg.address, msg.data, dev) != 0) {
        return -EINVAL;
    }

    trace_kvm_irqchip_update_msi_route(virq);

    return kvm_update_routing_entry(s, &route);
}
1630
/*
 * Attach (@assign true) or detach (@assign false) eventfd @fd to GSI @virq
 * via KVM_IRQFD.  When @rfd is not -1 it is passed as the resample fd.
 *
 * Returns the ioctl result, or -ENOSYS when irqfds are not enabled.
 */
static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int rfd, int virq,
                                    bool assign)
{
    struct kvm_irqfd req = {
        .gsi = virq,
        .fd = fd,
    };

    if (!kvm_irqfds_enabled()) {
        return -ENOSYS;
    }

    if (!assign) {
        req.flags = KVM_IRQFD_FLAG_DEASSIGN;
    }
    if (rfd != -1) {
        req.resamplefd = rfd;
        req.flags |= KVM_IRQFD_FLAG_RESAMPLE;
    }

    return kvm_vm_ioctl(s, KVM_IRQFD, &req);
}
1651
/*
 * Allocate a virq and add an S390 adapter routing entry filled from
 * @adapter.  The table is not committed to the kernel here.
 *
 * Returns the virq, -ENOSYS when GSI routing is not enabled, or the negative
 * value from the virq allocator.
 */
int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
{
    struct kvm_irq_routing_entry route = {};
    int gsi;

    if (!kvm_gsi_routing_enabled()) {
        return -ENOSYS;
    }

    gsi = kvm_irqchip_get_virq(s);
    if (gsi < 0) {
        return gsi;
    }

    route.type = KVM_IRQ_ROUTING_S390_ADAPTER;
    route.gsi = gsi;
    route.flags = 0;
    route.u.adapter.adapter_id = adapter->adapter_id;
    route.u.adapter.summary_addr = adapter->summary_addr;
    route.u.adapter.summary_offset = adapter->summary_offset;
    route.u.adapter.ind_addr = adapter->ind_addr;
    route.u.adapter.ind_offset = adapter->ind_offset;

    kvm_add_routing_entry(s, &route);

    return gsi;
}
1679
/*
 * Allocate a virq, add a Hyper-V SINT routing entry for (@vcpu, @sint) and
 * commit the routing table.
 *
 * Returns the virq, -ENOSYS when GSI routing is not enabled or
 * KVM_CAP_HYPERV_SYNIC is missing, or the negative value from the virq
 * allocator.
 */
int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
{
    struct kvm_irq_routing_entry route = {};
    int gsi;

    if (!kvm_gsi_routing_enabled() ||
        !kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) {
        return -ENOSYS;
    }

    gsi = kvm_irqchip_get_virq(s);
    if (gsi < 0) {
        return gsi;
    }

    route.type = KVM_IRQ_ROUTING_HV_SINT;
    route.gsi = gsi;
    route.flags = 0;
    route.u.hv_sint.sint = sint;
    route.u.hv_sint.vcpu = vcpu;

    kvm_add_routing_entry(s, &route);
    kvm_irqchip_commit_routes(s);

    return gsi;
}
1707
1708 #else /* !KVM_CAP_IRQ_ROUTING */
1709
/* Build without KVM_CAP_IRQ_ROUTING: there is no routing state to set up. */
void kvm_init_irq_routing(KVMState *s)
{
}
1713
/* Build without KVM_CAP_IRQ_ROUTING: no virqs exist, so nothing to release. */
void kvm_irqchip_release_virq(KVMState *s, int virq)
{
}
1717
/* Build without KVM_CAP_IRQ_ROUTING: must never be called; aborts. */
int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
{
    abort();
}
1722
/* Build without KVM_CAP_IRQ_ROUTING: MSI routes are unsupported (-ENOSYS). */
int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
{
    return -ENOSYS;
}
1727
/* Build without KVM_CAP_IRQ_ROUTING: adapter routes are unsupported (-ENOSYS). */
int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
{
    return -ENOSYS;
}
1732
/* Build without KVM_CAP_IRQ_ROUTING: SINT routes are unsupported (-ENOSYS). */
int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
{
    return -ENOSYS;
}
1737
/*
 * Build without KVM_CAP_IRQ_ROUTING: must never be called; aborts.
 *
 * The parameter list includes @rfd so that it matches the five-argument
 * calls made by the irqfd notifier helpers in this file; with only four
 * parameters this configuration would not compile.
 */
static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int rfd, int virq,
                                    bool assign)
{
    abort();
}
1742
kvm_irqchip_update_msi_route(KVMState * s,int virq,MSIMessage msg)1743 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
1744 {
1745 return -ENOSYS;
1746 }
1747 #endif /* !KVM_CAP_IRQ_ROUTING */
1748
/*
 * Attach notifier @n to GSI @virq as an irqfd.  If @rn is non-NULL its fd is
 * passed as the resample fd, otherwise -1 is passed.
 *
 * Returns the result of kvm_irqchip_assign_irqfd().
 */
int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
                                       EventNotifier *rn, int virq)
{
    int resample_fd = -1;

    if (rn) {
        resample_fd = event_notifier_get_fd(rn);
    }

    return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), resample_fd,
                                    virq, true);
}
1755
/*
 * Detach notifier @n from GSI @virq.  No resample fd is passed (-1).
 *
 * Returns the result of kvm_irqchip_assign_irqfd().
 */
int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
                                          int virq)
{
    const int notifier_fd = event_notifier_get_fd(n);

    return kvm_irqchip_assign_irqfd(s, notifier_fd, -1, virq, false);
}
1762
/*
 * Attach notifier @n (and optional resample notifier @rn) to the GSI that
 * s->gsimap associates with @irq.
 *
 * Returns -ENXIO when @irq has no mapping, otherwise the result of
 * kvm_irqchip_add_irqfd_notifier_gsi().
 */
int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
                                   EventNotifier *rn, qemu_irq irq)
{
    gpointer key, gsi;

    if (!g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi)) {
        return -ENXIO;
    }

    return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi));
}
1774
/*
 * Detach notifier @n from the GSI that s->gsimap associates with @irq.
 *
 * Returns -ENXIO when @irq has no mapping, otherwise the result of
 * kvm_irqchip_remove_irqfd_notifier_gsi().
 */
int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n,
                                      qemu_irq irq)
{
    gpointer key, gsi;

    if (!g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi)) {
        return -ENXIO;
    }

    return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi));
}
1786
/* Record in s->gsimap that @irq corresponds to GSI number @gsi. */
void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
{
    g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
}
1791
/*
 * Create the in-kernel irqchip for @s.
 *
 * Returns silently when neither KVM_CAP_IRQCHIP nor KVM_CAP_S390_IRQCHIP is
 * available.  Any creation failure terminates the process with exit(1).  On
 * success the in-kernel irqchip flags are set, IRQ routing is initialised
 * and s->gsimap is created.
 */
static void kvm_irqchip_create(KVMState *s)
{
    int ret;

    /* the caller must already have resolved the "auto" split setting */
    assert(s->kernel_irqchip_split != ON_OFF_AUTO_AUTO);
    if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
        ;
    } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
        ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0);
        if (ret < 0) {
            fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret));
            exit(1);
        }
    } else {
        return;
    }

    /* First probe and see if there's a arch-specific hook to create the
     * in-kernel irqchip for us */
    ret = kvm_arch_irqchip_create(s);
    if (ret == 0) {
        if (s->kernel_irqchip_split == ON_OFF_AUTO_ON) {
            /*
             * Not an errno-style failure, so use error_report() rather than
             * perror(), which would append an unrelated errno description.
             */
            error_report("Split IRQ chip mode not supported.");
            exit(1);
        } else {
            ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
        }
    }
    if (ret < 0) {
        fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
        exit(1);
    }

    kvm_kernel_irqchip = true;
    /* If we have an in-kernel IRQ chip then we must have asynchronous
     * interrupt delivery (though the reverse is not necessarily true)
     */
    kvm_async_interrupts_allowed = true;
    kvm_halt_in_kernel_allowed = true;

    kvm_init_irq_routing(s);

    s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal);
}
1836
1837 /* Find number of supported CPUs using the recommended
1838 * procedure from the kernel API documentation to cope with
1839 * older kernels that may be missing capabilities.
1840 */
kvm_recommended_vcpus(KVMState * s)1841 static int kvm_recommended_vcpus(KVMState *s)
1842 {
1843 int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS);
1844 return (ret) ? ret : 4;
1845 }
1846
/*
 * Return the KVM_CAP_MAX_VCPUS value, falling back to the recommended vcpu
 * count when the capability reports 0.
 */
static int kvm_max_vcpus(KVMState *s)
{
    const int reported = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);

    if (reported == 0) {
        return kvm_recommended_vcpus(s);
    }
    return reported;
}
1852
/*
 * Return the KVM_CAP_MAX_VCPU_ID value, falling back to kvm_max_vcpus() when
 * the capability reports 0.
 */
static int kvm_max_vcpu_id(KVMState *s)
{
    const int reported = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID);

    if (reported == 0) {
        return kvm_max_vcpus(s);
    }
    return reported;
}
1858
kvm_vcpu_id_is_valid(int vcpu_id)1859 bool kvm_vcpu_id_is_valid(int vcpu_id)
1860 {
1861 KVMState *s = KVM_STATE(current_accel());
1862 return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s);
1863 }
1864
/*
 * Accelerator init hook: open /dev/kvm, validate the API version, create the
 * VM, probe capabilities into @ms's KVMState, run architecture init, create
 * the in-kernel irqchip when allowed and register the memory listeners.
 *
 * Returns 0 on success.  On failure a negative value is returned after
 * closing any file descriptors opened here and freeing the address-space
 * and slot tables.  Requesting more vcpus than KVM's hard limit exits the
 * process instead of returning.
 */
static int kvm_init(MachineState *ms)
{
    MachineClass *mc = MACHINE_GET_CLASS(ms);
    static const char upgrade_note[] =
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
        "(see http://sourceforge.net/projects/kvm).\n";
    struct {
        const char *name;
        int num;
    } num_cpus[] = {
        { "SMP", ms->smp.cpus },
        { "hotpluggable", ms->smp.max_cpus },
        { NULL, }
    }, *nc = num_cpus;
    int soft_vcpus_limit, hard_vcpus_limit;
    KVMState *s;
    const KVMCapabilityInfo *missing_cap;
    int ret;
    int type = 0;
    const char *kvm_type;

    s = KVM_STATE(ms->accelerator);

    /*
     * On systems where the kernel can support different base page
     * sizes, host page size may be different from TARGET_PAGE_SIZE,
     * even with KVM.  TARGET_PAGE_SIZE is assumed to be the minimum
     * page size for the system though.
     */
    assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size);

    s->sigmask_len = 8;

#ifdef KVM_CAP_SET_GUEST_DEBUG
    QTAILQ_INIT(&s->kvm_sw_breakpoints);
#endif
    QLIST_INIT(&s->kvm_parked_vcpus);
    s->vmfd = -1;
    s->fd = qemu_open("/dev/kvm", O_RDWR);
    if (s->fd == -1) {
        /*
         * Capture errno before calling fprintf(): stdio may modify errno,
         * which would make us return the wrong error code.  errno is still
         * intact when %m is formatted.
         */
        ret = -errno;
        fprintf(stderr, "Could not access KVM kernel module: %m\n");
        goto err;
    }

    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
    if (ret < KVM_API_VERSION) {
        /* a non-negative but too small version is turned into an error */
        if (ret >= 0) {
            ret = -EINVAL;
        }
        fprintf(stderr, "kvm version too old\n");
        goto err;
    }

    if (ret > KVM_API_VERSION) {
        ret = -EINVAL;
        fprintf(stderr, "kvm version not supported\n");
        goto err;
    }

    kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
    s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);

    /* If unspecified, use the default value */
    if (!s->nr_slots) {
        s->nr_slots = 32;
    }

    /* always at least one address space */
    s->nr_as = kvm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE);
    if (s->nr_as <= 1) {
        s->nr_as = 1;
    }
    s->as = g_new0(struct KVMAs, s->nr_as);

    /* "kvm-type" is only meaningful when the machine class interprets it */
    kvm_type = qemu_opt_get(qemu_get_machine_opts(), "kvm-type");
    if (mc->kvm_type) {
        type = mc->kvm_type(ms, kvm_type);
    } else if (kvm_type) {
        ret = -EINVAL;
        fprintf(stderr, "Invalid argument kvm-type=%s\n", kvm_type);
        goto err;
    }

    /* retry VM creation if interrupted */
    do {
        ret = kvm_ioctl(s, KVM_CREATE_VM, type);
    } while (ret == -EINTR);

    if (ret < 0) {
        fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret,
                strerror(-ret));

#ifdef TARGET_S390X
        if (ret == -EINVAL) {
            fprintf(stderr,
                    "Host kernel setup problem detected. Please verify:\n");
            fprintf(stderr, "- for kernels supporting the switch_amode or"
                    " user_mode parameters, whether\n");
            fprintf(stderr,
                    " user space is running in primary address space\n");
            fprintf(stderr,
                    "- for kernels supporting the vm.allocate_pgste sysctl, "
                    "whether it is enabled\n");
        }
#endif
        goto err;
    }

    s->vmfd = ret;

    /* check the vcpu limits */
    soft_vcpus_limit = kvm_recommended_vcpus(s);
    hard_vcpus_limit = kvm_max_vcpus(s);

    while (nc->name) {
        if (nc->num > soft_vcpus_limit) {
            warn_report("Number of %s cpus requested (%d) exceeds "
                        "the recommended cpus supported by KVM (%d)",
                        nc->name, nc->num, soft_vcpus_limit);

            if (nc->num > hard_vcpus_limit) {
                fprintf(stderr, "Number of %s cpus requested (%d) exceeds "
                        "the maximum cpus supported by KVM (%d)\n",
                        nc->name, nc->num, hard_vcpus_limit);
                exit(1);
            }
        }
        nc++;
    }

    /* generic requirements first, then architecture-specific ones */
    missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
    if (!missing_cap) {
        missing_cap =
            kvm_check_extension_list(s, kvm_arch_required_capabilities);
    }
    if (missing_cap) {
        ret = -EINVAL;
        fprintf(stderr, "kvm does not support %s\n%s",
                missing_cap->name, upgrade_note);
        goto err;
    }

    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
    s->coalesced_pio = s->coalesced_mmio &&
                       kvm_check_extension(s, KVM_CAP_COALESCED_PIO);

    s->manual_dirty_log_protect =
        kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
    if (s->manual_dirty_log_protect) {
        ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0, 1);
        if (ret) {
            warn_report("Trying to enable KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 "
                        "but failed. Falling back to the legacy mode. ");
            s->manual_dirty_log_protect = false;
        }
    }

#ifdef KVM_CAP_VCPU_EVENTS
    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
#endif

    s->robust_singlestep =
        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);

#ifdef KVM_CAP_DEBUGREGS
    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
#endif

    s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE);

#ifdef KVM_CAP_IRQ_ROUTING
    kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
#endif

    s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3);

    /* prefer the ioctl variant that reports injection status */
    s->irq_set_ioctl = KVM_IRQ_LINE;
    if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
        s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
    }

    kvm_readonly_mem_allowed =
        (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);

    kvm_eventfds_allowed =
        (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0);

    kvm_irqfds_allowed =
        (kvm_check_extension(s, KVM_CAP_IRQFD) > 0);

    kvm_resamplefds_allowed =
        (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0);

    kvm_vm_attributes_allowed =
        (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0);

    kvm_ioeventfd_any_length_allowed =
        (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0);

    kvm_state = s;

    /*
     * if memory encryption object is specified then initialize the memory
     * encryption context.
     */
    if (ms->memory_encryption) {
        kvm_state->memcrypt_handle = sev_guest_init(ms->memory_encryption);
        if (!kvm_state->memcrypt_handle) {
            ret = -1;
            goto err;
        }

        kvm_state->memcrypt_encrypt_data = sev_encrypt_data;
    }

    ret = kvm_arch_init(ms, s);
    if (ret < 0) {
        goto err;
    }

    /* resolve "auto" to the machine class default before irqchip creation */
    if (s->kernel_irqchip_split == ON_OFF_AUTO_AUTO) {
        s->kernel_irqchip_split = mc->default_kernel_irqchip_split
                                  ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
    }

    if (s->kernel_irqchip_allowed) {
        kvm_irqchip_create(s);
    }

    if (kvm_eventfds_allowed) {
        s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add;
        s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del;
    }
    s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region;
    s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region;

    kvm_memory_listener_register(s, &s->memory_listener,
                                 &address_space_memory, 0);
    memory_listener_register(&kvm_io_listener,
                             &address_space_io);
    memory_listener_register(&kvm_coalesced_pio_listener,
                             &address_space_io);

    s->many_ioeventfds = kvm_check_many_ioeventfds();

    s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
    if (!s->sync_mmu) {
        qemu_balloon_inhibit(true);
    }

    return 0;

err:
    assert(ret < 0);
    if (s->vmfd >= 0) {
        close(s->vmfd);
    }
    if (s->fd != -1) {
        close(s->fd);
    }
    g_free(s->memory_listener.slots);
    /*
     * Every "goto err" precedes listener registration, so nothing refers to
     * the address-space table yet; release it instead of leaking it.
     */
    g_free(s->as);
    s->as = NULL;

    return ret;
}
2127
/* Override the signal-mask length stored in @s (kvm_init() sets it to 8). */
void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len)
{
    s->sigmask_len = sigmask_len;
}
2132
/*
 * Perform @count accesses of @size bytes each on I/O port @port, using
 * consecutive @size-byte chunks of @data as the buffer.
 *
 * @direction: KVM_EXIT_IO_OUT selects a write; any other value a read
 * @attrs:     transaction attributes passed to address_space_rw()
 */
static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,
                          int size, uint32_t count)
{
    /*
     * Same type as @count: avoids a signed/unsigned comparison and signed
     * overflow of the counter for counts above INT_MAX.
     */
    uint32_t i;
    uint8_t *ptr = data;

    for (i = 0; i < count; i++) {
        address_space_rw(&address_space_io, port, attrs,
                         ptr, size,
                         direction == KVM_EXIT_IO_OUT);
        ptr += size;
    }
}
2146
/*
 * Report a KVM_EXIT_INTERNAL_ERROR exit on stderr.
 *
 * Prints the suberror code and, when KVM_CAP_INTERNAL_ERROR_DATA is
 * available, the extra data words supplied by the kernel.
 *
 * Returns EXCP_INTERRUPT for an emulation failure that the architecture
 * code does not want to stop on (after dumping the CPU state), and -1 in
 * every other case.
 */
static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run)
{
    fprintf(stderr, "KVM internal error. Suberror: %d\n",
            run->internal.suberror);

    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
        int idx = 0;

        while (idx < run->internal.ndata) {
            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
                    idx, (uint64_t)run->internal.data[idx]);
            ++idx;
        }
    }

    /* FIXME: Should trigger a qmp message to let management know
     * something went wrong.
     */
    if (run->internal.suberror != KVM_INTERNAL_ERROR_EMULATION) {
        return -1;
    }

    fprintf(stderr, "emulation failure\n");
    if (kvm_arch_stop_on_emulation_error(cpu)) {
        return -1;
    }

    cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
    return EXCP_INTERRUPT;
}
2172
/*
 * Drain the coalesced MMIO/PIO ring of the global KVM state.
 *
 * Each queued entry is replayed as a write: to the I/O address space when
 * the entry's pio field is 1, otherwise to guest physical memory.  The
 * coalesced_flush_in_progress flag makes a nested call return immediately
 * while a flush is already running.
 */
void kvm_flush_coalesced_mmio_buffer(void)
{
    KVMState *s = kvm_state;

    /* A flush is already running; do not re-enter. */
    if (s->coalesced_flush_in_progress) {
        return;
    }

    s->coalesced_flush_in_progress = true;

    if (s->coalesced_mmio_ring) {
        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
        /* Consume entries until the read index catches up with the write index. */
        while (ring->first != ring->last) {
            struct kvm_coalesced_mmio *ent;

            ent = &ring->coalesced_mmio[ring->first];

            if (ent->pio == 1) {
                address_space_write(&address_space_io, ent->phys_addr,
                                    MEMTXATTRS_UNSPECIFIED, ent->data,
                                    ent->len);
            } else {
                cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
            }
            /* Order the replayed write before the update of ring->first. */
            smp_wmb();
            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
        }
    }

    s->coalesced_flush_in_progress = false;
}
2204
/*
 * run_on_cpu() worker: fetch the registers from KVM unless the vCPU state
 * is already marked dirty, then mark it dirty.
 */
static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
    if (cpu->vcpu_dirty) {
        return;
    }

    kvm_arch_get_registers(cpu);
    cpu->vcpu_dirty = true;
}
2212
/*
 * Make sure QEMU's copy of the vCPU registers is current.
 *
 * Nothing is done when the state is already dirty; otherwise the fetch is
 * scheduled through run_on_cpu().
 */
void kvm_cpu_synchronize_state(CPUState *cpu)
{
    if (cpu->vcpu_dirty) {
        return;
    }

    run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL);
}
2219
/*
 * run_on_cpu() worker: write the registers to KVM at the
 * KVM_PUT_RESET_STATE level and clear the dirty flag.
 */
static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
{
    kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);
    cpu->vcpu_dirty = false;
}
2225
/* Schedule do_kvm_cpu_synchronize_post_reset() for @cpu via run_on_cpu(). */
void kvm_cpu_synchronize_post_reset(CPUState *cpu)
{
    run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
}
2230
/*
 * run_on_cpu() worker: write the registers to KVM at the
 * KVM_PUT_FULL_STATE level and clear the dirty flag.
 */
static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
{
    kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE);
    cpu->vcpu_dirty = false;
}
2236
/* Schedule do_kvm_cpu_synchronize_post_init() for @cpu via run_on_cpu(). */
void kvm_cpu_synchronize_post_init(CPUState *cpu)
{
    run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
}
2241
/*
 * run_on_cpu() worker: mark the vCPU state dirty without reading it from
 * KVM.  kvm_cpu_exec() writes dirty state back to KVM before the next
 * KVM_RUN.
 */
static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
{
    cpu->vcpu_dirty = true;
}
2246
/* Schedule do_kvm_cpu_synchronize_pre_loadvm() for @cpu via run_on_cpu(). */
void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
    run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
}
2251
#ifdef KVM_HAVE_MCE_INJECTION
/*
 * Per-thread record of one deferred SIGBUS: filled in by
 * kvm_on_sigbus_vcpu() and consumed by kvm_cpu_exec() after KVM_RUN returns.
 */
static __thread void *pending_sigbus_addr;
static __thread int pending_sigbus_code;
static __thread bool have_sigbus_pending;
#endif
2257
/*
 * Set the immediate_exit flag in @cpu's kvm_run structure.
 * kvm_eat_signals() clears the flag again.
 */
static void kvm_cpu_kick(CPUState *cpu)
{
    atomic_set(&cpu->kvm_run->immediate_exit, 1);
}
2262
kvm_cpu_kick_self(void)2263 static void kvm_cpu_kick_self(void)
2264 {
2265 if (kvm_immediate_exit) {
2266 kvm_cpu_kick(current_cpu);
2267 } else {
2268 qemu_cpu_kick_self();
2269 }
2270 }
2271
/*
 * Consume the kick that interrupted KVM_RUN.
 *
 * With kvm_immediate_exit this only clears the immediate_exit flag in
 * kvm_run.  Otherwise pending SIG_IPI signals are dequeued with a
 * zero-timeout sigtimedwait() until sigpending() no longer reports SIG_IPI.
 * A failure of either call other than EAGAIN/EINTR from sigtimedwait()
 * terminates the process.
 */
static void kvm_eat_signals(CPUState *cpu)
{
    struct timespec ts = { 0, 0 };
    siginfo_t siginfo;
    sigset_t waitset;
    sigset_t chkset;
    int r;

    if (kvm_immediate_exit) {
        atomic_set(&cpu->kvm_run->immediate_exit, 0);
        /* Write kvm_run->immediate_exit before the cpu->exit_request
         * write in kvm_cpu_exec.
         */
        smp_wmb();
        return;
    }

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);

    do {
        /* Zero timeout: poll for a queued SIG_IPI without blocking. */
        r = sigtimedwait(&waitset, &siginfo, &ts);
        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
            perror("sigtimedwait");
            exit(1);
        }

        r = sigpending(&chkset);
        if (r == -1) {
            perror("sigpending");
            exit(1);
        }
    } while (sigismember(&chkset, SIG_IPI));
}
2306
/*
 * Run @cpu inside KVM until an exit needs attention outside this loop.
 *
 * Returns EXCP_HLT when kvm_arch_process_async_events() reports a non-zero
 * result before the guest is entered.  Otherwise KVM_RUN is repeated for as
 * long as the exit handlers return 0; the first non-zero handler result is
 * returned.  A negative result additionally dumps the CPU state and stops
 * the VM with RUN_STATE_INTERNAL_ERROR.
 *
 * The iothread lock is released around the run loop and re-taken before
 * returning; cpu->exit_request is cleared on every return path.
 */
int kvm_cpu_exec(CPUState *cpu)
{
    struct kvm_run *run = cpu->kvm_run;
    int ret, run_ret;

    DPRINTF("kvm_cpu_exec()\n");

    if (kvm_arch_process_async_events(cpu)) {
        atomic_set(&cpu->exit_request, 0);
        return EXCP_HLT;
    }

    qemu_mutex_unlock_iothread();
    cpu_exec_start(cpu);

    do {
        MemTxAttrs attrs;

        /* Push register state modified by QEMU back to KVM before running. */
        if (cpu->vcpu_dirty) {
            kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
            cpu->vcpu_dirty = false;
        }

        kvm_arch_pre_run(cpu, run);
        if (atomic_read(&cpu->exit_request)) {
            DPRINTF("interrupt exit requested\n");
            /*
             * KVM requires us to reenter the kernel after IO exits to complete
             * instruction emulation. This self-signal will ensure that we
             * leave ASAP again.
             */
            kvm_cpu_kick_self();
        }

        /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
         * Matching barrier in kvm_eat_signals.
         */
        smp_rmb();

        run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);

        attrs = kvm_arch_post_run(cpu, run);

#ifdef KVM_HAVE_MCE_INJECTION
        /* Deliver a SIGBUS recorded by kvm_on_sigbus_vcpu() on this thread. */
        if (unlikely(have_sigbus_pending)) {
            qemu_mutex_lock_iothread();
            kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
                                    pending_sigbus_addr);
            have_sigbus_pending = false;
            qemu_mutex_unlock_iothread();
        }
#endif

        if (run_ret < 0) {
            /* EINTR/EAGAIN mean the run was interrupted, not that it failed. */
            if (run_ret == -EINTR || run_ret == -EAGAIN) {
                DPRINTF("io window exit\n");
                kvm_eat_signals(cpu);
                ret = EXCP_INTERRUPT;
                break;
            }
            fprintf(stderr, "error: kvm run failed %s\n",
                    strerror(-run_ret));
#ifdef TARGET_PPC
            if (run_ret == -EBUSY) {
                fprintf(stderr,
                        "This is probably because your SMT is enabled.\n"
                        "VCPU can only run on primary threads with all "
                        "secondary threads offline.\n");
            }
#endif
            ret = -1;
            break;
        }

        trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            /* Called outside BQL */
            kvm_handle_io(run->io.port, attrs,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            ret = 0;
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            /* Called outside BQL */
            address_space_rw(&address_space_memory,
                             run->mmio.phys_addr, attrs,
                             run->mmio.data,
                             run->mmio.len,
                             run->mmio.is_write);
            ret = 0;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_UNKNOWN:
            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
                    (uint64_t)run->hw.hardware_exit_reason);
            ret = -1;
            break;
        case KVM_EXIT_INTERNAL_ERROR:
            ret = kvm_handle_internal_error(cpu, run);
            break;
        case KVM_EXIT_SYSTEM_EVENT:
            switch (run->system_event.type) {
            case KVM_SYSTEM_EVENT_SHUTDOWN:
                qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
                ret = EXCP_INTERRUPT;
                break;
            case KVM_SYSTEM_EVENT_RESET:
                qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
                ret = EXCP_INTERRUPT;
                break;
            case KVM_SYSTEM_EVENT_CRASH:
                kvm_cpu_synchronize_state(cpu);
                qemu_mutex_lock_iothread();
                qemu_system_guest_panicked(cpu_get_crash_info(cpu));
                qemu_mutex_unlock_iothread();
                ret = 0;
                break;
            default:
                DPRINTF("kvm_arch_handle_exit\n");
                ret = kvm_arch_handle_exit(cpu, run);
                break;
            }
            break;
        default:
            /* Anything not handled here is left to the architecture code. */
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(cpu, run);
            break;
        }
    } while (ret == 0);

    cpu_exec_end(cpu);
    qemu_mutex_lock_iothread();

    if (ret < 0) {
        cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
        vm_stop(RUN_STATE_INTERNAL_ERROR);
    }

    atomic_set(&cpu->exit_request, 0);
    return ret;
}
2461
/*
 * Issue an ioctl on the file descriptor stored in @s->fd.
 *
 * Exactly one variadic argument is read, as a pointer, and forwarded.
 * Returns the ioctl result, with a -1 failure converted to -errno.
 */
int kvm_ioctl(KVMState *s, int type, ...)
{
    va_list args;
    void *payload;
    int rc;

    va_start(args, type);
    payload = va_arg(args, void *);
    va_end(args);

    trace_kvm_ioctl(type, payload);
    rc = ioctl(s->fd, type, payload);
    return rc == -1 ? -errno : rc;
}
2479
/*
 * Issue an ioctl on the VM file descriptor stored in @s->vmfd.
 *
 * Exactly one variadic argument is read, as a pointer, and forwarded.
 * Returns the ioctl result, with a -1 failure converted to -errno.
 */
int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    va_list args;
    void *payload;
    int rc;

    va_start(args, type);
    payload = va_arg(args, void *);
    va_end(args);

    trace_kvm_vm_ioctl(type, payload);
    rc = ioctl(s->vmfd, type, payload);
    return rc == -1 ? -errno : rc;
}
2497
/*
 * Issue an ioctl on the vCPU file descriptor stored in @cpu->kvm_fd.
 *
 * Exactly one variadic argument is read, as a pointer, and forwarded.
 * Returns the ioctl result, with a -1 failure converted to -errno.
 */
int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
{
    va_list args;
    void *payload;
    int rc;

    va_start(args, type);
    payload = va_arg(args, void *);
    va_end(args);

    trace_kvm_vcpu_ioctl(cpu->cpu_index, type, payload);
    rc = ioctl(cpu->kvm_fd, type, payload);
    return rc == -1 ? -errno : rc;
}
2515
/*
 * Issue an ioctl on the device file descriptor @fd.
 *
 * Exactly one variadic argument is read, as a pointer, and forwarded.
 * Returns the ioctl result, with a -1 failure converted to -errno.
 */
int kvm_device_ioctl(int fd, int type, ...)
{
    va_list args;
    void *payload;
    int rc;

    va_start(args, type);
    payload = va_arg(args, void *);
    va_end(args);

    trace_kvm_device_ioctl(fd, type, payload);
    rc = ioctl(fd, type, payload);
    return rc == -1 ? -errno : rc;
}
2533
/*
 * Ask the VM file descriptor whether attribute @attr of @group exists.
 *
 * Returns 1 when KVM_HAS_DEVICE_ATTR succeeds, and 0 when it fails or when
 * kvm_vm_attributes_allowed is not set.
 */
int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr)
{
    struct kvm_device_attr query = {
        .group = group,
        .attr = attr,
    };

    if (!kvm_vm_attributes_allowed) {
        return 0;
    }

    /* kvm returns 0 on success for HAS_DEVICE_ATTR */
    return kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &query) == 0;
}
2550
kvm_device_check_attr(int dev_fd,uint32_t group,uint64_t attr)2551 int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
2552 {
2553 struct kvm_device_attr attribute = {
2554 .group = group,
2555 .attr = attr,
2556 .flags = 0,
2557 };
2558
2559 return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1;
2560 }
2561
kvm_device_access(int fd,int group,uint64_t attr,void * val,bool write,Error ** errp)2562 int kvm_device_access(int fd, int group, uint64_t attr,
2563 void *val, bool write, Error **errp)
2564 {
2565 struct kvm_device_attr kvmattr;
2566 int err;
2567
2568 kvmattr.flags = 0;
2569 kvmattr.group = group;
2570 kvmattr.attr = attr;
2571 kvmattr.addr = (uintptr_t)val;
2572
2573 err = kvm_device_ioctl(fd,
2574 write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
2575 &kvmattr);
2576 if (err < 0) {
2577 error_setg_errno(errp, -err,
2578 "KVM_%s_DEVICE_ATTR failed: Group %d "
2579 "attr 0x%016" PRIx64,
2580 write ? "SET" : "GET", group, attr);
2581 }
2582 return err;
2583 }
2584
kvm_has_sync_mmu(void)2585 bool kvm_has_sync_mmu(void)
2586 {
2587 return kvm_state->sync_mmu;
2588 }
2589
kvm_has_vcpu_events(void)2590 int kvm_has_vcpu_events(void)
2591 {
2592 return kvm_state->vcpu_events;
2593 }
2594
kvm_has_robust_singlestep(void)2595 int kvm_has_robust_singlestep(void)
2596 {
2597 return kvm_state->robust_singlestep;
2598 }
2599
kvm_has_debugregs(void)2600 int kvm_has_debugregs(void)
2601 {
2602 return kvm_state->debugregs;
2603 }
2604
kvm_max_nested_state_length(void)2605 int kvm_max_nested_state_length(void)
2606 {
2607 return kvm_state->max_nested_state_len;
2608 }
2609
kvm_has_many_ioeventfds(void)2610 int kvm_has_many_ioeventfds(void)
2611 {
2612 if (!kvm_enabled()) {
2613 return 0;
2614 }
2615 return kvm_state->many_ioeventfds;
2616 }
2617
/*
 * Return the result of checking KVM_CAP_IRQ_ROUTING on the global KVM
 * state.  When the build headers do not define that capability the
 * function always returns false.
 */
int kvm_has_gsi_routing(void)
{
#ifdef KVM_CAP_IRQ_ROUTING
    return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
#else
    return false;
#endif
}
2626
kvm_has_intx_set_mask(void)2627 int kvm_has_intx_set_mask(void)
2628 {
2629 return kvm_state->intx_set_mask;
2630 }
2631
kvm_arm_supports_user_irq(void)2632 bool kvm_arm_supports_user_irq(void)
2633 {
2634 return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ);
2635 }
2636
2637 #ifdef KVM_CAP_SET_GUEST_DEBUG
/*
 * Look up the software breakpoint registered at @pc in the breakpoint list
 * of @cpu's KVM state.  Returns the entry, or NULL when none matches.
 */
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu,
                                                 target_ulong pc)
{
    struct kvm_sw_breakpoint *cur;

    QTAILQ_FOREACH(cur, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
        if (cur->pc != pc) {
            continue;
        }
        return cur;
    }
    return NULL;
}
2650
/* Return 1 when @cpu's KVM state holds at least one software breakpoint, else 0. */
int kvm_sw_breakpoints_active(CPUState *cpu)
{
    if (QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints)) {
        return 0;
    }
    return 1;
}
2655
/*
 * Argument bundle passed to kvm_invoke_set_guest_debug() through
 * run_on_cpu(): the request to send and a slot for the ioctl result.
 */
struct kvm_set_guest_debug_data {
    struct kvm_guest_debug dbg; /* payload for KVM_SET_GUEST_DEBUG */
    int err;                    /* return value of the ioctl */
};
2660
/*
 * run_on_cpu() worker: issue KVM_SET_GUEST_DEBUG with the request carried
 * in @data and store the ioctl result next to it.
 */
static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
{
    struct kvm_set_guest_debug_data *req = data.host_ptr;
    int rc = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG, &req->dbg);

    req->err = rc;
}
2669
/*
 * Rebuild the guest debug control word for @cpu and apply it.
 *
 * The control word starts as @reinject_trap, gains the enable and
 * single-step bits when single-stepping is on, and is then completed by
 * the architecture hook.  Returns the result of KVM_SET_GUEST_DEBUG.
 */
int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data req;

    req.dbg.control = reinject_trap;
    if (cpu->singlestep_enabled) {
        req.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }

    kvm_arch_update_guest_debug(cpu, &req.dbg);

    /* The ioctl is issued through run_on_cpu(), which fills in req.err. */
    run_on_cpu(cpu, kvm_invoke_set_guest_debug, RUN_ON_CPU_HOST_PTR(&req));

    return req.err;
}
2685
/*
 * Insert a breakpoint of kind @type at @addr.
 *
 * A software breakpoint that already exists only has its use count bumped;
 * a new one is registered with the architecture code and added to the list.
 * Any other type is forwarded to the hardware breakpoint hook.  Afterwards
 * the debug state of every CPU is refreshed.
 *
 * Returns 0 on success or the first non-zero error encountered.
 */
int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    int err;

    if (type != GDB_BREAKPOINT_SW) {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    } else {
        struct kvm_sw_breakpoint *bp = kvm_find_sw_breakpoint(cpu, addr);

        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = g_malloc(sizeof(*bp));
        bp->pc = addr;
        bp->use_count = 1;

        err = kvm_arch_insert_sw_breakpoint(cpu, bp);
        if (err) {
            g_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
    }

    /* Note: the loop reuses the @cpu parameter as its iterator. */
    CPU_FOREACH(cpu) {
        err = kvm_update_guest_debug(cpu, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}
2724
/*
 * Remove a breakpoint of kind @type at @addr.
 *
 * For a software breakpoint the use count is dropped first; the entry is
 * only unregistered and freed when the last user goes away.  Any other type
 * is forwarded to the hardware breakpoint hook.  Afterwards the debug state
 * of every CPU is refreshed.
 *
 * Returns 0 on success, -ENOENT when no software breakpoint exists at
 * @addr, or the first non-zero error encountered.
 */
int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    int err;

    if (type != GDB_BREAKPOINT_SW) {
        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    } else {
        struct kvm_sw_breakpoint *bp = kvm_find_sw_breakpoint(cpu, addr);

        if (!bp) {
            return -ENOENT;
        }

        if (bp->use_count > 1) {
            bp->use_count--;
            return 0;
        }

        err = kvm_arch_remove_sw_breakpoint(cpu, bp);
        if (err) {
            return err;
        }

        QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    }

    /* Note: the loop reuses the @cpu parameter as its iterator. */
    CPU_FOREACH(cpu) {
        err = kvm_update_guest_debug(cpu, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}
2764
/*
 * Drop every software and hardware breakpoint and refresh the debug state
 * of all CPUs.  List entries are freed even when no CPU manages to remove
 * the breakpoint from the guest.
 */
void kvm_remove_all_breakpoints(CPUState *cpu)
{
    KVMState *s = cpu->kvm_state;
    struct kvm_sw_breakpoint *bp, *next;
    CPUState *other;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(cpu, bp)) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            CPU_FOREACH(other) {
                if (!kvm_arch_remove_sw_breakpoint(other, bp)) {
                    break;
                }
            }
        }
        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    }

    kvm_arch_remove_all_hw_breakpoints();

    /* Note: the loop reuses the @cpu parameter as its iterator. */
    CPU_FOREACH(cpu) {
        kvm_update_guest_debug(cpu, 0);
    }
}
2789
2790 #else /* !KVM_CAP_SET_GUEST_DEBUG */
2791
/* Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: always fails with -EINVAL. */
int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
{
    return -EINVAL;
}
2796
/* Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: always fails with -EINVAL. */
int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}
2802
/* Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: always fails with -EINVAL. */
int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}
2808
/* Stub used when KVM_CAP_SET_GUEST_DEBUG is not defined: nothing to remove. */
void kvm_remove_all_breakpoints(CPUState *cpu)
{
}
2812 #endif /* !KVM_CAP_SET_GUEST_DEBUG */
2813
/*
 * Hand @sigset to KVM with KVM_SET_SIGNAL_MASK for @cpu.
 *
 * The request is built in a temporary heap buffer; its len field comes
 * from the sigmask_len stored in the global KVM state.  Returns the result
 * of the vCPU ioctl.
 */
static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
{
    struct kvm_signal_mask *mask = g_malloc(sizeof(*mask) + sizeof(*sigset));
    int rc;

    mask->len = kvm_state->sigmask_len;
    memcpy(mask->sigset, sigset, sizeof(*sigset));

    rc = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, mask);
    g_free(mask);

    return rc;
}
2829
kvm_ipi_signal(int sig)2830 static void kvm_ipi_signal(int sig)
2831 {
2832 if (current_cpu) {
2833 assert(kvm_immediate_exit);
2834 kvm_cpu_kick(current_cpu);
2835 }
2836 }
2837
/*
 * Set up signal handling for a vCPU thread.
 *
 * Installs kvm_ipi_signal() as the SIG_IPI handler and builds a mask from
 * the thread's current one with SIG_IPI removed (and, with
 * KVM_HAVE_MCE_INJECTION, SIGBUS unblocked for the thread).  The mask is
 * applied to the thread itself when kvm_immediate_exit is set, otherwise it
 * is handed to KVM through kvm_set_signal_mask().
 *
 * Exits the process if applying the mask fails.
 */
void kvm_init_cpu_signals(CPUState *cpu)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = kvm_ipi_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    /* A NULL new set only fetches the thread's current mask into 'set'. */
    pthread_sigmask(SIG_BLOCK, NULL, &set);
#if defined KVM_HAVE_MCE_INJECTION
    sigdelset(&set, SIGBUS);
    pthread_sigmask(SIG_SETMASK, &set, NULL);
#endif
    sigdelset(&set, SIG_IPI);
    if (kvm_immediate_exit) {
        /*
         * pthread_sigmask() reports failure as a positive error number,
         * whereas kvm_set_signal_mask() returns a negative errno.  Negate
         * it so that strerror(-r) below decodes both correctly.
         */
        r = -pthread_sigmask(SIG_SETMASK, &set, NULL);
    } else {
        r = kvm_set_signal_mask(cpu, &set);
    }
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}
2864
/* Called asynchronously in VCPU thread.
 *
 * Records one SIGBUS (@code, @addr) in the thread-local pending slot and
 * sets cpu->exit_request; kvm_cpu_exec() picks the record up after KVM_RUN
 * returns.
 *
 * Returns 0 when the signal was recorded, and 1 when a SIGBUS is already
 * pending on this thread or KVM_HAVE_MCE_INJECTION is not defined.
 */
int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
{
#ifdef KVM_HAVE_MCE_INJECTION
    /* Only one SIGBUS can be queued per thread; refuse a second one. */
    if (have_sigbus_pending) {
        return 1;
    }
    have_sigbus_pending = true;
    pending_sigbus_addr = addr;
    pending_sigbus_code = code;
    atomic_set(&cpu->exit_request, 1);
    return 0;
#else
    return 1;
#endif
}
2881
/* Called synchronously (via signalfd) in main thread.
 *
 * Forwards the SIGBUS to the architecture handler on behalf of first_cpu.
 * Returns 0 when handled, or 1 when KVM_HAVE_MCE_INJECTION is not defined.
 */
int kvm_on_sigbus(int code, void *addr)
{
#ifdef KVM_HAVE_MCE_INJECTION
    /* Action required MCE kills the process if SIGBUS is blocked.  Because
     * that's what happens in the I/O thread, where we handle MCE via signalfd,
     * we can only get action optional here.
     */
    assert(code != BUS_MCEERR_AR);
    kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
    return 0;
#else
    return 1;
#endif
}
2897
/*
 * Create an in-kernel device of @type, or only probe for it when @test is
 * true.
 *
 * Returns -ENOTSUP when KVM_CAP_DEVICE_CTRL is unavailable, the non-zero
 * result of KVM_CREATE_DEVICE on failure, 0 for a successful probe, and the
 * new device file descriptor otherwise.
 */
int kvm_create_device(KVMState *s, uint64_t type, bool test)
{
    struct kvm_create_device create_dev = {
        .type = type,
        .fd = -1,
        .flags = test ? KVM_CREATE_DEVICE_TEST : 0,
    };
    int ret;

    if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
        return -ENOTSUP;
    }

    ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
    if (ret) {
        return ret;
    }

    if (test) {
        return 0;
    }
    return create_dev.fd;
}
2918
/*
 * Probe, directly on the VM file descriptor @vmfd, whether a device of
 * @type can be created.
 *
 * Returns false when KVM_CAP_DEVICE_CTRL is not reported or the test-mode
 * KVM_CREATE_DEVICE call fails, true otherwise.
 */
bool kvm_device_supported(int vmfd, uint64_t type)
{
    struct kvm_create_device probe = {
        .type = type,
        .fd = -1,
        .flags = KVM_CREATE_DEVICE_TEST,
    };
    int rc;

    rc = ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL);
    if (rc <= 0) {
        return false;
    }

    rc = ioctl(vmfd, KVM_CREATE_DEVICE, &probe);
    return rc >= 0;
}
2933
/*
 * Write one register of @cs through KVM_SET_ONE_REG.
 *
 * @id:     KVM register identifier
 * @source: buffer holding the value to write; its address is passed to KVM
 *
 * Returns the result of the vCPU ioctl; a non-zero result is also reported
 * through the kvm_failed_reg_set trace point.
 */
int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
{
    struct kvm_one_reg one_reg;
    int r;

    one_reg.id = id;
    one_reg.addr = (uintptr_t) source;
    /* The ioctl takes the address of the request structure. */
    r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &one_reg);
    if (r) {
        trace_kvm_failed_reg_set(id, strerror(-r));
    }
    return r;
}
2947
/*
 * Read one register of @cs through KVM_GET_ONE_REG.
 *
 * @id:     KVM register identifier
 * @target: buffer that receives the value; its address is passed to KVM
 *
 * Returns the result of the vCPU ioctl; a non-zero result is also reported
 * through the kvm_failed_reg_get trace point.
 */
int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
{
    struct kvm_one_reg one_reg;
    int r;

    one_reg.id = id;
    one_reg.addr = (uintptr_t) target;
    /* The ioctl takes the address of the request structure. */
    r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &one_reg);
    if (r) {
        trace_kvm_failed_reg_get(id, strerror(-r));
    }
    return r;
}
2961
/*
 * AccelClass has_memory hook.
 *
 * Finds the first registered address space equal to @as that has a memory
 * listener and reports whether a slot matches @start_addr with @size
 * (capped to kvm_max_slot_size).  Returns false when @as is not registered.
 */
static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as,
                                 hwaddr start_addr, hwaddr size)
{
    KVMState *kvm = KVM_STATE(ms->accelerator);
    int idx;

    for (idx = 0; idx < kvm->nr_as; ++idx) {
        if (kvm->as[idx].as != as || !kvm->as[idx].ml) {
            continue;
        }

        size = MIN(kvm_max_slot_size, size);
        return kvm_lookup_matching_slot(kvm->as[idx].ml,
                                        start_addr, size) != NULL;
    }

    return false;
}
2978
kvm_get_kvm_shadow_mem(Object * obj,Visitor * v,const char * name,void * opaque,Error ** errp)2979 static void kvm_get_kvm_shadow_mem(Object *obj, Visitor *v,
2980 const char *name, void *opaque,
2981 Error **errp)
2982 {
2983 KVMState *s = KVM_STATE(obj);
2984 int64_t value = s->kvm_shadow_mem;
2985
2986 visit_type_int(v, name, &value, errp);
2987 }
2988
kvm_set_kvm_shadow_mem(Object * obj,Visitor * v,const char * name,void * opaque,Error ** errp)2989 static void kvm_set_kvm_shadow_mem(Object *obj, Visitor *v,
2990 const char *name, void *opaque,
2991 Error **errp)
2992 {
2993 KVMState *s = KVM_STATE(obj);
2994 Error *error = NULL;
2995 int64_t value;
2996
2997 visit_type_int(v, name, &value, &error);
2998 if (error) {
2999 error_propagate(errp, error);
3000 return;
3001 }
3002
3003 s->kvm_shadow_mem = value;
3004 }
3005
kvm_set_kernel_irqchip(Object * obj,Visitor * v,const char * name,void * opaque,Error ** errp)3006 static void kvm_set_kernel_irqchip(Object *obj, Visitor *v,
3007 const char *name, void *opaque,
3008 Error **errp)
3009 {
3010 Error *err = NULL;
3011 KVMState *s = KVM_STATE(obj);
3012 OnOffSplit mode;
3013
3014 visit_type_OnOffSplit(v, name, &mode, &err);
3015 if (err) {
3016 error_propagate(errp, err);
3017 return;
3018 } else {
3019 switch (mode) {
3020 case ON_OFF_SPLIT_ON:
3021 s->kernel_irqchip_allowed = true;
3022 s->kernel_irqchip_required = true;
3023 s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
3024 break;
3025 case ON_OFF_SPLIT_OFF:
3026 s->kernel_irqchip_allowed = false;
3027 s->kernel_irqchip_required = false;
3028 s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
3029 break;
3030 case ON_OFF_SPLIT_SPLIT:
3031 s->kernel_irqchip_allowed = true;
3032 s->kernel_irqchip_required = true;
3033 s->kernel_irqchip_split = ON_OFF_AUTO_ON;
3034 break;
3035 default:
3036 /* The value was checked in visit_type_OnOffSplit() above. If
3037 * we get here, then something is wrong in QEMU.
3038 */
3039 abort();
3040 }
3041 }
3042 }
3043
kvm_kernel_irqchip_allowed(void)3044 bool kvm_kernel_irqchip_allowed(void)
3045 {
3046 return kvm_state->kernel_irqchip_allowed;
3047 }
3048
kvm_kernel_irqchip_required(void)3049 bool kvm_kernel_irqchip_required(void)
3050 {
3051 return kvm_state->kernel_irqchip_required;
3052 }
3053
kvm_kernel_irqchip_split(void)3054 bool kvm_kernel_irqchip_split(void)
3055 {
3056 return kvm_state->kernel_irqchip_split == ON_OFF_AUTO_ON;
3057 }
3058
/*
 * Instance initialiser: set the defaults for the properties registered in
 * kvm_accel_class_init().
 */
static void kvm_accel_instance_init(Object *obj)
{
    KVMState *s = KVM_STATE(obj);

    /* In-kernel irqchip allowed by default; split mode is resolved in kvm_init(). */
    s->kernel_irqchip_allowed = true;
    s->kernel_irqchip_split = ON_OFF_AUTO_AUTO;

    s->kvm_shadow_mem = -1;
}
3067
/*
 * Class initialiser for the KVM accelerator type: fills in the AccelClass
 * hooks and registers the "kernel-irqchip" and "kvm-shadow-mem" properties.
 */
static void kvm_accel_class_init(ObjectClass *oc, void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);
    ac->name = "KVM";
    ac->init_machine = kvm_init;
    ac->has_memory = kvm_accel_has_memory;
    ac->allowed = &kvm_allowed;

    /* "kernel-irqchip" is registered with a setter only (getter is NULL). */
    object_class_property_add(oc, "kernel-irqchip", "on|off|split",
        NULL, kvm_set_kernel_irqchip,
        NULL, NULL, &error_abort);
    object_class_property_set_description(oc, "kernel-irqchip",
        "Configure KVM in-kernel irqchip", &error_abort);

    object_class_property_add(oc, "kvm-shadow-mem", "int",
        kvm_get_kvm_shadow_mem, kvm_set_kvm_shadow_mem,
        NULL, NULL, &error_abort);
    object_class_property_set_description(oc, "kvm-shadow-mem",
        "KVM shadow MMU size", &error_abort);
}
3088
/* QOM type description of the KVM accelerator; instances are KVMState objects. */
static const TypeInfo kvm_accel_type = {
    .name = TYPE_KVM_ACCEL,
    .parent = TYPE_ACCEL,
    .instance_init = kvm_accel_instance_init,
    .class_init = kvm_accel_class_init,
    .instance_size = sizeof(KVMState),
};
3096
/* Register the KVM accelerator type; hooked up through type_init() below. */
static void kvm_type_init(void)
{
    type_register_static(&kvm_accel_type);
}

type_init(kvm_type_init);
3103