xref: /qemu/target/i386/kvm/kvm.c (revision 02326733)
1 /*
2  * QEMU KVM support
3  *
4  * Copyright (C) 2006-2008 Qumranet Technologies
5  * Copyright IBM, Corp. 2008
6  *
7  * Authors:
8  *  Anthony Liguori   <aliguori@us.ibm.com>
9  *
10  * This work is licensed under the terms of the GNU GPL, version 2 or later.
11  * See the COPYING file in the top-level directory.
12  *
13  */
14 
15 #include "qemu/osdep.h"
16 #include "qapi/qapi-events-run-state.h"
17 #include "qapi/error.h"
18 #include "qapi/visitor.h"
19 #include <sys/ioctl.h>
20 #include <sys/utsname.h>
21 #include <sys/syscall.h>
22 
23 #include <linux/kvm.h>
24 #include "standard-headers/asm-x86/kvm_para.h"
25 #include "hw/xen/interface/arch-x86/cpuid.h"
26 
27 #include "cpu.h"
28 #include "host-cpu.h"
29 #include "sysemu/sysemu.h"
30 #include "sysemu/hw_accel.h"
31 #include "sysemu/kvm_int.h"
32 #include "sysemu/runstate.h"
33 #include "kvm_i386.h"
34 #include "../confidential-guest.h"
35 #include "sev.h"
36 #include "xen-emu.h"
37 #include "hyperv.h"
38 #include "hyperv-proto.h"
39 
40 #include "exec/gdbstub.h"
41 #include "qemu/host-utils.h"
42 #include "qemu/main-loop.h"
43 #include "qemu/ratelimit.h"
44 #include "qemu/config-file.h"
45 #include "qemu/error-report.h"
46 #include "qemu/memalign.h"
47 #include "hw/i386/x86.h"
48 #include "hw/i386/kvm/xen_evtchn.h"
49 #include "hw/i386/pc.h"
50 #include "hw/i386/apic.h"
51 #include "hw/i386/apic_internal.h"
52 #include "hw/i386/apic-msidef.h"
53 #include "hw/i386/intel_iommu.h"
54 #include "hw/i386/x86-iommu.h"
55 #include "hw/i386/e820_memory_layout.h"
56 
57 #include "hw/xen/xen.h"
58 
59 #include "hw/pci/pci.h"
60 #include "hw/pci/msi.h"
61 #include "hw/pci/msix.h"
62 #include "migration/blocker.h"
63 #include "exec/memattrs.h"
64 #include "trace.h"
65 
66 #include CONFIG_DEVICES
67 
68 //#define DEBUG_KVM
69 
70 #ifdef DEBUG_KVM
71 #define DPRINTF(fmt, ...) \
72     do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
73 #else
74 #define DPRINTF(fmt, ...) \
75     do { } while (0)
76 #endif
77 
78 /* From arch/x86/kvm/lapic.h */
79 #define KVM_APIC_BUS_CYCLE_NS       1
80 #define KVM_APIC_BUS_FREQUENCY      (1000000000ULL / KVM_APIC_BUS_CYCLE_NS)
81 
82 #define MSR_KVM_WALL_CLOCK  0x11
83 #define MSR_KVM_SYSTEM_TIME 0x12
84 
85 /* A 4096-byte buffer can hold the 8-byte kvm_msrs header, plus
86  * 255 kvm_msr_entry structs */
87 #define MSR_BUF_SIZE 4096
88 
89 static void kvm_init_msrs(X86CPU *cpu);
90 
91 const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
92     KVM_CAP_INFO(SET_TSS_ADDR),
93     KVM_CAP_INFO(EXT_CPUID),
94     KVM_CAP_INFO(MP_STATE),
95     KVM_CAP_INFO(SIGNAL_MSI),
96     KVM_CAP_INFO(IRQ_ROUTING),
97     KVM_CAP_INFO(DEBUGREGS),
98     KVM_CAP_INFO(XSAVE),
99     KVM_CAP_INFO(VCPU_EVENTS),
100     KVM_CAP_INFO(X86_ROBUST_SINGLESTEP),
101     KVM_CAP_INFO(MCE),
102     KVM_CAP_INFO(ADJUST_CLOCK),
103     KVM_CAP_INFO(SET_IDENTITY_MAP_ADDR),
104     KVM_CAP_LAST_INFO
105 };
106 
107 static bool has_msr_star;
108 static bool has_msr_hsave_pa;
109 static bool has_msr_tsc_aux;
110 static bool has_msr_tsc_adjust;
111 static bool has_msr_tsc_deadline;
112 static bool has_msr_feature_control;
113 static bool has_msr_misc_enable;
114 static bool has_msr_smbase;
115 static bool has_msr_bndcfgs;
116 static int lm_capable_kernel;
117 static bool has_msr_hv_hypercall;
118 static bool has_msr_hv_crash;
119 static bool has_msr_hv_reset;
120 static bool has_msr_hv_vpindex;
121 static bool hv_vpindex_settable;
122 static bool has_msr_hv_runtime;
123 static bool has_msr_hv_synic;
124 static bool has_msr_hv_stimer;
125 static bool has_msr_hv_frequencies;
126 static bool has_msr_hv_reenlightenment;
127 static bool has_msr_hv_syndbg_options;
128 static bool has_msr_xss;
129 static bool has_msr_umwait;
130 static bool has_msr_spec_ctrl;
131 static bool has_tsc_scale_msr;
132 static bool has_msr_tsx_ctrl;
133 static bool has_msr_virt_ssbd;
134 static bool has_msr_smi_count;
135 static bool has_msr_arch_capabs;
136 static bool has_msr_core_capabs;
137 static bool has_msr_vmx_vmfunc;
138 static bool has_msr_ucode_rev;
139 static bool has_msr_vmx_procbased_ctls2;
140 static bool has_msr_perf_capabs;
141 static bool has_msr_pkrs;
142 
143 static uint32_t has_architectural_pmu_version;
144 static uint32_t num_architectural_pmu_gp_counters;
145 static uint32_t num_architectural_pmu_fixed_counters;
146 
147 static int has_xsave2;
148 static int has_xcrs;
149 static int has_sregs2;
150 static int has_exception_payload;
151 static int has_triple_fault_event;
152 
153 static bool has_msr_mcg_ext_ctl;
154 
155 static struct kvm_cpuid2 *cpuid_cache;
156 static struct kvm_cpuid2 *hv_cpuid_cache;
157 static struct kvm_msr_list *kvm_feature_msrs;
158 
159 static KVMMSRHandlers msr_handlers[KVM_MSR_FILTER_MAX_RANGES];
160 
161 #define BUS_LOCK_SLICE_TIME 1000000000ULL /* ns */
162 static RateLimit bus_lock_ratelimit_ctrl;
163 static int kvm_get_one_msr(X86CPU *cpu, int index, uint64_t *value);
164 
165 static const char *vm_type_name[] = {
166     [KVM_X86_DEFAULT_VM] = "default",
167     [KVM_X86_SEV_VM] = "SEV",
168     [KVM_X86_SEV_ES_VM] = "SEV-ES",
169 };
170 
171 bool kvm_is_vm_type_supported(int type)
172 {
173     uint32_t machine_types;
174 
175     /*
176      * old KVM doesn't support KVM_CAP_VM_TYPES but KVM_X86_DEFAULT_VM
177      * is always supported
178      */
179     if (type == KVM_X86_DEFAULT_VM) {
180         return true;
181     }
182 
183     machine_types = kvm_check_extension(KVM_STATE(current_machine->accelerator),
184                                         KVM_CAP_VM_TYPES);
185     return !!(machine_types & BIT(type));
186 }
187 
188 int kvm_get_vm_type(MachineState *ms)
189 {
190     int kvm_type = KVM_X86_DEFAULT_VM;
191 
192     if (ms->cgs) {
193         if (!object_dynamic_cast(OBJECT(ms->cgs), TYPE_X86_CONFIDENTIAL_GUEST)) {
194             error_report("configuration type %s not supported for x86 guests",
195                          object_get_typename(OBJECT(ms->cgs)));
196             exit(1);
197         }
198         kvm_type = x86_confidential_guest_kvm_type(
199             X86_CONFIDENTIAL_GUEST(ms->cgs));
200     }
201 
202     if (!kvm_is_vm_type_supported(kvm_type)) {
203         error_report("vm-type %s not supported by KVM", vm_type_name[kvm_type]);
204         exit(1);
205     }
206 
207     return kvm_type;
208 }
209 
210 bool kvm_has_smm(void)
211 {
212     return kvm_vm_check_extension(kvm_state, KVM_CAP_X86_SMM);
213 }
214 
215 bool kvm_has_adjust_clock_stable(void)
216 {
217     int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
218 
219     return (ret & KVM_CLOCK_TSC_STABLE);
220 }
221 
222 bool kvm_has_exception_payload(void)
223 {
224     return has_exception_payload;
225 }
226 
227 static bool kvm_x2apic_api_set_flags(uint64_t flags)
228 {
229     KVMState *s = KVM_STATE(current_accel());
230 
231     return !kvm_vm_enable_cap(s, KVM_CAP_X2APIC_API, 0, flags);
232 }
233 
234 #define MEMORIZE(fn, _result) \
235     ({ \
236         static bool _memorized; \
237         \
238         if (_memorized) { \
239             return _result; \
240         } \
241         _memorized = true; \
242         _result = fn; \
243     })
244 
245 static bool has_x2apic_api;
246 
247 bool kvm_has_x2apic_api(void)
248 {
249     return has_x2apic_api;
250 }
251 
252 bool kvm_enable_x2apic(void)
253 {
254     return MEMORIZE(
255              kvm_x2apic_api_set_flags(KVM_X2APIC_API_USE_32BIT_IDS |
256                                       KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK),
257              has_x2apic_api);
258 }
259 
260 bool kvm_hv_vpindex_settable(void)
261 {
262     return hv_vpindex_settable;
263 }
264 
265 static int kvm_get_tsc(CPUState *cs)
266 {
267     X86CPU *cpu = X86_CPU(cs);
268     CPUX86State *env = &cpu->env;
269     uint64_t value;
270     int ret;
271 
272     if (env->tsc_valid) {
273         return 0;
274     }
275 
276     env->tsc_valid = !runstate_is_running();
277 
278     ret = kvm_get_one_msr(cpu, MSR_IA32_TSC, &value);
279     if (ret < 0) {
280         return ret;
281     }
282 
283     env->tsc = value;
284     return 0;
285 }
286 
287 static inline void do_kvm_synchronize_tsc(CPUState *cpu, run_on_cpu_data arg)
288 {
289     kvm_get_tsc(cpu);
290 }
291 
292 void kvm_synchronize_all_tsc(void)
293 {
294     CPUState *cpu;
295 
296     if (kvm_enabled()) {
297         CPU_FOREACH(cpu) {
298             run_on_cpu(cpu, do_kvm_synchronize_tsc, RUN_ON_CPU_NULL);
299         }
300     }
301 }
302 
303 static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
304 {
305     struct kvm_cpuid2 *cpuid;
306     int r, size;
307 
308     size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
309     cpuid = g_malloc0(size);
310     cpuid->nent = max;
311     r = kvm_ioctl(s, KVM_GET_SUPPORTED_CPUID, cpuid);
312     if (r == 0 && cpuid->nent >= max) {
313         r = -E2BIG;
314     }
315     if (r < 0) {
316         if (r == -E2BIG) {
317             g_free(cpuid);
318             return NULL;
319         } else {
320             fprintf(stderr, "KVM_GET_SUPPORTED_CPUID failed: %s\n",
321                     strerror(-r));
322             exit(1);
323         }
324     }
325     return cpuid;
326 }
327 
328 /* Run KVM_GET_SUPPORTED_CPUID ioctl(), allocating a buffer large enough
329  * for all entries.
330  */
331 static struct kvm_cpuid2 *get_supported_cpuid(KVMState *s)
332 {
333     struct kvm_cpuid2 *cpuid;
334     int max = 1;
335 
336     if (cpuid_cache != NULL) {
337         return cpuid_cache;
338     }
339     while ((cpuid = try_get_cpuid(s, max)) == NULL) {
340         max *= 2;
341     }
342     cpuid_cache = cpuid;
343     return cpuid;
344 }
345 
346 static bool host_tsx_broken(void)
347 {
348     int family, model, stepping;\
349     char vendor[CPUID_VENDOR_SZ + 1];
350 
351     host_cpu_vendor_fms(vendor, &family, &model, &stepping);
352 
353     /* Check if we are running on a Haswell host known to have broken TSX */
354     return !strcmp(vendor, CPUID_VENDOR_INTEL) &&
355            (family == 6) &&
356            ((model == 63 && stepping < 4) ||
357             model == 60 || model == 69 || model == 70);
358 }
359 
360 /* Returns the value for a specific register on the cpuid entry
361  */
362 static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
363 {
364     uint32_t ret = 0;
365     switch (reg) {
366     case R_EAX:
367         ret = entry->eax;
368         break;
369     case R_EBX:
370         ret = entry->ebx;
371         break;
372     case R_ECX:
373         ret = entry->ecx;
374         break;
375     case R_EDX:
376         ret = entry->edx;
377         break;
378     }
379     return ret;
380 }
381 
382 /* Find matching entry for function/index on kvm_cpuid2 struct
383  */
384 static struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid,
385                                                  uint32_t function,
386                                                  uint32_t index)
387 {
388     int i;
389     for (i = 0; i < cpuid->nent; ++i) {
390         if (cpuid->entries[i].function == function &&
391             cpuid->entries[i].index == index) {
392             return &cpuid->entries[i];
393         }
394     }
395     /* not found: */
396     return NULL;
397 }
398 
399 uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
400                                       uint32_t index, int reg)
401 {
402     struct kvm_cpuid2 *cpuid;
403     uint32_t ret = 0;
404     uint32_t cpuid_1_edx, unused;
405     uint64_t bitmask;
406 
407     cpuid = get_supported_cpuid(s);
408 
409     struct kvm_cpuid_entry2 *entry = cpuid_find_entry(cpuid, function, index);
410     if (entry) {
411         ret = cpuid_entry_get_reg(entry, reg);
412     }
413 
414     /* Fixups for the data returned by KVM, below */
415 
416     if (function == 1 && reg == R_EDX) {
417         /* KVM before 2.6.30 misreports the following features */
418         ret |= CPUID_MTRR | CPUID_PAT | CPUID_MCE | CPUID_MCA;
419         /* KVM never reports CPUID_HT but QEMU can support when vcpus > 1 */
420         ret |= CPUID_HT;
421     } else if (function == 1 && reg == R_ECX) {
422         /* We can set the hypervisor flag, even if KVM does not return it on
423          * GET_SUPPORTED_CPUID
424          */
425         ret |= CPUID_EXT_HYPERVISOR;
426         /* tsc-deadline flag is not returned by GET_SUPPORTED_CPUID, but it
427          * can be enabled if the kernel has KVM_CAP_TSC_DEADLINE_TIMER,
428          * and the irqchip is in the kernel.
429          */
430         if (kvm_irqchip_in_kernel() &&
431                 kvm_check_extension(s, KVM_CAP_TSC_DEADLINE_TIMER)) {
432             ret |= CPUID_EXT_TSC_DEADLINE_TIMER;
433         }
434 
435         /* x2apic is reported by GET_SUPPORTED_CPUID, but it can't be enabled
436          * without the in-kernel irqchip
437          */
438         if (!kvm_irqchip_in_kernel()) {
439             ret &= ~CPUID_EXT_X2APIC;
440         }
441 
442         if (enable_cpu_pm) {
443             int disable_exits = kvm_check_extension(s,
444                                                     KVM_CAP_X86_DISABLE_EXITS);
445 
446             if (disable_exits & KVM_X86_DISABLE_EXITS_MWAIT) {
447                 ret |= CPUID_EXT_MONITOR;
448             }
449         }
450     } else if (function == 6 && reg == R_EAX) {
451         ret |= CPUID_6_EAX_ARAT; /* safe to allow because of emulated APIC */
452     } else if (function == 7 && index == 0 && reg == R_EBX) {
453         /* Not new instructions, just an optimization.  */
454         uint32_t ebx;
455         host_cpuid(7, 0, &unused, &ebx, &unused, &unused);
456         ret |= ebx & CPUID_7_0_EBX_ERMS;
457 
458         if (host_tsx_broken()) {
459             ret &= ~(CPUID_7_0_EBX_RTM | CPUID_7_0_EBX_HLE);
460         }
461     } else if (function == 7 && index == 0 && reg == R_EDX) {
462         /* Not new instructions, just an optimization.  */
463         uint32_t edx;
464         host_cpuid(7, 0, &unused, &unused, &unused, &edx);
465         ret |= edx & CPUID_7_0_EDX_FSRM;
466 
467         /*
468          * Linux v4.17-v4.20 incorrectly return ARCH_CAPABILITIES on SVM hosts.
469          * We can detect the bug by checking if MSR_IA32_ARCH_CAPABILITIES is
470          * returned by KVM_GET_MSR_INDEX_LIST.
471          */
472         if (!has_msr_arch_capabs) {
473             ret &= ~CPUID_7_0_EDX_ARCH_CAPABILITIES;
474         }
475     } else if (function == 7 && index == 1 && reg == R_EAX) {
476         /* Not new instructions, just an optimization.  */
477         uint32_t eax;
478         host_cpuid(7, 1, &eax, &unused, &unused, &unused);
479         ret |= eax & (CPUID_7_1_EAX_FZRM | CPUID_7_1_EAX_FSRS | CPUID_7_1_EAX_FSRC);
480     } else if (function == 7 && index == 2 && reg == R_EDX) {
481         uint32_t edx;
482         host_cpuid(7, 2, &unused, &unused, &unused, &edx);
483         ret |= edx & CPUID_7_2_EDX_MCDT_NO;
484     } else if (function == 0xd && index == 0 &&
485                (reg == R_EAX || reg == R_EDX)) {
486         /*
487          * The value returned by KVM_GET_SUPPORTED_CPUID does not include
488          * features that still have to be enabled with the arch_prctl
489          * system call.  QEMU needs the full value, which is retrieved
490          * with KVM_GET_DEVICE_ATTR.
491          */
492         struct kvm_device_attr attr = {
493             .group = 0,
494             .attr = KVM_X86_XCOMP_GUEST_SUPP,
495             .addr = (unsigned long) &bitmask
496         };
497 
498         bool sys_attr = kvm_check_extension(s, KVM_CAP_SYS_ATTRIBUTES);
499         if (!sys_attr) {
500             return ret;
501         }
502 
503         int rc = kvm_ioctl(s, KVM_GET_DEVICE_ATTR, &attr);
504         if (rc < 0) {
505             if (rc != -ENXIO) {
506                 warn_report("KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) "
507                             "error: %d", rc);
508             }
509             return ret;
510         }
511         ret = (reg == R_EAX) ? bitmask : bitmask >> 32;
512     } else if (function == 0x80000001 && reg == R_ECX) {
513         /*
514          * It's safe to enable TOPOEXT even if it's not returned by
515          * GET_SUPPORTED_CPUID.  Unconditionally enabling TOPOEXT here allows
516          * us to keep CPU models including TOPOEXT runnable on older kernels.
517          */
518         ret |= CPUID_EXT3_TOPOEXT;
519     } else if (function == 0x80000001 && reg == R_EDX) {
520         /* On Intel, kvm returns cpuid according to the Intel spec,
521          * so add missing bits according to the AMD spec:
522          */
523         cpuid_1_edx = kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX);
524         ret |= cpuid_1_edx & CPUID_EXT2_AMD_ALIASES;
525     } else if (function == KVM_CPUID_FEATURES && reg == R_EAX) {
526         /* kvm_pv_unhalt is reported by GET_SUPPORTED_CPUID, but it can't
527          * be enabled without the in-kernel irqchip
528          */
529         if (!kvm_irqchip_in_kernel()) {
530             ret &= ~(1U << KVM_FEATURE_PV_UNHALT);
531         }
532         if (kvm_irqchip_is_split()) {
533             ret |= 1U << KVM_FEATURE_MSI_EXT_DEST_ID;
534         }
535     } else if (function == KVM_CPUID_FEATURES && reg == R_EDX) {
536         ret |= 1U << KVM_HINTS_REALTIME;
537     }
538 
539     return ret;
540 }
541 
542 uint64_t kvm_arch_get_supported_msr_feature(KVMState *s, uint32_t index)
543 {
544     struct {
545         struct kvm_msrs info;
546         struct kvm_msr_entry entries[1];
547     } msr_data = {};
548     uint64_t value;
549     uint32_t ret, can_be_one, must_be_one;
550 
551     if (kvm_feature_msrs == NULL) { /* Host doesn't support feature MSRs */
552         return 0;
553     }
554 
555     /* Check if requested MSR is supported feature MSR */
556     int i;
557     for (i = 0; i < kvm_feature_msrs->nmsrs; i++)
558         if (kvm_feature_msrs->indices[i] == index) {
559             break;
560         }
561     if (i == kvm_feature_msrs->nmsrs) {
562         return 0; /* if the feature MSR is not supported, simply return 0 */
563     }
564 
565     msr_data.info.nmsrs = 1;
566     msr_data.entries[0].index = index;
567 
568     ret = kvm_ioctl(s, KVM_GET_MSRS, &msr_data);
569     if (ret != 1) {
570         error_report("KVM get MSR (index=0x%x) feature failed, %s",
571             index, strerror(-ret));
572         exit(1);
573     }
574 
575     value = msr_data.entries[0].data;
576     switch (index) {
577     case MSR_IA32_VMX_PROCBASED_CTLS2:
578         if (!has_msr_vmx_procbased_ctls2) {
579             /* KVM forgot to add these bits for some time, do this ourselves. */
580             if (kvm_arch_get_supported_cpuid(s, 0xD, 1, R_ECX) &
581                 CPUID_XSAVE_XSAVES) {
582                 value |= (uint64_t)VMX_SECONDARY_EXEC_XSAVES << 32;
583             }
584             if (kvm_arch_get_supported_cpuid(s, 1, 0, R_ECX) &
585                 CPUID_EXT_RDRAND) {
586                 value |= (uint64_t)VMX_SECONDARY_EXEC_RDRAND_EXITING << 32;
587             }
588             if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) &
589                 CPUID_7_0_EBX_INVPCID) {
590                 value |= (uint64_t)VMX_SECONDARY_EXEC_ENABLE_INVPCID << 32;
591             }
592             if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) &
593                 CPUID_7_0_EBX_RDSEED) {
594                 value |= (uint64_t)VMX_SECONDARY_EXEC_RDSEED_EXITING << 32;
595             }
596             if (kvm_arch_get_supported_cpuid(s, 0x80000001, 0, R_EDX) &
597                 CPUID_EXT2_RDTSCP) {
598                 value |= (uint64_t)VMX_SECONDARY_EXEC_RDTSCP << 32;
599             }
600         }
601         /* fall through */
602     case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
603     case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
604     case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
605     case MSR_IA32_VMX_TRUE_EXIT_CTLS:
606         /*
607          * Return true for bits that can be one, but do not have to be one.
608          * The SDM tells us which bits could have a "must be one" setting,
609          * so we can do the opposite transformation in make_vmx_msr_value.
610          */
611         must_be_one = (uint32_t)value;
612         can_be_one = (uint32_t)(value >> 32);
613         return can_be_one & ~must_be_one;
614 
615     default:
616         return value;
617     }
618 }
619 
620 static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
621                                      int *max_banks)
622 {
623     *max_banks = kvm_check_extension(s, KVM_CAP_MCE);
624     return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
625 }
626 
627 static void kvm_mce_inject(X86CPU *cpu, hwaddr paddr, int code)
628 {
629     CPUState *cs = CPU(cpu);
630     CPUX86State *env = &cpu->env;
631     uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
632                       MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S;
633     uint64_t mcg_status = MCG_STATUS_MCIP;
634     int flags = 0;
635 
636     if (code == BUS_MCEERR_AR) {
637         status |= MCI_STATUS_AR | 0x134;
638         mcg_status |= MCG_STATUS_RIPV | MCG_STATUS_EIPV;
639     } else {
640         status |= 0xc0;
641         mcg_status |= MCG_STATUS_RIPV;
642     }
643 
644     flags = cpu_x86_support_mca_broadcast(env) ? MCE_INJECT_BROADCAST : 0;
645     /* We need to read back the value of MSR_EXT_MCG_CTL that was set by the
646      * guest kernel back into env->mcg_ext_ctl.
647      */
648     cpu_synchronize_state(cs);
649     if (env->mcg_ext_ctl & MCG_EXT_CTL_LMCE_EN) {
650         mcg_status |= MCG_STATUS_LMCE;
651         flags = 0;
652     }
653 
654     cpu_x86_inject_mce(NULL, cpu, 9, status, mcg_status, paddr,
655                        (MCM_ADDR_PHYS << 6) | 0xc, flags);
656 }
657 
658 static void emit_hypervisor_memory_failure(MemoryFailureAction action, bool ar)
659 {
660     MemoryFailureFlags mff = {.action_required = ar, .recursive = false};
661 
662     qapi_event_send_memory_failure(MEMORY_FAILURE_RECIPIENT_HYPERVISOR, action,
663                                    &mff);
664 }
665 
666 static void hardware_memory_error(void *host_addr)
667 {
668     emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_FATAL, true);
669     error_report("QEMU got Hardware memory error at addr %p", host_addr);
670     exit(1);
671 }
672 
673 void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
674 {
675     X86CPU *cpu = X86_CPU(c);
676     CPUX86State *env = &cpu->env;
677     ram_addr_t ram_addr;
678     hwaddr paddr;
679 
680     /* If we get an action required MCE, it has been injected by KVM
681      * while the VM was running.  An action optional MCE instead should
682      * be coming from the main thread, which qemu_init_sigbus identifies
683      * as the "early kill" thread.
684      */
685     assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
686 
687     if ((env->mcg_cap & MCG_SER_P) && addr) {
688         ram_addr = qemu_ram_addr_from_host(addr);
689         if (ram_addr != RAM_ADDR_INVALID &&
690             kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
691             kvm_hwpoison_page_add(ram_addr);
692             kvm_mce_inject(cpu, paddr, code);
693 
694             /*
695              * Use different logging severity based on error type.
696              * If there is additional MCE reporting on the hypervisor, QEMU VA
697              * could be another source to identify the PA and MCE details.
698              */
699             if (code == BUS_MCEERR_AR) {
700                 error_report("Guest MCE Memory Error at QEMU addr %p and "
701                     "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
702                     addr, paddr, "BUS_MCEERR_AR");
703             } else {
704                  warn_report("Guest MCE Memory Error at QEMU addr %p and "
705                      "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
706                      addr, paddr, "BUS_MCEERR_AO");
707             }
708 
709             return;
710         }
711 
712         if (code == BUS_MCEERR_AO) {
713             warn_report("Hardware memory error at addr %p of type %s "
714                 "for memory used by QEMU itself instead of guest system!",
715                  addr, "BUS_MCEERR_AO");
716         }
717     }
718 
719     if (code == BUS_MCEERR_AR) {
720         hardware_memory_error(addr);
721     }
722 
723     /* Hope we are lucky for AO MCE, just notify a event */
724     emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_IGNORE, false);
725 }
726 
727 static void kvm_queue_exception(CPUX86State *env,
728                                 int32_t exception_nr,
729                                 uint8_t exception_has_payload,
730                                 uint64_t exception_payload)
731 {
732     assert(env->exception_nr == -1);
733     assert(!env->exception_pending);
734     assert(!env->exception_injected);
735     assert(!env->exception_has_payload);
736 
737     env->exception_nr = exception_nr;
738 
739     if (has_exception_payload) {
740         env->exception_pending = 1;
741 
742         env->exception_has_payload = exception_has_payload;
743         env->exception_payload = exception_payload;
744     } else {
745         env->exception_injected = 1;
746 
747         if (exception_nr == EXCP01_DB) {
748             assert(exception_has_payload);
749             env->dr[6] = exception_payload;
750         } else if (exception_nr == EXCP0E_PAGE) {
751             assert(exception_has_payload);
752             env->cr[2] = exception_payload;
753         } else {
754             assert(!exception_has_payload);
755         }
756     }
757 }
758 
759 static void cpu_update_state(void *opaque, bool running, RunState state)
760 {
761     CPUX86State *env = opaque;
762 
763     if (running) {
764         env->tsc_valid = false;
765     }
766 }
767 
768 unsigned long kvm_arch_vcpu_id(CPUState *cs)
769 {
770     X86CPU *cpu = X86_CPU(cs);
771     return cpu->apic_id;
772 }
773 
774 #ifndef KVM_CPUID_SIGNATURE_NEXT
775 #define KVM_CPUID_SIGNATURE_NEXT                0x40000100
776 #endif
777 
778 static bool hyperv_enabled(X86CPU *cpu)
779 {
780     return kvm_check_extension(kvm_state, KVM_CAP_HYPERV) > 0 &&
781         ((cpu->hyperv_spinlock_attempts != HYPERV_SPINLOCK_NEVER_NOTIFY) ||
782          cpu->hyperv_features || cpu->hyperv_passthrough);
783 }
784 
785 /*
786  * Check whether target_freq is within conservative
787  * ntp correctable bounds (250ppm) of freq
788  */
789 static inline bool freq_within_bounds(int freq, int target_freq)
790 {
791         int max_freq = freq + (freq * 250 / 1000000);
792         int min_freq = freq - (freq * 250 / 1000000);
793 
794         if (target_freq >= min_freq && target_freq <= max_freq) {
795                 return true;
796         }
797 
798         return false;
799 }
800 
801 static int kvm_arch_set_tsc_khz(CPUState *cs)
802 {
803     X86CPU *cpu = X86_CPU(cs);
804     CPUX86State *env = &cpu->env;
805     int r, cur_freq;
806     bool set_ioctl = false;
807 
808     if (!env->tsc_khz) {
809         return 0;
810     }
811 
812     cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
813                kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : -ENOTSUP;
814 
815     /*
816      * If TSC scaling is supported, attempt to set TSC frequency.
817      */
818     if (kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL)) {
819         set_ioctl = true;
820     }
821 
822     /*
823      * If desired TSC frequency is within bounds of NTP correction,
824      * attempt to set TSC frequency.
825      */
826     if (cur_freq != -ENOTSUP && freq_within_bounds(cur_freq, env->tsc_khz)) {
827         set_ioctl = true;
828     }
829 
830     r = set_ioctl ?
831         kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) :
832         -ENOTSUP;
833 
834     if (r < 0) {
835         /* When KVM_SET_TSC_KHZ fails, it's an error only if the current
836          * TSC frequency doesn't match the one we want.
837          */
838         cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
839                    kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
840                    -ENOTSUP;
841         if (cur_freq <= 0 || cur_freq != env->tsc_khz) {
842             warn_report("TSC frequency mismatch between "
843                         "VM (%" PRId64 " kHz) and host (%d kHz), "
844                         "and TSC scaling unavailable",
845                         env->tsc_khz, cur_freq);
846             return r;
847         }
848     }
849 
850     return 0;
851 }
852 
853 static bool tsc_is_stable_and_known(CPUX86State *env)
854 {
855     if (!env->tsc_khz) {
856         return false;
857     }
858     return (env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC)
859         || env->user_tsc_khz;
860 }
861 
862 #define DEFAULT_EVMCS_VERSION ((1 << 8) | 1)
863 
864 static struct {
865     const char *desc;
866     struct {
867         uint32_t func;
868         int reg;
869         uint32_t bits;
870     } flags[2];
871     uint64_t dependencies;
872 } kvm_hyperv_properties[] = {
873     [HYPERV_FEAT_RELAXED] = {
874         .desc = "relaxed timing (hv-relaxed)",
875         .flags = {
876             {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
877              .bits = HV_RELAXED_TIMING_RECOMMENDED}
878         }
879     },
880     [HYPERV_FEAT_VAPIC] = {
881         .desc = "virtual APIC (hv-vapic)",
882         .flags = {
883             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
884              .bits = HV_APIC_ACCESS_AVAILABLE}
885         }
886     },
887     [HYPERV_FEAT_TIME] = {
888         .desc = "clocksources (hv-time)",
889         .flags = {
890             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
891              .bits = HV_TIME_REF_COUNT_AVAILABLE | HV_REFERENCE_TSC_AVAILABLE}
892         }
893     },
894     [HYPERV_FEAT_CRASH] = {
895         .desc = "crash MSRs (hv-crash)",
896         .flags = {
897             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
898              .bits = HV_GUEST_CRASH_MSR_AVAILABLE}
899         }
900     },
901     [HYPERV_FEAT_RESET] = {
902         .desc = "reset MSR (hv-reset)",
903         .flags = {
904             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
905              .bits = HV_RESET_AVAILABLE}
906         }
907     },
908     [HYPERV_FEAT_VPINDEX] = {
909         .desc = "VP_INDEX MSR (hv-vpindex)",
910         .flags = {
911             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
912              .bits = HV_VP_INDEX_AVAILABLE}
913         }
914     },
915     [HYPERV_FEAT_RUNTIME] = {
916         .desc = "VP_RUNTIME MSR (hv-runtime)",
917         .flags = {
918             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
919              .bits = HV_VP_RUNTIME_AVAILABLE}
920         }
921     },
922     [HYPERV_FEAT_SYNIC] = {
923         .desc = "synthetic interrupt controller (hv-synic)",
924         .flags = {
925             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
926              .bits = HV_SYNIC_AVAILABLE}
927         }
928     },
929     [HYPERV_FEAT_STIMER] = {
930         .desc = "synthetic timers (hv-stimer)",
931         .flags = {
932             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
933              .bits = HV_SYNTIMERS_AVAILABLE}
934         },
935         .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_TIME)
936     },
937     [HYPERV_FEAT_FREQUENCIES] = {
938         .desc = "frequency MSRs (hv-frequencies)",
939         .flags = {
940             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
941              .bits = HV_ACCESS_FREQUENCY_MSRS},
942             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
943              .bits = HV_FREQUENCY_MSRS_AVAILABLE}
944         }
945     },
946     [HYPERV_FEAT_REENLIGHTENMENT] = {
947         .desc = "reenlightenment MSRs (hv-reenlightenment)",
948         .flags = {
949             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
950              .bits = HV_ACCESS_REENLIGHTENMENTS_CONTROL}
951         }
952     },
953     [HYPERV_FEAT_TLBFLUSH] = {
954         .desc = "paravirtualized TLB flush (hv-tlbflush)",
955         .flags = {
956             {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
957              .bits = HV_REMOTE_TLB_FLUSH_RECOMMENDED |
958              HV_EX_PROCESSOR_MASKS_RECOMMENDED}
959         },
960         .dependencies = BIT(HYPERV_FEAT_VPINDEX)
961     },
962     [HYPERV_FEAT_EVMCS] = {
963         .desc = "enlightened VMCS (hv-evmcs)",
964         .flags = {
965             {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
966              .bits = HV_ENLIGHTENED_VMCS_RECOMMENDED}
967         },
968         .dependencies = BIT(HYPERV_FEAT_VAPIC)
969     },
970     [HYPERV_FEAT_IPI] = {
971         .desc = "paravirtualized IPI (hv-ipi)",
972         .flags = {
973             {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
974              .bits = HV_CLUSTER_IPI_RECOMMENDED |
975              HV_EX_PROCESSOR_MASKS_RECOMMENDED}
976         },
977         .dependencies = BIT(HYPERV_FEAT_VPINDEX)
978     },
979     [HYPERV_FEAT_STIMER_DIRECT] = {
980         .desc = "direct mode synthetic timers (hv-stimer-direct)",
981         .flags = {
982             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
983              .bits = HV_STIMER_DIRECT_MODE_AVAILABLE}
984         },
985         .dependencies = BIT(HYPERV_FEAT_STIMER)
986     },
987     [HYPERV_FEAT_AVIC] = {
988         .desc = "AVIC/APICv support (hv-avic/hv-apicv)",
989         .flags = {
990             {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
991              .bits = HV_DEPRECATING_AEOI_RECOMMENDED}
992         }
993     },
994 #ifdef CONFIG_SYNDBG
995     [HYPERV_FEAT_SYNDBG] = {
996         .desc = "Enable synthetic kernel debugger channel (hv-syndbg)",
997         .flags = {
998             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
999              .bits = HV_FEATURE_DEBUG_MSRS_AVAILABLE}
1000         },
1001         .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_RELAXED)
1002     },
1003 #endif
1004     [HYPERV_FEAT_MSR_BITMAP] = {
1005         .desc = "enlightened MSR-Bitmap (hv-emsr-bitmap)",
1006         .flags = {
1007             {.func = HV_CPUID_NESTED_FEATURES, .reg = R_EAX,
1008              .bits = HV_NESTED_MSR_BITMAP}
1009         }
1010     },
1011     [HYPERV_FEAT_XMM_INPUT] = {
1012         .desc = "XMM fast hypercall input (hv-xmm-input)",
1013         .flags = {
1014             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
1015              .bits = HV_HYPERCALL_XMM_INPUT_AVAILABLE}
1016         }
1017     },
1018     [HYPERV_FEAT_TLBFLUSH_EXT] = {
1019         .desc = "Extended gva ranges for TLB flush hypercalls (hv-tlbflush-ext)",
1020         .flags = {
1021             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
1022              .bits = HV_EXT_GVA_RANGES_FLUSH_AVAILABLE}
1023         },
1024         .dependencies = BIT(HYPERV_FEAT_TLBFLUSH)
1025     },
1026     [HYPERV_FEAT_TLBFLUSH_DIRECT] = {
1027         .desc = "direct TLB flush (hv-tlbflush-direct)",
1028         .flags = {
1029             {.func = HV_CPUID_NESTED_FEATURES, .reg = R_EAX,
1030              .bits = HV_NESTED_DIRECT_FLUSH}
1031         },
1032         .dependencies = BIT(HYPERV_FEAT_VAPIC)
1033     },
1034 };
1035 
1036 static struct kvm_cpuid2 *try_get_hv_cpuid(CPUState *cs, int max,
1037                                            bool do_sys_ioctl)
1038 {
1039     struct kvm_cpuid2 *cpuid;
1040     int r, size;
1041 
1042     size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
1043     cpuid = g_malloc0(size);
1044     cpuid->nent = max;
1045 
1046     if (do_sys_ioctl) {
1047         r = kvm_ioctl(kvm_state, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
1048     } else {
1049         r = kvm_vcpu_ioctl(cs, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
1050     }
1051     if (r == 0 && cpuid->nent >= max) {
1052         r = -E2BIG;
1053     }
1054     if (r < 0) {
1055         if (r == -E2BIG) {
1056             g_free(cpuid);
1057             return NULL;
1058         } else {
1059             fprintf(stderr, "KVM_GET_SUPPORTED_HV_CPUID failed: %s\n",
1060                     strerror(-r));
1061             exit(1);
1062         }
1063     }
1064     return cpuid;
1065 }
1066 
1067 /*
1068  * Run KVM_GET_SUPPORTED_HV_CPUID ioctl(), allocating a buffer large enough
1069  * for all entries.
1070  */
1071 static struct kvm_cpuid2 *get_supported_hv_cpuid(CPUState *cs)
1072 {
1073     struct kvm_cpuid2 *cpuid;
1074     /* 0x40000000..0x40000005, 0x4000000A, 0x40000080..0x40000082 leaves */
1075     int max = 11;
1076     int i;
1077     bool do_sys_ioctl;
1078 
1079     do_sys_ioctl =
1080         kvm_check_extension(kvm_state, KVM_CAP_SYS_HYPERV_CPUID) > 0;
1081 
1082     /*
1083      * Non-empty KVM context is needed when KVM_CAP_SYS_HYPERV_CPUID is
1084      * unsupported, kvm_hyperv_expand_features() checks for that.
1085      */
1086     assert(do_sys_ioctl || cs->kvm_state);
1087 
1088     /*
1089      * When the buffer is too small, KVM_GET_SUPPORTED_HV_CPUID fails with
1090      * -E2BIG, however, it doesn't report back the right size. Keep increasing
1091      * it and re-trying until we succeed.
1092      */
1093     while ((cpuid = try_get_hv_cpuid(cs, max, do_sys_ioctl)) == NULL) {
1094         max++;
1095     }
1096 
1097     /*
1098      * KVM_GET_SUPPORTED_HV_CPUID does not set EVMCS CPUID bit before
1099      * KVM_CAP_HYPERV_ENLIGHTENED_VMCS is enabled but we want to get the
1100      * information early, just check for the capability and set the bit
1101      * manually.
1102      */
1103     if (!do_sys_ioctl && kvm_check_extension(cs->kvm_state,
1104                             KVM_CAP_HYPERV_ENLIGHTENED_VMCS) > 0) {
1105         for (i = 0; i < cpuid->nent; i++) {
1106             if (cpuid->entries[i].function == HV_CPUID_ENLIGHTMENT_INFO) {
1107                 cpuid->entries[i].eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED;
1108             }
1109         }
1110     }
1111 
1112     return cpuid;
1113 }
1114 
1115 /*
1116  * When KVM_GET_SUPPORTED_HV_CPUID is not supported we fill CPUID feature
1117  * leaves from KVM_CAP_HYPERV* and present MSRs data.
1118  */
1119 static struct kvm_cpuid2 *get_supported_hv_cpuid_legacy(CPUState *cs)
1120 {
1121     X86CPU *cpu = X86_CPU(cs);
1122     struct kvm_cpuid2 *cpuid;
1123     struct kvm_cpuid_entry2 *entry_feat, *entry_recomm;
1124 
1125     /* HV_CPUID_FEATURES, HV_CPUID_ENLIGHTMENT_INFO */
1126     cpuid = g_malloc0(sizeof(*cpuid) + 2 * sizeof(*cpuid->entries));
1127     cpuid->nent = 2;
1128 
1129     /* HV_CPUID_VENDOR_AND_MAX_FUNCTIONS */
1130     entry_feat = &cpuid->entries[0];
1131     entry_feat->function = HV_CPUID_FEATURES;
1132 
1133     entry_recomm = &cpuid->entries[1];
1134     entry_recomm->function = HV_CPUID_ENLIGHTMENT_INFO;
1135     entry_recomm->ebx = cpu->hyperv_spinlock_attempts;
1136 
1137     if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV) > 0) {
1138         entry_feat->eax |= HV_HYPERCALL_AVAILABLE;
1139         entry_feat->eax |= HV_APIC_ACCESS_AVAILABLE;
1140         entry_feat->edx |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;
1141         entry_recomm->eax |= HV_RELAXED_TIMING_RECOMMENDED;
1142         entry_recomm->eax |= HV_APIC_ACCESS_RECOMMENDED;
1143     }
1144 
1145     if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV_TIME) > 0) {
1146         entry_feat->eax |= HV_TIME_REF_COUNT_AVAILABLE;
1147         entry_feat->eax |= HV_REFERENCE_TSC_AVAILABLE;
1148     }
1149 
1150     if (has_msr_hv_frequencies) {
1151         entry_feat->eax |= HV_ACCESS_FREQUENCY_MSRS;
1152         entry_feat->edx |= HV_FREQUENCY_MSRS_AVAILABLE;
1153     }
1154 
1155     if (has_msr_hv_crash) {
1156         entry_feat->edx |= HV_GUEST_CRASH_MSR_AVAILABLE;
1157     }
1158 
1159     if (has_msr_hv_reenlightenment) {
1160         entry_feat->eax |= HV_ACCESS_REENLIGHTENMENTS_CONTROL;
1161     }
1162 
1163     if (has_msr_hv_reset) {
1164         entry_feat->eax |= HV_RESET_AVAILABLE;
1165     }
1166 
1167     if (has_msr_hv_vpindex) {
1168         entry_feat->eax |= HV_VP_INDEX_AVAILABLE;
1169     }
1170 
1171     if (has_msr_hv_runtime) {
1172         entry_feat->eax |= HV_VP_RUNTIME_AVAILABLE;
1173     }
1174 
1175     if (has_msr_hv_synic) {
1176         unsigned int cap = cpu->hyperv_synic_kvm_only ?
1177             KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;
1178 
1179         if (kvm_check_extension(cs->kvm_state, cap) > 0) {
1180             entry_feat->eax |= HV_SYNIC_AVAILABLE;
1181         }
1182     }
1183 
1184     if (has_msr_hv_stimer) {
1185         entry_feat->eax |= HV_SYNTIMERS_AVAILABLE;
1186     }
1187 
1188     if (has_msr_hv_syndbg_options) {
1189         entry_feat->edx |= HV_GUEST_DEBUGGING_AVAILABLE;
1190         entry_feat->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE;
1191         entry_feat->ebx |= HV_PARTITION_DEBUGGING_ALLOWED;
1192     }
1193 
1194     if (kvm_check_extension(cs->kvm_state,
1195                             KVM_CAP_HYPERV_TLBFLUSH) > 0) {
1196         entry_recomm->eax |= HV_REMOTE_TLB_FLUSH_RECOMMENDED;
1197         entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
1198     }
1199 
1200     if (kvm_check_extension(cs->kvm_state,
1201                             KVM_CAP_HYPERV_ENLIGHTENED_VMCS) > 0) {
1202         entry_recomm->eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED;
1203     }
1204 
1205     if (kvm_check_extension(cs->kvm_state,
1206                             KVM_CAP_HYPERV_SEND_IPI) > 0) {
1207         entry_recomm->eax |= HV_CLUSTER_IPI_RECOMMENDED;
1208         entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
1209     }
1210 
1211     return cpuid;
1212 }
1213 
1214 static uint32_t hv_cpuid_get_host(CPUState *cs, uint32_t func, int reg)
1215 {
1216     struct kvm_cpuid_entry2 *entry;
1217     struct kvm_cpuid2 *cpuid;
1218 
1219     if (hv_cpuid_cache) {
1220         cpuid = hv_cpuid_cache;
1221     } else {
1222         if (kvm_check_extension(kvm_state, KVM_CAP_HYPERV_CPUID) > 0) {
1223             cpuid = get_supported_hv_cpuid(cs);
1224         } else {
1225             /*
1226              * 'cs->kvm_state' may be NULL when Hyper-V features are expanded
1227              * before KVM context is created but this is only done when
1228              * KVM_CAP_SYS_HYPERV_CPUID is supported and it implies
1229              * KVM_CAP_HYPERV_CPUID.
1230              */
1231             assert(cs->kvm_state);
1232 
1233             cpuid = get_supported_hv_cpuid_legacy(cs);
1234         }
1235         hv_cpuid_cache = cpuid;
1236     }
1237 
1238     if (!cpuid) {
1239         return 0;
1240     }
1241 
1242     entry = cpuid_find_entry(cpuid, func, 0);
1243     if (!entry) {
1244         return 0;
1245     }
1246 
1247     return cpuid_entry_get_reg(entry, reg);
1248 }
1249 
1250 static bool hyperv_feature_supported(CPUState *cs, int feature)
1251 {
1252     uint32_t func, bits;
1253     int i, reg;
1254 
1255     for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties[feature].flags); i++) {
1256 
1257         func = kvm_hyperv_properties[feature].flags[i].func;
1258         reg = kvm_hyperv_properties[feature].flags[i].reg;
1259         bits = kvm_hyperv_properties[feature].flags[i].bits;
1260 
1261         if (!func) {
1262             continue;
1263         }
1264 
1265         if ((hv_cpuid_get_host(cs, func, reg) & bits) != bits) {
1266             return false;
1267         }
1268     }
1269 
1270     return true;
1271 }
1272 
1273 /* Checks that all feature dependencies are enabled */
1274 static bool hv_feature_check_deps(X86CPU *cpu, int feature, Error **errp)
1275 {
1276     uint64_t deps;
1277     int dep_feat;
1278 
1279     deps = kvm_hyperv_properties[feature].dependencies;
1280     while (deps) {
1281         dep_feat = ctz64(deps);
1282         if (!(hyperv_feat_enabled(cpu, dep_feat))) {
1283             error_setg(errp, "Hyper-V %s requires Hyper-V %s",
1284                        kvm_hyperv_properties[feature].desc,
1285                        kvm_hyperv_properties[dep_feat].desc);
1286             return false;
1287         }
1288         deps &= ~(1ull << dep_feat);
1289     }
1290 
1291     return true;
1292 }
1293 
1294 static uint32_t hv_build_cpuid_leaf(CPUState *cs, uint32_t func, int reg)
1295 {
1296     X86CPU *cpu = X86_CPU(cs);
1297     uint32_t r = 0;
1298     int i, j;
1299 
1300     for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties); i++) {
1301         if (!hyperv_feat_enabled(cpu, i)) {
1302             continue;
1303         }
1304 
1305         for (j = 0; j < ARRAY_SIZE(kvm_hyperv_properties[i].flags); j++) {
1306             if (kvm_hyperv_properties[i].flags[j].func != func) {
1307                 continue;
1308             }
1309             if (kvm_hyperv_properties[i].flags[j].reg != reg) {
1310                 continue;
1311             }
1312 
1313             r |= kvm_hyperv_properties[i].flags[j].bits;
1314         }
1315     }
1316 
1317     /* HV_CPUID_NESTED_FEATURES.EAX also encodes the supported eVMCS range */
1318     if (func == HV_CPUID_NESTED_FEATURES && reg == R_EAX) {
1319         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
1320             r |= DEFAULT_EVMCS_VERSION;
1321         }
1322     }
1323 
1324     return r;
1325 }
1326 
1327 /*
1328  * Expand Hyper-V CPU features. In partucular, check that all the requested
1329  * features are supported by the host and the sanity of the configuration
1330  * (that all the required dependencies are included). Also, this takes care
1331  * of 'hv_passthrough' mode and fills the environment with all supported
1332  * Hyper-V features.
1333  */
1334 bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp)
1335 {
1336     CPUState *cs = CPU(cpu);
1337     Error *local_err = NULL;
1338     int feat;
1339 
1340     if (!hyperv_enabled(cpu))
1341         return true;
1342 
1343     /*
1344      * When kvm_hyperv_expand_features is called at CPU feature expansion
1345      * time per-CPU kvm_state is not available yet so we can only proceed
1346      * when KVM_CAP_SYS_HYPERV_CPUID is supported.
1347      */
1348     if (!cs->kvm_state &&
1349         !kvm_check_extension(kvm_state, KVM_CAP_SYS_HYPERV_CPUID))
1350         return true;
1351 
1352     if (cpu->hyperv_passthrough) {
1353         cpu->hyperv_vendor_id[0] =
1354             hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_EBX);
1355         cpu->hyperv_vendor_id[1] =
1356             hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_ECX);
1357         cpu->hyperv_vendor_id[2] =
1358             hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_EDX);
1359         cpu->hyperv_vendor = g_realloc(cpu->hyperv_vendor,
1360                                        sizeof(cpu->hyperv_vendor_id) + 1);
1361         memcpy(cpu->hyperv_vendor, cpu->hyperv_vendor_id,
1362                sizeof(cpu->hyperv_vendor_id));
1363         cpu->hyperv_vendor[sizeof(cpu->hyperv_vendor_id)] = 0;
1364 
1365         cpu->hyperv_interface_id[0] =
1366             hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EAX);
1367         cpu->hyperv_interface_id[1] =
1368             hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EBX);
1369         cpu->hyperv_interface_id[2] =
1370             hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_ECX);
1371         cpu->hyperv_interface_id[3] =
1372             hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EDX);
1373 
1374         cpu->hyperv_ver_id_build =
1375             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EAX);
1376         cpu->hyperv_ver_id_major =
1377             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EBX) >> 16;
1378         cpu->hyperv_ver_id_minor =
1379             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EBX) & 0xffff;
1380         cpu->hyperv_ver_id_sp =
1381             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_ECX);
1382         cpu->hyperv_ver_id_sb =
1383             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EDX) >> 24;
1384         cpu->hyperv_ver_id_sn =
1385             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EDX) & 0xffffff;
1386 
1387         cpu->hv_max_vps = hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS,
1388                                             R_EAX);
1389         cpu->hyperv_limits[0] =
1390             hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_EBX);
1391         cpu->hyperv_limits[1] =
1392             hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_ECX);
1393         cpu->hyperv_limits[2] =
1394             hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_EDX);
1395 
1396         cpu->hyperv_spinlock_attempts =
1397             hv_cpuid_get_host(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EBX);
1398 
1399         /*
1400          * Mark feature as enabled in 'cpu->hyperv_features' as
1401          * hv_build_cpuid_leaf() uses this info to build guest CPUIDs.
1402          */
1403         for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) {
1404             if (hyperv_feature_supported(cs, feat)) {
1405                 cpu->hyperv_features |= BIT(feat);
1406             }
1407         }
1408     } else {
1409         /* Check features availability and dependencies */
1410         for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) {
1411             /* If the feature was not requested skip it. */
1412             if (!hyperv_feat_enabled(cpu, feat)) {
1413                 continue;
1414             }
1415 
1416             /* Check if the feature is supported by KVM */
1417             if (!hyperv_feature_supported(cs, feat)) {
1418                 error_setg(errp, "Hyper-V %s is not supported by kernel",
1419                            kvm_hyperv_properties[feat].desc);
1420                 return false;
1421             }
1422 
1423             /* Check dependencies */
1424             if (!hv_feature_check_deps(cpu, feat, &local_err)) {
1425                 error_propagate(errp, local_err);
1426                 return false;
1427             }
1428         }
1429     }
1430 
1431     /* Additional dependencies not covered by kvm_hyperv_properties[] */
1432     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) &&
1433         !cpu->hyperv_synic_kvm_only &&
1434         !hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)) {
1435         error_setg(errp, "Hyper-V %s requires Hyper-V %s",
1436                    kvm_hyperv_properties[HYPERV_FEAT_SYNIC].desc,
1437                    kvm_hyperv_properties[HYPERV_FEAT_VPINDEX].desc);
1438         return false;
1439     }
1440 
1441     return true;
1442 }
1443 
1444 /*
1445  * Fill in Hyper-V CPUIDs. Returns the number of entries filled in cpuid_ent.
1446  */
1447 static int hyperv_fill_cpuids(CPUState *cs,
1448                               struct kvm_cpuid_entry2 *cpuid_ent)
1449 {
1450     X86CPU *cpu = X86_CPU(cs);
1451     struct kvm_cpuid_entry2 *c;
1452     uint32_t signature[3];
1453     uint32_t cpuid_i = 0, max_cpuid_leaf = 0;
1454     uint32_t nested_eax =
1455         hv_build_cpuid_leaf(cs, HV_CPUID_NESTED_FEATURES, R_EAX);
1456 
1457     max_cpuid_leaf = nested_eax ? HV_CPUID_NESTED_FEATURES :
1458         HV_CPUID_IMPLEMENT_LIMITS;
1459 
1460     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG)) {
1461         max_cpuid_leaf =
1462             MAX(max_cpuid_leaf, HV_CPUID_SYNDBG_PLATFORM_CAPABILITIES);
1463     }
1464 
1465     c = &cpuid_ent[cpuid_i++];
1466     c->function = HV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
1467     c->eax = max_cpuid_leaf;
1468     c->ebx = cpu->hyperv_vendor_id[0];
1469     c->ecx = cpu->hyperv_vendor_id[1];
1470     c->edx = cpu->hyperv_vendor_id[2];
1471 
1472     c = &cpuid_ent[cpuid_i++];
1473     c->function = HV_CPUID_INTERFACE;
1474     c->eax = cpu->hyperv_interface_id[0];
1475     c->ebx = cpu->hyperv_interface_id[1];
1476     c->ecx = cpu->hyperv_interface_id[2];
1477     c->edx = cpu->hyperv_interface_id[3];
1478 
1479     c = &cpuid_ent[cpuid_i++];
1480     c->function = HV_CPUID_VERSION;
1481     c->eax = cpu->hyperv_ver_id_build;
1482     c->ebx = (uint32_t)cpu->hyperv_ver_id_major << 16 |
1483         cpu->hyperv_ver_id_minor;
1484     c->ecx = cpu->hyperv_ver_id_sp;
1485     c->edx = (uint32_t)cpu->hyperv_ver_id_sb << 24 |
1486         (cpu->hyperv_ver_id_sn & 0xffffff);
1487 
1488     c = &cpuid_ent[cpuid_i++];
1489     c->function = HV_CPUID_FEATURES;
1490     c->eax = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EAX);
1491     c->ebx = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EBX);
1492     c->edx = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EDX);
1493 
1494     /* Unconditionally required with any Hyper-V enlightenment */
1495     c->eax |= HV_HYPERCALL_AVAILABLE;
1496 
1497     /* SynIC and Vmbus devices require messages/signals hypercalls */
1498     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) &&
1499         !cpu->hyperv_synic_kvm_only) {
1500         c->ebx |= HV_POST_MESSAGES | HV_SIGNAL_EVENTS;
1501     }
1502 
1503 
1504     /* Not exposed by KVM but needed to make CPU hotplug in Windows work */
1505     c->edx |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;
1506 
1507     c = &cpuid_ent[cpuid_i++];
1508     c->function = HV_CPUID_ENLIGHTMENT_INFO;
1509     c->eax = hv_build_cpuid_leaf(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EAX);
1510     c->ebx = cpu->hyperv_spinlock_attempts;
1511 
1512     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC) &&
1513         !hyperv_feat_enabled(cpu, HYPERV_FEAT_AVIC)) {
1514         c->eax |= HV_APIC_ACCESS_RECOMMENDED;
1515     }
1516 
1517     if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_ON) {
1518         c->eax |= HV_NO_NONARCH_CORESHARING;
1519     } else if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO) {
1520         c->eax |= hv_cpuid_get_host(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EAX) &
1521             HV_NO_NONARCH_CORESHARING;
1522     }
1523 
1524     c = &cpuid_ent[cpuid_i++];
1525     c->function = HV_CPUID_IMPLEMENT_LIMITS;
1526     c->eax = cpu->hv_max_vps;
1527     c->ebx = cpu->hyperv_limits[0];
1528     c->ecx = cpu->hyperv_limits[1];
1529     c->edx = cpu->hyperv_limits[2];
1530 
1531     if (nested_eax) {
1532         uint32_t function;
1533 
1534         /* Create zeroed 0x40000006..0x40000009 leaves */
1535         for (function = HV_CPUID_IMPLEMENT_LIMITS + 1;
1536              function < HV_CPUID_NESTED_FEATURES; function++) {
1537             c = &cpuid_ent[cpuid_i++];
1538             c->function = function;
1539         }
1540 
1541         c = &cpuid_ent[cpuid_i++];
1542         c->function = HV_CPUID_NESTED_FEATURES;
1543         c->eax = nested_eax;
1544     }
1545 
1546     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG)) {
1547         c = &cpuid_ent[cpuid_i++];
1548         c->function = HV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS;
1549         c->eax = hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS) ?
1550             HV_CPUID_NESTED_FEATURES : HV_CPUID_IMPLEMENT_LIMITS;
1551         memcpy(signature, "Microsoft VS", 12);
1552         c->eax = 0;
1553         c->ebx = signature[0];
1554         c->ecx = signature[1];
1555         c->edx = signature[2];
1556 
1557         c = &cpuid_ent[cpuid_i++];
1558         c->function = HV_CPUID_SYNDBG_INTERFACE;
1559         memcpy(signature, "VS#1\0\0\0\0\0\0\0\0", 12);
1560         c->eax = signature[0];
1561         c->ebx = 0;
1562         c->ecx = 0;
1563         c->edx = 0;
1564 
1565         c = &cpuid_ent[cpuid_i++];
1566         c->function = HV_CPUID_SYNDBG_PLATFORM_CAPABILITIES;
1567         c->eax = HV_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING;
1568         c->ebx = 0;
1569         c->ecx = 0;
1570         c->edx = 0;
1571     }
1572 
1573     return cpuid_i;
1574 }
1575 
1576 static Error *hv_passthrough_mig_blocker;
1577 static Error *hv_no_nonarch_cs_mig_blocker;
1578 
1579 /* Checks that the exposed eVMCS version range is supported by KVM */
1580 static bool evmcs_version_supported(uint16_t evmcs_version,
1581                                     uint16_t supported_evmcs_version)
1582 {
1583     uint8_t min_version = evmcs_version & 0xff;
1584     uint8_t max_version = evmcs_version >> 8;
1585     uint8_t min_supported_version = supported_evmcs_version & 0xff;
1586     uint8_t max_supported_version = supported_evmcs_version >> 8;
1587 
1588     return (min_version >= min_supported_version) &&
1589         (max_version <= max_supported_version);
1590 }
1591 
1592 static int hyperv_init_vcpu(X86CPU *cpu)
1593 {
1594     CPUState *cs = CPU(cpu);
1595     Error *local_err = NULL;
1596     int ret;
1597 
1598     if (cpu->hyperv_passthrough && hv_passthrough_mig_blocker == NULL) {
1599         error_setg(&hv_passthrough_mig_blocker,
1600                    "'hv-passthrough' CPU flag prevents migration, use explicit"
1601                    " set of hv-* flags instead");
1602         ret = migrate_add_blocker(&hv_passthrough_mig_blocker, &local_err);
1603         if (ret < 0) {
1604             error_report_err(local_err);
1605             return ret;
1606         }
1607     }
1608 
1609     if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO &&
1610         hv_no_nonarch_cs_mig_blocker == NULL) {
1611         error_setg(&hv_no_nonarch_cs_mig_blocker,
1612                    "'hv-no-nonarch-coresharing=auto' CPU flag prevents migration"
1613                    " use explicit 'hv-no-nonarch-coresharing=on' instead (but"
1614                    " make sure SMT is disabled and/or that vCPUs are properly"
1615                    " pinned)");
1616         ret = migrate_add_blocker(&hv_no_nonarch_cs_mig_blocker, &local_err);
1617         if (ret < 0) {
1618             error_report_err(local_err);
1619             return ret;
1620         }
1621     }
1622 
1623     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) && !hv_vpindex_settable) {
1624         /*
1625          * the kernel doesn't support setting vp_index; assert that its value
1626          * is in sync
1627          */
1628         uint64_t value;
1629 
1630         ret = kvm_get_one_msr(cpu, HV_X64_MSR_VP_INDEX, &value);
1631         if (ret < 0) {
1632             return ret;
1633         }
1634 
1635         if (value != hyperv_vp_index(CPU(cpu))) {
1636             error_report("kernel's vp_index != QEMU's vp_index");
1637             return -ENXIO;
1638         }
1639     }
1640 
1641     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
1642         uint32_t synic_cap = cpu->hyperv_synic_kvm_only ?
1643             KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;
1644         ret = kvm_vcpu_enable_cap(cs, synic_cap, 0);
1645         if (ret < 0) {
1646             error_report("failed to turn on HyperV SynIC in KVM: %s",
1647                          strerror(-ret));
1648             return ret;
1649         }
1650 
1651         if (!cpu->hyperv_synic_kvm_only) {
1652             ret = hyperv_x86_synic_add(cpu);
1653             if (ret < 0) {
1654                 error_report("failed to create HyperV SynIC: %s",
1655                              strerror(-ret));
1656                 return ret;
1657             }
1658         }
1659     }
1660 
1661     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
1662         uint16_t evmcs_version = DEFAULT_EVMCS_VERSION;
1663         uint16_t supported_evmcs_version;
1664 
1665         ret = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENLIGHTENED_VMCS, 0,
1666                                   (uintptr_t)&supported_evmcs_version);
1667 
1668         /*
1669          * KVM is required to support EVMCS ver.1. as that's what 'hv-evmcs'
1670          * option sets. Note: we hardcode the maximum supported eVMCS version
1671          * to '1' as well so 'hv-evmcs' feature is migratable even when (and if)
1672          * ver.2 is implemented. A new option (e.g. 'hv-evmcs=2') will then have
1673          * to be added.
1674          */
1675         if (ret < 0) {
1676             error_report("Hyper-V %s is not supported by kernel",
1677                          kvm_hyperv_properties[HYPERV_FEAT_EVMCS].desc);
1678             return ret;
1679         }
1680 
1681         if (!evmcs_version_supported(evmcs_version, supported_evmcs_version)) {
1682             error_report("eVMCS version range [%d..%d] is not supported by "
1683                          "kernel (supported: [%d..%d])", evmcs_version & 0xff,
1684                          evmcs_version >> 8, supported_evmcs_version & 0xff,
1685                          supported_evmcs_version >> 8);
1686             return -ENOTSUP;
1687         }
1688     }
1689 
1690     if (cpu->hyperv_enforce_cpuid) {
1691         ret = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENFORCE_CPUID, 0, 1);
1692         if (ret < 0) {
1693             error_report("failed to enable KVM_CAP_HYPERV_ENFORCE_CPUID: %s",
1694                          strerror(-ret));
1695             return ret;
1696         }
1697     }
1698 
1699     /* Skip SynIC and VP_INDEX since they are hard deps already */
1700     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_STIMER) &&
1701         hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC) &&
1702         hyperv_feat_enabled(cpu, HYPERV_FEAT_RUNTIME)) {
1703         hyperv_x86_set_vmbus_recommended_features_enabled();
1704     }
1705 
1706     return 0;
1707 }
1708 
1709 static Error *invtsc_mig_blocker;
1710 
1711 #define KVM_MAX_CPUID_ENTRIES  100
1712 
1713 static void kvm_init_xsave(CPUX86State *env)
1714 {
1715     if (has_xsave2) {
1716         env->xsave_buf_len = QEMU_ALIGN_UP(has_xsave2, 4096);
1717     } else {
1718         env->xsave_buf_len = sizeof(struct kvm_xsave);
1719     }
1720 
1721     env->xsave_buf = qemu_memalign(4096, env->xsave_buf_len);
1722     memset(env->xsave_buf, 0, env->xsave_buf_len);
1723     /*
1724      * The allocated storage must be large enough for all of the
1725      * possible XSAVE state components.
1726      */
1727     assert(kvm_arch_get_supported_cpuid(kvm_state, 0xd, 0, R_ECX) <=
1728            env->xsave_buf_len);
1729 }
1730 
1731 static void kvm_init_nested_state(CPUX86State *env)
1732 {
1733     struct kvm_vmx_nested_state_hdr *vmx_hdr;
1734     uint32_t size;
1735 
1736     if (!env->nested_state) {
1737         return;
1738     }
1739 
1740     size = env->nested_state->size;
1741 
1742     memset(env->nested_state, 0, size);
1743     env->nested_state->size = size;
1744 
1745     if (cpu_has_vmx(env)) {
1746         env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX;
1747         vmx_hdr = &env->nested_state->hdr.vmx;
1748         vmx_hdr->vmxon_pa = -1ull;
1749         vmx_hdr->vmcs12_pa = -1ull;
1750     } else if (cpu_has_svm(env)) {
1751         env->nested_state->format = KVM_STATE_NESTED_FORMAT_SVM;
1752     }
1753 }
1754 
1755 static uint32_t kvm_x86_build_cpuid(CPUX86State *env,
1756                                     struct kvm_cpuid_entry2 *entries,
1757                                     uint32_t cpuid_i)
1758 {
1759     uint32_t limit, i, j;
1760     uint32_t unused;
1761     struct kvm_cpuid_entry2 *c;
1762 
1763     cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused);
1764 
1765     for (i = 0; i <= limit; i++) {
1766         j = 0;
1767         if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1768             goto full;
1769         }
1770         c = &entries[cpuid_i++];
1771         switch (i) {
1772         case 2: {
1773             /* Keep reading function 2 till all the input is received */
1774             int times;
1775 
1776             c->function = i;
1777             c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
1778                        KVM_CPUID_FLAG_STATE_READ_NEXT;
1779             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1780             times = c->eax & 0xff;
1781 
1782             for (j = 1; j < times; ++j) {
1783                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1784                     goto full;
1785                 }
1786                 c = &entries[cpuid_i++];
1787                 c->function = i;
1788                 c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC;
1789                 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1790             }
1791             break;
1792         }
1793         case 0x1f:
1794             if (env->nr_dies < 2) {
1795                 cpuid_i--;
1796                 break;
1797             }
1798             /* fallthrough */
1799         case 4:
1800         case 0xb:
1801         case 0xd:
1802             for (j = 0; ; j++) {
1803                 if (i == 0xd && j == 64) {
1804                     break;
1805                 }
1806 
1807                 c->function = i;
1808                 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1809                 c->index = j;
1810                 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1811 
1812                 if (i == 4 && c->eax == 0) {
1813                     break;
1814                 }
1815                 if (i == 0xb && !(c->ecx & 0xff00)) {
1816                     break;
1817                 }
1818                 if (i == 0x1f && !(c->ecx & 0xff00)) {
1819                     break;
1820                 }
1821                 if (i == 0xd && c->eax == 0) {
1822                     continue;
1823                 }
1824                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1825                     goto full;
1826                 }
1827                 c = &entries[cpuid_i++];
1828             }
1829             break;
1830         case 0x12:
1831             for (j = 0; ; j++) {
1832                 c->function = i;
1833                 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1834                 c->index = j;
1835                 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1836 
1837                 if (j > 1 && (c->eax & 0xf) != 1) {
1838                     break;
1839                 }
1840 
1841                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1842                     goto full;
1843                 }
1844                 c = &entries[cpuid_i++];
1845             }
1846             break;
1847         case 0x7:
1848         case 0x14:
1849         case 0x1d:
1850         case 0x1e: {
1851             uint32_t times;
1852 
1853             c->function = i;
1854             c->index = 0;
1855             c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1856             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1857             times = c->eax;
1858 
1859             for (j = 1; j <= times; ++j) {
1860                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1861                     goto full;
1862                 }
1863                 c = &entries[cpuid_i++];
1864                 c->function = i;
1865                 c->index = j;
1866                 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1867                 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1868             }
1869             break;
1870         }
1871         default:
1872             c->function = i;
1873             c->flags = 0;
1874             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1875             if (!c->eax && !c->ebx && !c->ecx && !c->edx) {
1876                 /*
1877                  * KVM already returns all zeroes if a CPUID entry is missing,
1878                  * so we can omit it and avoid hitting KVM's 80-entry limit.
1879                  */
1880                 cpuid_i--;
1881             }
1882             break;
1883         }
1884     }
1885 
1886     if (limit >= 0x0a) {
1887         uint32_t eax, edx;
1888 
1889         cpu_x86_cpuid(env, 0x0a, 0, &eax, &unused, &unused, &edx);
1890 
1891         has_architectural_pmu_version = eax & 0xff;
1892         if (has_architectural_pmu_version > 0) {
1893             num_architectural_pmu_gp_counters = (eax & 0xff00) >> 8;
1894 
1895             /* Shouldn't be more than 32, since that's the number of bits
1896              * available in EBX to tell us _which_ counters are available.
1897              * Play it safe.
1898              */
1899             if (num_architectural_pmu_gp_counters > MAX_GP_COUNTERS) {
1900                 num_architectural_pmu_gp_counters = MAX_GP_COUNTERS;
1901             }
1902 
1903             if (has_architectural_pmu_version > 1) {
1904                 num_architectural_pmu_fixed_counters = edx & 0x1f;
1905 
1906                 if (num_architectural_pmu_fixed_counters > MAX_FIXED_COUNTERS) {
1907                     num_architectural_pmu_fixed_counters = MAX_FIXED_COUNTERS;
1908                 }
1909             }
1910         }
1911     }
1912 
1913     cpu_x86_cpuid(env, 0x80000000, 0, &limit, &unused, &unused, &unused);
1914 
1915     for (i = 0x80000000; i <= limit; i++) {
1916         j = 0;
1917         if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1918             goto full;
1919         }
1920         c = &entries[cpuid_i++];
1921 
1922         switch (i) {
1923         case 0x8000001d:
1924             /* Query for all AMD cache information leaves */
1925             for (j = 0; ; j++) {
1926                 c->function = i;
1927                 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1928                 c->index = j;
1929                 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1930 
1931                 if (c->eax == 0) {
1932                     break;
1933                 }
1934                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1935                     goto full;
1936                 }
1937                 c = &entries[cpuid_i++];
1938             }
1939             break;
1940         default:
1941             c->function = i;
1942             c->flags = 0;
1943             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1944             if (!c->eax && !c->ebx && !c->ecx && !c->edx) {
1945                 /*
1946                  * KVM already returns all zeroes if a CPUID entry is missing,
1947                  * so we can omit it and avoid hitting KVM's 80-entry limit.
1948                  */
1949                 cpuid_i--;
1950             }
1951             break;
1952         }
1953     }
1954 
1955     /* Call Centaur's CPUID instructions they are supported. */
1956     if (env->cpuid_xlevel2 > 0) {
1957         cpu_x86_cpuid(env, 0xC0000000, 0, &limit, &unused, &unused, &unused);
1958 
1959         for (i = 0xC0000000; i <= limit; i++) {
1960             j = 0;
1961             if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1962                 goto full;
1963             }
1964             c = &entries[cpuid_i++];
1965 
1966             c->function = i;
1967             c->flags = 0;
1968             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1969         }
1970     }
1971 
1972     return cpuid_i;
1973 
1974 full:
1975     fprintf(stderr, "cpuid_data is full, no space for "
1976             "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
1977     abort();
1978 }
1979 
1980 int kvm_arch_init_vcpu(CPUState *cs)
1981 {
1982     struct {
1983         struct kvm_cpuid2 cpuid;
1984         struct kvm_cpuid_entry2 entries[KVM_MAX_CPUID_ENTRIES];
1985     } cpuid_data;
1986     /*
1987      * The kernel defines these structs with padding fields so there
1988      * should be no extra padding in our cpuid_data struct.
1989      */
1990     QEMU_BUILD_BUG_ON(sizeof(cpuid_data) !=
1991                       sizeof(struct kvm_cpuid2) +
1992                       sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES);
1993 
1994     X86CPU *cpu = X86_CPU(cs);
1995     CPUX86State *env = &cpu->env;
1996     uint32_t cpuid_i;
1997     struct kvm_cpuid_entry2 *c;
1998     uint32_t signature[3];
1999     int kvm_base = KVM_CPUID_SIGNATURE;
2000     int max_nested_state_len;
2001     int r;
2002     Error *local_err = NULL;
2003 
2004     memset(&cpuid_data, 0, sizeof(cpuid_data));
2005 
2006     cpuid_i = 0;
2007 
2008     has_xsave2 = kvm_check_extension(cs->kvm_state, KVM_CAP_XSAVE2);
2009 
2010     r = kvm_arch_set_tsc_khz(cs);
2011     if (r < 0) {
2012         return r;
2013     }
2014 
2015     /* vcpu's TSC frequency is either specified by user, or following
2016      * the value used by KVM if the former is not present. In the
2017      * latter case, we query it from KVM and record in env->tsc_khz,
2018      * so that vcpu's TSC frequency can be migrated later via this field.
2019      */
2020     if (!env->tsc_khz) {
2021         r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
2022             kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
2023             -ENOTSUP;
2024         if (r > 0) {
2025             env->tsc_khz = r;
2026         }
2027     }
2028 
2029     env->apic_bus_freq = KVM_APIC_BUS_FREQUENCY;
2030 
2031     /*
2032      * kvm_hyperv_expand_features() is called here for the second time in case
2033      * KVM_CAP_SYS_HYPERV_CPUID is not supported. While we can't possibly handle
2034      * 'query-cpu-model-expansion' in this case as we don't have a KVM vCPU to
2035      * check which Hyper-V enlightenments are supported and which are not, we
2036      * can still proceed and check/expand Hyper-V enlightenments here so legacy
2037      * behavior is preserved.
2038      */
2039     if (!kvm_hyperv_expand_features(cpu, &local_err)) {
2040         error_report_err(local_err);
2041         return -ENOSYS;
2042     }
2043 
2044     if (hyperv_enabled(cpu)) {
2045         r = hyperv_init_vcpu(cpu);
2046         if (r) {
2047             return r;
2048         }
2049 
2050         cpuid_i = hyperv_fill_cpuids(cs, cpuid_data.entries);
2051         kvm_base = KVM_CPUID_SIGNATURE_NEXT;
2052         has_msr_hv_hypercall = true;
2053     }
2054 
2055     if (cs->kvm_state->xen_version) {
2056 #ifdef CONFIG_XEN_EMU
2057         struct kvm_cpuid_entry2 *xen_max_leaf;
2058 
2059         memcpy(signature, "XenVMMXenVMM", 12);
2060 
2061         xen_max_leaf = c = &cpuid_data.entries[cpuid_i++];
2062         c->function = kvm_base + XEN_CPUID_SIGNATURE;
2063         c->eax = kvm_base + XEN_CPUID_TIME;
2064         c->ebx = signature[0];
2065         c->ecx = signature[1];
2066         c->edx = signature[2];
2067 
2068         c = &cpuid_data.entries[cpuid_i++];
2069         c->function = kvm_base + XEN_CPUID_VENDOR;
2070         c->eax = cs->kvm_state->xen_version;
2071         c->ebx = 0;
2072         c->ecx = 0;
2073         c->edx = 0;
2074 
2075         c = &cpuid_data.entries[cpuid_i++];
2076         c->function = kvm_base + XEN_CPUID_HVM_MSR;
2077         /* Number of hypercall-transfer pages */
2078         c->eax = 1;
2079         /* Hypercall MSR base address */
2080         if (hyperv_enabled(cpu)) {
2081             c->ebx = XEN_HYPERCALL_MSR_HYPERV;
2082             kvm_xen_init(cs->kvm_state, c->ebx);
2083         } else {
2084             c->ebx = XEN_HYPERCALL_MSR;
2085         }
2086         c->ecx = 0;
2087         c->edx = 0;
2088 
2089         c = &cpuid_data.entries[cpuid_i++];
2090         c->function = kvm_base + XEN_CPUID_TIME;
2091         c->eax = ((!!tsc_is_stable_and_known(env) << 1) |
2092             (!!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP) << 2));
2093         /* default=0 (emulate if necessary) */
2094         c->ebx = 0;
2095         /* guest tsc frequency */
2096         c->ecx = env->user_tsc_khz;
2097         /* guest tsc incarnation (migration count) */
2098         c->edx = 0;
2099 
2100         c = &cpuid_data.entries[cpuid_i++];
2101         c->function = kvm_base + XEN_CPUID_HVM;
2102         xen_max_leaf->eax = kvm_base + XEN_CPUID_HVM;
2103         if (cs->kvm_state->xen_version >= XEN_VERSION(4, 5)) {
2104             c->function = kvm_base + XEN_CPUID_HVM;
2105 
2106             if (cpu->xen_vapic) {
2107                 c->eax |= XEN_HVM_CPUID_APIC_ACCESS_VIRT;
2108                 c->eax |= XEN_HVM_CPUID_X2APIC_VIRT;
2109             }
2110 
2111             c->eax |= XEN_HVM_CPUID_IOMMU_MAPPINGS;
2112 
2113             if (cs->kvm_state->xen_version >= XEN_VERSION(4, 6)) {
2114                 c->eax |= XEN_HVM_CPUID_VCPU_ID_PRESENT;
2115                 c->ebx = cs->cpu_index;
2116             }
2117 
2118             if (cs->kvm_state->xen_version >= XEN_VERSION(4, 17)) {
2119                 c->eax |= XEN_HVM_CPUID_UPCALL_VECTOR;
2120             }
2121         }
2122 
2123         r = kvm_xen_init_vcpu(cs);
2124         if (r) {
2125             return r;
2126         }
2127 
2128         kvm_base += 0x100;
2129 #else /* CONFIG_XEN_EMU */
2130         /* This should never happen as kvm_arch_init() would have died first. */
2131         fprintf(stderr, "Cannot enable Xen CPUID without Xen support\n");
2132         abort();
2133 #endif
2134     } else if (cpu->expose_kvm) {
2135         memcpy(signature, "KVMKVMKVM\0\0\0", 12);
2136         c = &cpuid_data.entries[cpuid_i++];
2137         c->function = KVM_CPUID_SIGNATURE | kvm_base;
2138         c->eax = KVM_CPUID_FEATURES | kvm_base;
2139         c->ebx = signature[0];
2140         c->ecx = signature[1];
2141         c->edx = signature[2];
2142 
2143         c = &cpuid_data.entries[cpuid_i++];
2144         c->function = KVM_CPUID_FEATURES | kvm_base;
2145         c->eax = env->features[FEAT_KVM];
2146         c->edx = env->features[FEAT_KVM_HINTS];
2147     }
2148 
2149     if (cpu->kvm_pv_enforce_cpuid) {
2150         r = kvm_vcpu_enable_cap(cs, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, 0, 1);
2151         if (r < 0) {
2152             fprintf(stderr,
2153                     "failed to enable KVM_CAP_ENFORCE_PV_FEATURE_CPUID: %s",
2154                     strerror(-r));
2155             abort();
2156         }
2157     }
2158 
2159     cpuid_i = kvm_x86_build_cpuid(env, cpuid_data.entries, cpuid_i);
2160     cpuid_data.cpuid.nent = cpuid_i;
2161 
2162     if (((env->cpuid_version >> 8)&0xF) >= 6
2163         && (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) ==
2164            (CPUID_MCE | CPUID_MCA)) {
2165         uint64_t mcg_cap, unsupported_caps;
2166         int banks;
2167         int ret;
2168 
2169         ret = kvm_get_mce_cap_supported(cs->kvm_state, &mcg_cap, &banks);
2170         if (ret < 0) {
2171             fprintf(stderr, "kvm_get_mce_cap_supported: %s", strerror(-ret));
2172             return ret;
2173         }
2174 
2175         if (banks < (env->mcg_cap & MCG_CAP_BANKS_MASK)) {
2176             error_report("kvm: Unsupported MCE bank count (QEMU = %d, KVM = %d)",
2177                          (int)(env->mcg_cap & MCG_CAP_BANKS_MASK), banks);
2178             return -ENOTSUP;
2179         }
2180 
2181         unsupported_caps = env->mcg_cap & ~(mcg_cap | MCG_CAP_BANKS_MASK);
2182         if (unsupported_caps) {
2183             if (unsupported_caps & MCG_LMCE_P) {
2184                 error_report("kvm: LMCE not supported");
2185                 return -ENOTSUP;
2186             }
2187             warn_report("Unsupported MCG_CAP bits: 0x%" PRIx64,
2188                         unsupported_caps);
2189         }
2190 
2191         env->mcg_cap &= mcg_cap | MCG_CAP_BANKS_MASK;
2192         ret = kvm_vcpu_ioctl(cs, KVM_X86_SETUP_MCE, &env->mcg_cap);
2193         if (ret < 0) {
2194             fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret));
2195             return ret;
2196         }
2197     }
2198 
2199     cpu->vmsentry = qemu_add_vm_change_state_handler(cpu_update_state, env);
2200 
2201     c = cpuid_find_entry(&cpuid_data.cpuid, 1, 0);
2202     if (c) {
2203         has_msr_feature_control = !!(c->ecx & CPUID_EXT_VMX) ||
2204                                   !!(c->ecx & CPUID_EXT_SMX);
2205     }
2206 
2207     c = cpuid_find_entry(&cpuid_data.cpuid, 7, 0);
2208     if (c && (c->ebx & CPUID_7_0_EBX_SGX)) {
2209         has_msr_feature_control = true;
2210     }
2211 
2212     if (env->mcg_cap & MCG_LMCE_P) {
2213         has_msr_mcg_ext_ctl = has_msr_feature_control = true;
2214     }
2215 
2216     if (!env->user_tsc_khz) {
2217         if ((env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) &&
2218             invtsc_mig_blocker == NULL) {
2219             error_setg(&invtsc_mig_blocker,
2220                        "State blocked by non-migratable CPU device"
2221                        " (invtsc flag)");
2222             r = migrate_add_blocker(&invtsc_mig_blocker, &local_err);
2223             if (r < 0) {
2224                 error_report_err(local_err);
2225                 return r;
2226             }
2227         }
2228     }
2229 
2230     if (cpu->vmware_cpuid_freq
2231         /* Guests depend on 0x40000000 to detect this feature, so only expose
2232          * it if KVM exposes leaf 0x40000000. (Conflicts with Hyper-V) */
2233         && cpu->expose_kvm
2234         && kvm_base == KVM_CPUID_SIGNATURE
2235         /* TSC clock must be stable and known for this feature. */
2236         && tsc_is_stable_and_known(env)) {
2237 
2238         c = &cpuid_data.entries[cpuid_i++];
2239         c->function = KVM_CPUID_SIGNATURE | 0x10;
2240         c->eax = env->tsc_khz;
2241         c->ebx = env->apic_bus_freq / 1000; /* Hz to KHz */
2242         c->ecx = c->edx = 0;
2243 
2244         c = cpuid_find_entry(&cpuid_data.cpuid, kvm_base, 0);
2245         c->eax = MAX(c->eax, KVM_CPUID_SIGNATURE | 0x10);
2246     }
2247 
2248     cpuid_data.cpuid.nent = cpuid_i;
2249 
2250     cpuid_data.cpuid.padding = 0;
2251     r = kvm_vcpu_ioctl(cs, KVM_SET_CPUID2, &cpuid_data);
2252     if (r) {
2253         goto fail;
2254     }
2255     kvm_init_xsave(env);
2256 
2257     max_nested_state_len = kvm_max_nested_state_length();
2258     if (max_nested_state_len > 0) {
2259         assert(max_nested_state_len >= offsetof(struct kvm_nested_state, data));
2260 
2261         if (cpu_has_vmx(env) || cpu_has_svm(env)) {
2262             env->nested_state = g_malloc0(max_nested_state_len);
2263             env->nested_state->size = max_nested_state_len;
2264 
2265             kvm_init_nested_state(env);
2266         }
2267     }
2268 
2269     cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE);
2270 
2271     if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) {
2272         has_msr_tsc_aux = false;
2273     }
2274 
2275     kvm_init_msrs(cpu);
2276 
2277     return 0;
2278 
2279  fail:
2280     migrate_del_blocker(&invtsc_mig_blocker);
2281 
2282     return r;
2283 }
2284 
2285 int kvm_arch_destroy_vcpu(CPUState *cs)
2286 {
2287     X86CPU *cpu = X86_CPU(cs);
2288     CPUX86State *env = &cpu->env;
2289 
2290     g_free(env->xsave_buf);
2291 
2292     g_free(cpu->kvm_msr_buf);
2293     cpu->kvm_msr_buf = NULL;
2294 
2295     g_free(env->nested_state);
2296     env->nested_state = NULL;
2297 
2298     qemu_del_vm_change_state_handler(cpu->vmsentry);
2299 
2300     return 0;
2301 }
2302 
2303 void kvm_arch_reset_vcpu(X86CPU *cpu)
2304 {
2305     CPUX86State *env = &cpu->env;
2306 
2307     env->xcr0 = 1;
2308     if (kvm_irqchip_in_kernel()) {
2309         env->mp_state = cpu_is_bsp(cpu) ? KVM_MP_STATE_RUNNABLE :
2310                                           KVM_MP_STATE_UNINITIALIZED;
2311     } else {
2312         env->mp_state = KVM_MP_STATE_RUNNABLE;
2313     }
2314 
2315     /* enabled by default */
2316     env->poll_control_msr = 1;
2317 
2318     kvm_init_nested_state(env);
2319 
2320     sev_es_set_reset_vector(CPU(cpu));
2321 }
2322 
2323 void kvm_arch_after_reset_vcpu(X86CPU *cpu)
2324 {
2325     CPUX86State *env = &cpu->env;
2326     int i;
2327 
2328     /*
2329      * Reset SynIC after all other devices have been reset to let them remove
2330      * their SINT routes first.
2331      */
2332     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
2333         for (i = 0; i < ARRAY_SIZE(env->msr_hv_synic_sint); i++) {
2334             env->msr_hv_synic_sint[i] = HV_SINT_MASKED;
2335         }
2336 
2337         hyperv_x86_synic_reset(cpu);
2338     }
2339 }
2340 
2341 void kvm_arch_do_init_vcpu(X86CPU *cpu)
2342 {
2343     CPUX86State *env = &cpu->env;
2344 
2345     /* APs get directly into wait-for-SIPI state.  */
2346     if (env->mp_state == KVM_MP_STATE_UNINITIALIZED) {
2347         env->mp_state = KVM_MP_STATE_INIT_RECEIVED;
2348     }
2349 }
2350 
2351 static int kvm_get_supported_feature_msrs(KVMState *s)
2352 {
2353     int ret = 0;
2354 
2355     if (kvm_feature_msrs != NULL) {
2356         return 0;
2357     }
2358 
2359     if (!kvm_check_extension(s, KVM_CAP_GET_MSR_FEATURES)) {
2360         return 0;
2361     }
2362 
2363     struct kvm_msr_list msr_list;
2364 
2365     msr_list.nmsrs = 0;
2366     ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, &msr_list);
2367     if (ret < 0 && ret != -E2BIG) {
2368         error_report("Fetch KVM feature MSR list failed: %s",
2369             strerror(-ret));
2370         return ret;
2371     }
2372 
2373     assert(msr_list.nmsrs > 0);
2374     kvm_feature_msrs = g_malloc0(sizeof(msr_list) +
2375                  msr_list.nmsrs * sizeof(msr_list.indices[0]));
2376 
2377     kvm_feature_msrs->nmsrs = msr_list.nmsrs;
2378     ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, kvm_feature_msrs);
2379 
2380     if (ret < 0) {
2381         error_report("Fetch KVM feature MSR list failed: %s",
2382             strerror(-ret));
2383         g_free(kvm_feature_msrs);
2384         kvm_feature_msrs = NULL;
2385         return ret;
2386     }
2387 
2388     return 0;
2389 }
2390 
2391 static int kvm_get_supported_msrs(KVMState *s)
2392 {
2393     int ret = 0;
2394     struct kvm_msr_list msr_list, *kvm_msr_list;
2395 
2396     /*
2397      *  Obtain MSR list from KVM.  These are the MSRs that we must
2398      *  save/restore.
2399      */
2400     msr_list.nmsrs = 0;
2401     ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, &msr_list);
2402     if (ret < 0 && ret != -E2BIG) {
2403         return ret;
2404     }
2405     /*
2406      * Old kernel modules had a bug and could write beyond the provided
2407      * memory. Allocate at least a safe amount of 1K.
2408      */
2409     kvm_msr_list = g_malloc0(MAX(1024, sizeof(msr_list) +
2410                                           msr_list.nmsrs *
2411                                           sizeof(msr_list.indices[0])));
2412 
2413     kvm_msr_list->nmsrs = msr_list.nmsrs;
2414     ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
2415     if (ret >= 0) {
2416         int i;
2417 
2418         for (i = 0; i < kvm_msr_list->nmsrs; i++) {
2419             switch (kvm_msr_list->indices[i]) {
2420             case MSR_STAR:
2421                 has_msr_star = true;
2422                 break;
2423             case MSR_VM_HSAVE_PA:
2424                 has_msr_hsave_pa = true;
2425                 break;
2426             case MSR_TSC_AUX:
2427                 has_msr_tsc_aux = true;
2428                 break;
2429             case MSR_TSC_ADJUST:
2430                 has_msr_tsc_adjust = true;
2431                 break;
2432             case MSR_IA32_TSCDEADLINE:
2433                 has_msr_tsc_deadline = true;
2434                 break;
2435             case MSR_IA32_SMBASE:
2436                 has_msr_smbase = true;
2437                 break;
2438             case MSR_SMI_COUNT:
2439                 has_msr_smi_count = true;
2440                 break;
2441             case MSR_IA32_MISC_ENABLE:
2442                 has_msr_misc_enable = true;
2443                 break;
2444             case MSR_IA32_BNDCFGS:
2445                 has_msr_bndcfgs = true;
2446                 break;
2447             case MSR_IA32_XSS:
2448                 has_msr_xss = true;
2449                 break;
2450             case MSR_IA32_UMWAIT_CONTROL:
2451                 has_msr_umwait = true;
2452                 break;
2453             case HV_X64_MSR_CRASH_CTL:
2454                 has_msr_hv_crash = true;
2455                 break;
2456             case HV_X64_MSR_RESET:
2457                 has_msr_hv_reset = true;
2458                 break;
2459             case HV_X64_MSR_VP_INDEX:
2460                 has_msr_hv_vpindex = true;
2461                 break;
2462             case HV_X64_MSR_VP_RUNTIME:
2463                 has_msr_hv_runtime = true;
2464                 break;
2465             case HV_X64_MSR_SCONTROL:
2466                 has_msr_hv_synic = true;
2467                 break;
2468             case HV_X64_MSR_STIMER0_CONFIG:
2469                 has_msr_hv_stimer = true;
2470                 break;
2471             case HV_X64_MSR_TSC_FREQUENCY:
2472                 has_msr_hv_frequencies = true;
2473                 break;
2474             case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
2475                 has_msr_hv_reenlightenment = true;
2476                 break;
2477             case HV_X64_MSR_SYNDBG_OPTIONS:
2478                 has_msr_hv_syndbg_options = true;
2479                 break;
2480             case MSR_IA32_SPEC_CTRL:
2481                 has_msr_spec_ctrl = true;
2482                 break;
2483             case MSR_AMD64_TSC_RATIO:
2484                 has_tsc_scale_msr = true;
2485                 break;
2486             case MSR_IA32_TSX_CTRL:
2487                 has_msr_tsx_ctrl = true;
2488                 break;
2489             case MSR_VIRT_SSBD:
2490                 has_msr_virt_ssbd = true;
2491                 break;
2492             case MSR_IA32_ARCH_CAPABILITIES:
2493                 has_msr_arch_capabs = true;
2494                 break;
2495             case MSR_IA32_CORE_CAPABILITY:
2496                 has_msr_core_capabs = true;
2497                 break;
2498             case MSR_IA32_PERF_CAPABILITIES:
2499                 has_msr_perf_capabs = true;
2500                 break;
2501             case MSR_IA32_VMX_VMFUNC:
2502                 has_msr_vmx_vmfunc = true;
2503                 break;
2504             case MSR_IA32_UCODE_REV:
2505                 has_msr_ucode_rev = true;
2506                 break;
2507             case MSR_IA32_VMX_PROCBASED_CTLS2:
2508                 has_msr_vmx_procbased_ctls2 = true;
2509                 break;
2510             case MSR_IA32_PKRS:
2511                 has_msr_pkrs = true;
2512                 break;
2513             }
2514         }
2515     }
2516 
2517     g_free(kvm_msr_list);
2518 
2519     return ret;
2520 }
2521 
2522 static bool kvm_rdmsr_core_thread_count(X86CPU *cpu, uint32_t msr,
2523                                         uint64_t *val)
2524 {
2525     CPUState *cs = CPU(cpu);
2526 
2527     *val = cs->nr_threads * cs->nr_cores; /* thread count, bits 15..0 */
2528     *val |= ((uint32_t)cs->nr_cores << 16); /* core count, bits 31..16 */
2529 
2530     return true;
2531 }
2532 
2533 static Notifier smram_machine_done;
2534 static KVMMemoryListener smram_listener;
2535 static AddressSpace smram_address_space;
2536 static MemoryRegion smram_as_root;
2537 static MemoryRegion smram_as_mem;
2538 
2539 static void register_smram_listener(Notifier *n, void *unused)
2540 {
2541     MemoryRegion *smram =
2542         (MemoryRegion *) object_resolve_path("/machine/smram", NULL);
2543 
2544     /* Outer container... */
2545     memory_region_init(&smram_as_root, OBJECT(kvm_state), "mem-container-smram", ~0ull);
2546     memory_region_set_enabled(&smram_as_root, true);
2547 
2548     /* ... with two regions inside: normal system memory with low
2549      * priority, and...
2550      */
2551     memory_region_init_alias(&smram_as_mem, OBJECT(kvm_state), "mem-smram",
2552                              get_system_memory(), 0, ~0ull);
2553     memory_region_add_subregion_overlap(&smram_as_root, 0, &smram_as_mem, 0);
2554     memory_region_set_enabled(&smram_as_mem, true);
2555 
2556     if (smram) {
2557         /* ... SMRAM with higher priority */
2558         memory_region_add_subregion_overlap(&smram_as_root, 0, smram, 10);
2559         memory_region_set_enabled(smram, true);
2560     }
2561 
2562     address_space_init(&smram_address_space, &smram_as_root, "KVM-SMRAM");
2563     kvm_memory_listener_register(kvm_state, &smram_listener,
2564                                  &smram_address_space, 1, "kvm-smram");
2565 }
2566 
2567 int kvm_arch_get_default_type(MachineState *ms)
2568 {
2569     return 0;
2570 }
2571 
2572 int kvm_arch_init(MachineState *ms, KVMState *s)
2573 {
2574     uint64_t identity_base = 0xfffbc000;
2575     uint64_t shadow_mem;
2576     int ret;
2577     struct utsname utsname;
2578     Error *local_err = NULL;
2579 
2580     /*
2581      * Initialize SEV context, if required
2582      *
2583      * If no memory encryption is requested (ms->cgs == NULL) this is
2584      * a no-op.
2585      *
2586      * It's also a no-op if a non-SEV confidential guest support
2587      * mechanism is selected.  SEV is the only mechanism available to
2588      * select on x86 at present, so this doesn't arise, but if new
2589      * mechanisms are supported in future (e.g. TDX), they'll need
2590      * their own initialization either here or elsewhere.
2591      */
2592     if (ms->cgs) {
2593         ret = confidential_guest_kvm_init(ms->cgs, &local_err);
2594         if (ret < 0) {
2595             error_report_err(local_err);
2596             return ret;
2597         }
2598     }
2599 
2600     has_xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
2601     has_sregs2 = kvm_check_extension(s, KVM_CAP_SREGS2) > 0;
2602 
2603     hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX);
2604 
2605     has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD);
2606     if (has_exception_payload) {
2607         ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true);
2608         if (ret < 0) {
2609             error_report("kvm: Failed to enable exception payload cap: %s",
2610                          strerror(-ret));
2611             return ret;
2612         }
2613     }
2614 
2615     has_triple_fault_event = kvm_check_extension(s, KVM_CAP_X86_TRIPLE_FAULT_EVENT);
2616     if (has_triple_fault_event) {
2617         ret = kvm_vm_enable_cap(s, KVM_CAP_X86_TRIPLE_FAULT_EVENT, 0, true);
2618         if (ret < 0) {
2619             error_report("kvm: Failed to enable triple fault event cap: %s",
2620                          strerror(-ret));
2621             return ret;
2622         }
2623     }
2624 
2625     if (s->xen_version) {
2626 #ifdef CONFIG_XEN_EMU
2627         if (!object_dynamic_cast(OBJECT(ms), TYPE_PC_MACHINE)) {
2628             error_report("kvm: Xen support only available in PC machine");
2629             return -ENOTSUP;
2630         }
2631         /* hyperv_enabled() doesn't work yet. */
2632         uint32_t msr = XEN_HYPERCALL_MSR;
2633         ret = kvm_xen_init(s, msr);
2634         if (ret < 0) {
2635             return ret;
2636         }
2637 #else
2638         error_report("kvm: Xen support not enabled in qemu");
2639         return -ENOTSUP;
2640 #endif
2641     }
2642 
2643     ret = kvm_get_supported_msrs(s);
2644     if (ret < 0) {
2645         return ret;
2646     }
2647 
2648     kvm_get_supported_feature_msrs(s);
2649 
2650     uname(&utsname);
2651     lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;
2652 
2653     /*
2654      * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly.
2655      * In order to use vm86 mode, an EPT identity map and a TSS  are needed.
2656      * Since these must be part of guest physical memory, we need to allocate
2657      * them, both by setting their start addresses in the kernel and by
2658      * creating a corresponding e820 entry. We need 4 pages before the BIOS,
2659      * so this value allows up to 16M BIOSes.
2660      */
2661     identity_base = 0xfeffc000;
2662     ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base);
2663     if (ret < 0) {
2664         return ret;
2665     }
2666 
2667     /* Set TSS base one page after EPT identity map. */
2668     ret = kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, identity_base + 0x1000);
2669     if (ret < 0) {
2670         return ret;
2671     }
2672 
2673     /* Tell fw_cfg to notify the BIOS to reserve the range. */
2674     ret = e820_add_entry(identity_base, 0x4000, E820_RESERVED);
2675     if (ret < 0) {
2676         fprintf(stderr, "e820_add_entry() table is full\n");
2677         return ret;
2678     }
2679 
2680     shadow_mem = object_property_get_int(OBJECT(s), "kvm-shadow-mem", &error_abort);
2681     if (shadow_mem != -1) {
2682         shadow_mem /= 4096;
2683         ret = kvm_vm_ioctl(s, KVM_SET_NR_MMU_PAGES, shadow_mem);
2684         if (ret < 0) {
2685             return ret;
2686         }
2687     }
2688 
2689     if (kvm_check_extension(s, KVM_CAP_X86_SMM) &&
2690         object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE) &&
2691         x86_machine_is_smm_enabled(X86_MACHINE(ms))) {
2692         smram_machine_done.notify = register_smram_listener;
2693         qemu_add_machine_init_done_notifier(&smram_machine_done);
2694     }
2695 
2696     if (enable_cpu_pm) {
2697         int disable_exits = kvm_check_extension(s, KVM_CAP_X86_DISABLE_EXITS);
2698 /* Work around for kernel header with a typo. TODO: fix header and drop. */
2699 #if defined(KVM_X86_DISABLE_EXITS_HTL) && !defined(KVM_X86_DISABLE_EXITS_HLT)
2700 #define KVM_X86_DISABLE_EXITS_HLT KVM_X86_DISABLE_EXITS_HTL
2701 #endif
2702         if (disable_exits) {
2703             disable_exits &= (KVM_X86_DISABLE_EXITS_MWAIT |
2704                               KVM_X86_DISABLE_EXITS_HLT |
2705                               KVM_X86_DISABLE_EXITS_PAUSE |
2706                               KVM_X86_DISABLE_EXITS_CSTATE);
2707         }
2708 
2709         ret = kvm_vm_enable_cap(s, KVM_CAP_X86_DISABLE_EXITS, 0,
2710                                 disable_exits);
2711         if (ret < 0) {
2712             error_report("kvm: guest stopping CPU not supported: %s",
2713                          strerror(-ret));
2714         }
2715     }
2716 
2717     if (object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE)) {
2718         X86MachineState *x86ms = X86_MACHINE(ms);
2719 
2720         if (x86ms->bus_lock_ratelimit > 0) {
2721             ret = kvm_check_extension(s, KVM_CAP_X86_BUS_LOCK_EXIT);
2722             if (!(ret & KVM_BUS_LOCK_DETECTION_EXIT)) {
2723                 error_report("kvm: bus lock detection unsupported");
2724                 return -ENOTSUP;
2725             }
2726             ret = kvm_vm_enable_cap(s, KVM_CAP_X86_BUS_LOCK_EXIT, 0,
2727                                     KVM_BUS_LOCK_DETECTION_EXIT);
2728             if (ret < 0) {
2729                 error_report("kvm: Failed to enable bus lock detection cap: %s",
2730                              strerror(-ret));
2731                 return ret;
2732             }
2733             ratelimit_init(&bus_lock_ratelimit_ctrl);
2734             ratelimit_set_speed(&bus_lock_ratelimit_ctrl,
2735                                 x86ms->bus_lock_ratelimit, BUS_LOCK_SLICE_TIME);
2736         }
2737     }
2738 
2739     if (s->notify_vmexit != NOTIFY_VMEXIT_OPTION_DISABLE &&
2740         kvm_check_extension(s, KVM_CAP_X86_NOTIFY_VMEXIT)) {
2741             uint64_t notify_window_flags =
2742                 ((uint64_t)s->notify_window << 32) |
2743                 KVM_X86_NOTIFY_VMEXIT_ENABLED |
2744                 KVM_X86_NOTIFY_VMEXIT_USER;
2745             ret = kvm_vm_enable_cap(s, KVM_CAP_X86_NOTIFY_VMEXIT, 0,
2746                                     notify_window_flags);
2747             if (ret < 0) {
2748                 error_report("kvm: Failed to enable notify vmexit cap: %s",
2749                              strerror(-ret));
2750                 return ret;
2751             }
2752     }
2753     if (kvm_vm_check_extension(s, KVM_CAP_X86_USER_SPACE_MSR)) {
2754         bool r;
2755 
2756         ret = kvm_vm_enable_cap(s, KVM_CAP_X86_USER_SPACE_MSR, 0,
2757                                 KVM_MSR_EXIT_REASON_FILTER);
2758         if (ret) {
2759             error_report("Could not enable user space MSRs: %s",
2760                          strerror(-ret));
2761             exit(1);
2762         }
2763 
2764         r = kvm_filter_msr(s, MSR_CORE_THREAD_COUNT,
2765                            kvm_rdmsr_core_thread_count, NULL);
2766         if (!r) {
2767             error_report("Could not install MSR_CORE_THREAD_COUNT handler: %s",
2768                          strerror(-ret));
2769             exit(1);
2770         }
2771     }
2772 
2773     return 0;
2774 }
2775 
2776 static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
2777 {
2778     lhs->selector = rhs->selector;
2779     lhs->base = rhs->base;
2780     lhs->limit = rhs->limit;
2781     lhs->type = 3;
2782     lhs->present = 1;
2783     lhs->dpl = 3;
2784     lhs->db = 0;
2785     lhs->s = 1;
2786     lhs->l = 0;
2787     lhs->g = 0;
2788     lhs->avl = 0;
2789     lhs->unusable = 0;
2790 }
2791 
2792 static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
2793 {
2794     unsigned flags = rhs->flags;
2795     lhs->selector = rhs->selector;
2796     lhs->base = rhs->base;
2797     lhs->limit = rhs->limit;
2798     lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
2799     lhs->present = (flags & DESC_P_MASK) != 0;
2800     lhs->dpl = (flags >> DESC_DPL_SHIFT) & 3;
2801     lhs->db = (flags >> DESC_B_SHIFT) & 1;
2802     lhs->s = (flags & DESC_S_MASK) != 0;
2803     lhs->l = (flags >> DESC_L_SHIFT) & 1;
2804     lhs->g = (flags & DESC_G_MASK) != 0;
2805     lhs->avl = (flags & DESC_AVL_MASK) != 0;
2806     lhs->unusable = !lhs->present;
2807     lhs->padding = 0;
2808 }
2809 
2810 static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
2811 {
2812     lhs->selector = rhs->selector;
2813     lhs->base = rhs->base;
2814     lhs->limit = rhs->limit;
2815     lhs->flags = (rhs->type << DESC_TYPE_SHIFT) |
2816                  ((rhs->present && !rhs->unusable) * DESC_P_MASK) |
2817                  (rhs->dpl << DESC_DPL_SHIFT) |
2818                  (rhs->db << DESC_B_SHIFT) |
2819                  (rhs->s * DESC_S_MASK) |
2820                  (rhs->l << DESC_L_SHIFT) |
2821                  (rhs->g * DESC_G_MASK) |
2822                  (rhs->avl * DESC_AVL_MASK);
2823 }
2824 
2825 static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set)
2826 {
2827     if (set) {
2828         *kvm_reg = *qemu_reg;
2829     } else {
2830         *qemu_reg = *kvm_reg;
2831     }
2832 }
2833 
2834 static int kvm_getput_regs(X86CPU *cpu, int set)
2835 {
2836     CPUX86State *env = &cpu->env;
2837     struct kvm_regs regs;
2838     int ret = 0;
2839 
2840     if (!set) {
2841         ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_REGS, &regs);
2842         if (ret < 0) {
2843             return ret;
2844         }
2845     }
2846 
2847     kvm_getput_reg(&regs.rax, &env->regs[R_EAX], set);
2848     kvm_getput_reg(&regs.rbx, &env->regs[R_EBX], set);
2849     kvm_getput_reg(&regs.rcx, &env->regs[R_ECX], set);
2850     kvm_getput_reg(&regs.rdx, &env->regs[R_EDX], set);
2851     kvm_getput_reg(&regs.rsi, &env->regs[R_ESI], set);
2852     kvm_getput_reg(&regs.rdi, &env->regs[R_EDI], set);
2853     kvm_getput_reg(&regs.rsp, &env->regs[R_ESP], set);
2854     kvm_getput_reg(&regs.rbp, &env->regs[R_EBP], set);
2855 #ifdef TARGET_X86_64
2856     kvm_getput_reg(&regs.r8, &env->regs[8], set);
2857     kvm_getput_reg(&regs.r9, &env->regs[9], set);
2858     kvm_getput_reg(&regs.r10, &env->regs[10], set);
2859     kvm_getput_reg(&regs.r11, &env->regs[11], set);
2860     kvm_getput_reg(&regs.r12, &env->regs[12], set);
2861     kvm_getput_reg(&regs.r13, &env->regs[13], set);
2862     kvm_getput_reg(&regs.r14, &env->regs[14], set);
2863     kvm_getput_reg(&regs.r15, &env->regs[15], set);
2864 #endif
2865 
2866     kvm_getput_reg(&regs.rflags, &env->eflags, set);
2867     kvm_getput_reg(&regs.rip, &env->eip, set);
2868 
2869     if (set) {
2870         ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_REGS, &regs);
2871     }
2872 
2873     return ret;
2874 }
2875 
2876 static int kvm_put_xsave(X86CPU *cpu)
2877 {
2878     CPUX86State *env = &cpu->env;
2879     void *xsave = env->xsave_buf;
2880 
2881     x86_cpu_xsave_all_areas(cpu, xsave, env->xsave_buf_len);
2882 
2883     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave);
2884 }
2885 
2886 static int kvm_put_xcrs(X86CPU *cpu)
2887 {
2888     CPUX86State *env = &cpu->env;
2889     struct kvm_xcrs xcrs = {};
2890 
2891     if (!has_xcrs) {
2892         return 0;
2893     }
2894 
2895     xcrs.nr_xcrs = 1;
2896     xcrs.flags = 0;
2897     xcrs.xcrs[0].xcr = 0;
2898     xcrs.xcrs[0].value = env->xcr0;
2899     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XCRS, &xcrs);
2900 }
2901 
2902 static int kvm_put_sregs(X86CPU *cpu)
2903 {
2904     CPUX86State *env = &cpu->env;
2905     struct kvm_sregs sregs;
2906 
2907     /*
2908      * The interrupt_bitmap is ignored because KVM_SET_SREGS is
2909      * always followed by KVM_SET_VCPU_EVENTS.
2910      */
2911     memset(sregs.interrupt_bitmap, 0, sizeof(sregs.interrupt_bitmap));
2912 
2913     if ((env->eflags & VM_MASK)) {
2914         set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
2915         set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
2916         set_v8086_seg(&sregs.es, &env->segs[R_ES]);
2917         set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
2918         set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
2919         set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
2920     } else {
2921         set_seg(&sregs.cs, &env->segs[R_CS]);
2922         set_seg(&sregs.ds, &env->segs[R_DS]);
2923         set_seg(&sregs.es, &env->segs[R_ES]);
2924         set_seg(&sregs.fs, &env->segs[R_FS]);
2925         set_seg(&sregs.gs, &env->segs[R_GS]);
2926         set_seg(&sregs.ss, &env->segs[R_SS]);
2927     }
2928 
2929     set_seg(&sregs.tr, &env->tr);
2930     set_seg(&sregs.ldt, &env->ldt);
2931 
2932     sregs.idt.limit = env->idt.limit;
2933     sregs.idt.base = env->idt.base;
2934     memset(sregs.idt.padding, 0, sizeof sregs.idt.padding);
2935     sregs.gdt.limit = env->gdt.limit;
2936     sregs.gdt.base = env->gdt.base;
2937     memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding);
2938 
2939     sregs.cr0 = env->cr[0];
2940     sregs.cr2 = env->cr[2];
2941     sregs.cr3 = env->cr[3];
2942     sregs.cr4 = env->cr[4];
2943 
2944     sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state);
2945     sregs.apic_base = cpu_get_apic_base(cpu->apic_state);
2946 
2947     sregs.efer = env->efer;
2948 
2949     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS, &sregs);
2950 }
2951 
2952 static int kvm_put_sregs2(X86CPU *cpu)
2953 {
2954     CPUX86State *env = &cpu->env;
2955     struct kvm_sregs2 sregs;
2956     int i;
2957 
2958     sregs.flags = 0;
2959 
2960     if ((env->eflags & VM_MASK)) {
2961         set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
2962         set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
2963         set_v8086_seg(&sregs.es, &env->segs[R_ES]);
2964         set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
2965         set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
2966         set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
2967     } else {
2968         set_seg(&sregs.cs, &env->segs[R_CS]);
2969         set_seg(&sregs.ds, &env->segs[R_DS]);
2970         set_seg(&sregs.es, &env->segs[R_ES]);
2971         set_seg(&sregs.fs, &env->segs[R_FS]);
2972         set_seg(&sregs.gs, &env->segs[R_GS]);
2973         set_seg(&sregs.ss, &env->segs[R_SS]);
2974     }
2975 
2976     set_seg(&sregs.tr, &env->tr);
2977     set_seg(&sregs.ldt, &env->ldt);
2978 
2979     sregs.idt.limit = env->idt.limit;
2980     sregs.idt.base = env->idt.base;
2981     memset(sregs.idt.padding, 0, sizeof sregs.idt.padding);
2982     sregs.gdt.limit = env->gdt.limit;
2983     sregs.gdt.base = env->gdt.base;
2984     memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding);
2985 
2986     sregs.cr0 = env->cr[0];
2987     sregs.cr2 = env->cr[2];
2988     sregs.cr3 = env->cr[3];
2989     sregs.cr4 = env->cr[4];
2990 
2991     sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state);
2992     sregs.apic_base = cpu_get_apic_base(cpu->apic_state);
2993 
2994     sregs.efer = env->efer;
2995 
2996     if (env->pdptrs_valid) {
2997         for (i = 0; i < 4; i++) {
2998             sregs.pdptrs[i] = env->pdptrs[i];
2999         }
3000         sregs.flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
3001     }
3002 
3003     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS2, &sregs);
3004 }
3005 
3006 
3007 static void kvm_msr_buf_reset(X86CPU *cpu)
3008 {
3009     memset(cpu->kvm_msr_buf, 0, MSR_BUF_SIZE);
3010 }
3011 
3012 static void kvm_msr_entry_add(X86CPU *cpu, uint32_t index, uint64_t value)
3013 {
3014     struct kvm_msrs *msrs = cpu->kvm_msr_buf;
3015     void *limit = ((void *)msrs) + MSR_BUF_SIZE;
3016     struct kvm_msr_entry *entry = &msrs->entries[msrs->nmsrs];
3017 
3018     assert((void *)(entry + 1) <= limit);
3019 
3020     entry->index = index;
3021     entry->reserved = 0;
3022     entry->data = value;
3023     msrs->nmsrs++;
3024 }
3025 
3026 static int kvm_put_one_msr(X86CPU *cpu, int index, uint64_t value)
3027 {
3028     kvm_msr_buf_reset(cpu);
3029     kvm_msr_entry_add(cpu, index, value);
3030 
3031     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
3032 }
3033 
3034 static int kvm_get_one_msr(X86CPU *cpu, int index, uint64_t *value)
3035 {
3036     int ret;
3037     struct {
3038         struct kvm_msrs info;
3039         struct kvm_msr_entry entries[1];
3040     } msr_data = {
3041         .info.nmsrs = 1,
3042         .entries[0].index = index,
3043     };
3044 
3045     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data);
3046     if (ret < 0) {
3047         return ret;
3048     }
3049     assert(ret == 1);
3050     *value = msr_data.entries[0].data;
3051     return ret;
3052 }
3053 void kvm_put_apicbase(X86CPU *cpu, uint64_t value)
3054 {
3055     int ret;
3056 
3057     ret = kvm_put_one_msr(cpu, MSR_IA32_APICBASE, value);
3058     assert(ret == 1);
3059 }
3060 
3061 static int kvm_put_tscdeadline_msr(X86CPU *cpu)
3062 {
3063     CPUX86State *env = &cpu->env;
3064     int ret;
3065 
3066     if (!has_msr_tsc_deadline) {
3067         return 0;
3068     }
3069 
3070     ret = kvm_put_one_msr(cpu, MSR_IA32_TSCDEADLINE, env->tsc_deadline);
3071     if (ret < 0) {
3072         return ret;
3073     }
3074 
3075     assert(ret == 1);
3076     return 0;
3077 }
3078 
3079 /*
3080  * Provide a separate write service for the feature control MSR in order to
3081  * kick the VCPU out of VMXON or even guest mode on reset. This has to be done
3082  * before writing any other state because forcibly leaving nested mode
3083  * invalidates the VCPU state.
3084  */
3085 static int kvm_put_msr_feature_control(X86CPU *cpu)
3086 {
3087     int ret;
3088 
3089     if (!has_msr_feature_control) {
3090         return 0;
3091     }
3092 
3093     ret = kvm_put_one_msr(cpu, MSR_IA32_FEATURE_CONTROL,
3094                           cpu->env.msr_ia32_feature_control);
3095     if (ret < 0) {
3096         return ret;
3097     }
3098 
3099     assert(ret == 1);
3100     return 0;
3101 }
3102 
3103 static uint64_t make_vmx_msr_value(uint32_t index, uint32_t features)
3104 {
3105     uint32_t default1, can_be_one, can_be_zero;
3106     uint32_t must_be_one;
3107 
3108     switch (index) {
3109     case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
3110         default1 = 0x00000016;
3111         break;
3112     case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
3113         default1 = 0x0401e172;
3114         break;
3115     case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
3116         default1 = 0x000011ff;
3117         break;
3118     case MSR_IA32_VMX_TRUE_EXIT_CTLS:
3119         default1 = 0x00036dff;
3120         break;
3121     case MSR_IA32_VMX_PROCBASED_CTLS2:
3122         default1 = 0;
3123         break;
3124     default:
3125         abort();
3126     }
3127 
3128     /* If a feature bit is set, the control can be either set or clear.
3129      * Otherwise the value is limited to either 0 or 1 by default1.
3130      */
3131     can_be_one = features | default1;
3132     can_be_zero = features | ~default1;
3133     must_be_one = ~can_be_zero;
3134 
3135     /*
3136      * Bit 0:31 -> 0 if the control bit can be zero (i.e. 1 if it must be one).
3137      * Bit 32:63 -> 1 if the control bit can be one.
3138      */
3139     return must_be_one | (((uint64_t)can_be_one) << 32);
3140 }
3141 
3142 static void kvm_msr_entry_add_vmx(X86CPU *cpu, FeatureWordArray f)
3143 {
3144     uint64_t kvm_vmx_basic =
3145         kvm_arch_get_supported_msr_feature(kvm_state,
3146                                            MSR_IA32_VMX_BASIC);
3147 
3148     if (!kvm_vmx_basic) {
3149         /* If the kernel doesn't support VMX feature (kvm_intel.nested=0),
3150          * then kvm_vmx_basic will be 0 and KVM_SET_MSR will fail.
3151          */
3152         return;
3153     }
3154 
3155     uint64_t kvm_vmx_misc =
3156         kvm_arch_get_supported_msr_feature(kvm_state,
3157                                            MSR_IA32_VMX_MISC);
3158     uint64_t kvm_vmx_ept_vpid =
3159         kvm_arch_get_supported_msr_feature(kvm_state,
3160                                            MSR_IA32_VMX_EPT_VPID_CAP);
3161 
3162     /*
3163      * If the guest is 64-bit, a value of 1 is allowed for the host address
3164      * space size vmexit control.
3165      */
3166     uint64_t fixed_vmx_exit = f[FEAT_8000_0001_EDX] & CPUID_EXT2_LM
3167         ? (uint64_t)VMX_VM_EXIT_HOST_ADDR_SPACE_SIZE << 32 : 0;
3168 
3169     /*
3170      * Bits 0-30, 32-44 and 50-53 come from the host.  KVM should
3171      * not change them for backwards compatibility.
3172      */
3173     uint64_t fixed_vmx_basic = kvm_vmx_basic &
3174         (MSR_VMX_BASIC_VMCS_REVISION_MASK |
3175          MSR_VMX_BASIC_VMXON_REGION_SIZE_MASK |
3176          MSR_VMX_BASIC_VMCS_MEM_TYPE_MASK);
3177 
3178     /*
3179      * Same for bits 0-4 and 25-27.  Bits 16-24 (CR3 target count) can
3180      * change in the future but are always zero for now, clear them to be
3181      * future proof.  Bits 32-63 in theory could change, though KVM does
3182      * not support dual-monitor treatment and probably never will; mask
3183      * them out as well.
3184      */
3185     uint64_t fixed_vmx_misc = kvm_vmx_misc &
3186         (MSR_VMX_MISC_PREEMPTION_TIMER_SHIFT_MASK |
3187          MSR_VMX_MISC_MAX_MSR_LIST_SIZE_MASK);
3188 
3189     /*
3190      * EPT memory types should not change either, so we do not bother
3191      * adding features for them.
3192      */
3193     uint64_t fixed_vmx_ept_mask =
3194             (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_ENABLE_EPT ?
3195              MSR_VMX_EPT_UC | MSR_VMX_EPT_WB : 0);
3196     uint64_t fixed_vmx_ept_vpid = kvm_vmx_ept_vpid & fixed_vmx_ept_mask;
3197 
3198     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
3199                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
3200                                          f[FEAT_VMX_PROCBASED_CTLS]));
3201     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PINBASED_CTLS,
3202                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_PINBASED_CTLS,
3203                                          f[FEAT_VMX_PINBASED_CTLS]));
3204     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_EXIT_CTLS,
3205                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_EXIT_CTLS,
3206                                          f[FEAT_VMX_EXIT_CTLS]) | fixed_vmx_exit);
3207     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_ENTRY_CTLS,
3208                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_ENTRY_CTLS,
3209                                          f[FEAT_VMX_ENTRY_CTLS]));
3210     kvm_msr_entry_add(cpu, MSR_IA32_VMX_PROCBASED_CTLS2,
3211                       make_vmx_msr_value(MSR_IA32_VMX_PROCBASED_CTLS2,
3212                                          f[FEAT_VMX_SECONDARY_CTLS]));
3213     kvm_msr_entry_add(cpu, MSR_IA32_VMX_EPT_VPID_CAP,
3214                       f[FEAT_VMX_EPT_VPID_CAPS] | fixed_vmx_ept_vpid);
3215     kvm_msr_entry_add(cpu, MSR_IA32_VMX_BASIC,
3216                       f[FEAT_VMX_BASIC] | fixed_vmx_basic);
3217     kvm_msr_entry_add(cpu, MSR_IA32_VMX_MISC,
3218                       f[FEAT_VMX_MISC] | fixed_vmx_misc);
3219     if (has_msr_vmx_vmfunc) {
3220         kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMFUNC, f[FEAT_VMX_VMFUNC]);
3221     }
3222 
3223     /*
3224      * Just to be safe, write these with constant values.  The CRn_FIXED1
3225      * MSRs are generated by KVM based on the vCPU's CPUID.
3226      */
3227     kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR0_FIXED0,
3228                       CR0_PE_MASK | CR0_PG_MASK | CR0_NE_MASK);
3229     kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR4_FIXED0,
3230                       CR4_VMXE_MASK);
3231 
3232     if (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_TSC_SCALING) {
3233         /* TSC multiplier (0x2032).  */
3234         kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x32);
3235     } else {
3236         /* Preemption timer (0x482E).  */
3237         kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x2E);
3238     }
3239 }
3240 
3241 static void kvm_msr_entry_add_perf(X86CPU *cpu, FeatureWordArray f)
3242 {
3243     uint64_t kvm_perf_cap =
3244         kvm_arch_get_supported_msr_feature(kvm_state,
3245                                            MSR_IA32_PERF_CAPABILITIES);
3246 
3247     if (kvm_perf_cap) {
3248         kvm_msr_entry_add(cpu, MSR_IA32_PERF_CAPABILITIES,
3249                         kvm_perf_cap & f[FEAT_PERF_CAPABILITIES]);
3250     }
3251 }
3252 
3253 static int kvm_buf_set_msrs(X86CPU *cpu)
3254 {
3255     int ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
3256     if (ret < 0) {
3257         return ret;
3258     }
3259 
3260     if (ret < cpu->kvm_msr_buf->nmsrs) {
3261         struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
3262         error_report("error: failed to set MSR 0x%" PRIx32 " to 0x%" PRIx64,
3263                      (uint32_t)e->index, (uint64_t)e->data);
3264     }
3265 
3266     assert(ret == cpu->kvm_msr_buf->nmsrs);
3267     return 0;
3268 }
3269 
3270 static void kvm_init_msrs(X86CPU *cpu)
3271 {
3272     CPUX86State *env = &cpu->env;
3273 
3274     kvm_msr_buf_reset(cpu);
3275     if (has_msr_arch_capabs) {
3276         kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES,
3277                           env->features[FEAT_ARCH_CAPABILITIES]);
3278     }
3279 
3280     if (has_msr_core_capabs) {
3281         kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY,
3282                           env->features[FEAT_CORE_CAPABILITY]);
3283     }
3284 
3285     if (has_msr_perf_capabs && cpu->enable_pmu) {
3286         kvm_msr_entry_add_perf(cpu, env->features);
3287     }
3288 
3289     if (has_msr_ucode_rev) {
3290         kvm_msr_entry_add(cpu, MSR_IA32_UCODE_REV, cpu->ucode_rev);
3291     }
3292 
3293     /*
3294      * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but
3295      * all kernels with MSR features should have them.
3296      */
3297     if (kvm_feature_msrs && cpu_has_vmx(env)) {
3298         kvm_msr_entry_add_vmx(cpu, env->features);
3299     }
3300 
3301     assert(kvm_buf_set_msrs(cpu) == 0);
3302 }
3303 
3304 static int kvm_put_msrs(X86CPU *cpu, int level)
3305 {
3306     CPUX86State *env = &cpu->env;
3307     int i;
3308 
3309     kvm_msr_buf_reset(cpu);
3310 
3311     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, env->sysenter_cs);
3312     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
3313     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
3314     kvm_msr_entry_add(cpu, MSR_PAT, env->pat);
3315     if (has_msr_star) {
3316         kvm_msr_entry_add(cpu, MSR_STAR, env->star);
3317     }
3318     if (has_msr_hsave_pa) {
3319         kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, env->vm_hsave);
3320     }
3321     if (has_msr_tsc_aux) {
3322         kvm_msr_entry_add(cpu, MSR_TSC_AUX, env->tsc_aux);
3323     }
3324     if (has_msr_tsc_adjust) {
3325         kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, env->tsc_adjust);
3326     }
3327     if (has_msr_misc_enable) {
3328         kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE,
3329                           env->msr_ia32_misc_enable);
3330     }
3331     if (has_msr_smbase) {
3332         kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, env->smbase);
3333     }
3334     if (has_msr_smi_count) {
3335         kvm_msr_entry_add(cpu, MSR_SMI_COUNT, env->msr_smi_count);
3336     }
3337     if (has_msr_pkrs) {
3338         kvm_msr_entry_add(cpu, MSR_IA32_PKRS, env->pkrs);
3339     }
3340     if (has_msr_bndcfgs) {
3341         kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, env->msr_bndcfgs);
3342     }
3343     if (has_msr_xss) {
3344         kvm_msr_entry_add(cpu, MSR_IA32_XSS, env->xss);
3345     }
3346     if (has_msr_umwait) {
3347         kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, env->umwait);
3348     }
3349     if (has_msr_spec_ctrl) {
3350         kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, env->spec_ctrl);
3351     }
3352     if (has_tsc_scale_msr) {
3353         kvm_msr_entry_add(cpu, MSR_AMD64_TSC_RATIO, env->amd_tsc_scale_msr);
3354     }
3355 
3356     if (has_msr_tsx_ctrl) {
3357         kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, env->tsx_ctrl);
3358     }
3359     if (has_msr_virt_ssbd) {
3360         kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, env->virt_ssbd);
3361     }
3362 
3363 #ifdef TARGET_X86_64
3364     if (lm_capable_kernel) {
3365         kvm_msr_entry_add(cpu, MSR_CSTAR, env->cstar);
3366         kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, env->kernelgsbase);
3367         kvm_msr_entry_add(cpu, MSR_FMASK, env->fmask);
3368         kvm_msr_entry_add(cpu, MSR_LSTAR, env->lstar);
3369     }
3370 #endif
3371 
3372     /*
3373      * The following MSRs have side effects on the guest or are too heavy
3374      * for normal writeback. Limit them to reset or full state updates.
3375      */
3376     if (level >= KVM_PUT_RESET_STATE) {
3377         kvm_msr_entry_add(cpu, MSR_IA32_TSC, env->tsc);
3378         kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, env->system_time_msr);
3379         kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
3380         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) {
3381             kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, env->async_pf_int_msr);
3382         }
3383         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
3384             kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, env->async_pf_en_msr);
3385         }
3386         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
3387             kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, env->pv_eoi_en_msr);
3388         }
3389         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
3390             kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, env->steal_time_msr);
3391         }
3392 
3393         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) {
3394             kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, env->poll_control_msr);
3395         }
3396 
3397         if (has_architectural_pmu_version > 0) {
3398             if (has_architectural_pmu_version > 1) {
3399                 /* Stop the counter.  */
3400                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
3401                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
3402             }
3403 
3404             /* Set the counter values.  */
3405             for (i = 0; i < num_architectural_pmu_fixed_counters; i++) {
3406                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i,
3407                                   env->msr_fixed_counters[i]);
3408             }
3409             for (i = 0; i < num_architectural_pmu_gp_counters; i++) {
3410                 kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i,
3411                                   env->msr_gp_counters[i]);
3412                 kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i,
3413                                   env->msr_gp_evtsel[i]);
3414             }
3415             if (has_architectural_pmu_version > 1) {
3416                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS,
3417                                   env->msr_global_status);
3418                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
3419                                   env->msr_global_ovf_ctrl);
3420 
3421                 /* Now start the PMU.  */
3422                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL,
3423                                   env->msr_fixed_ctr_ctrl);
3424                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL,
3425                                   env->msr_global_ctrl);
3426             }
3427         }
3428         /*
3429          * Hyper-V partition-wide MSRs: to avoid clearing them on cpu hot-add,
3430          * only sync them to KVM on the first cpu
3431          */
3432         if (current_cpu == first_cpu) {
3433             if (has_msr_hv_hypercall) {
3434                 kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID,
3435                                   env->msr_hv_guest_os_id);
3436                 kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL,
3437                                   env->msr_hv_hypercall);
3438             }
3439             if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) {
3440                 kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC,
3441                                   env->msr_hv_tsc);
3442             }
3443             if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) {
3444                 kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL,
3445                                   env->msr_hv_reenlightenment_control);
3446                 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL,
3447                                   env->msr_hv_tsc_emulation_control);
3448                 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS,
3449                                   env->msr_hv_tsc_emulation_status);
3450             }
3451 #ifdef CONFIG_SYNDBG
3452             if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG) &&
3453                 has_msr_hv_syndbg_options) {
3454                 kvm_msr_entry_add(cpu, HV_X64_MSR_SYNDBG_OPTIONS,
3455                                   hyperv_syndbg_query_options());
3456             }
3457 #endif
3458         }
3459         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) {
3460             kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE,
3461                               env->msr_hv_vapic);
3462         }
3463         if (has_msr_hv_crash) {
3464             int j;
3465 
3466             for (j = 0; j < HV_CRASH_PARAMS; j++)
3467                 kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j,
3468                                   env->msr_hv_crash_params[j]);
3469 
3470             kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_NOTIFY);
3471         }
3472         if (has_msr_hv_runtime) {
3473             kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, env->msr_hv_runtime);
3474         }
3475         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)
3476             && hv_vpindex_settable) {
3477             kvm_msr_entry_add(cpu, HV_X64_MSR_VP_INDEX,
3478                               hyperv_vp_index(CPU(cpu)));
3479         }
3480         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
3481             int j;
3482 
3483             kvm_msr_entry_add(cpu, HV_X64_MSR_SVERSION, HV_SYNIC_VERSION);
3484 
3485             kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL,
3486                               env->msr_hv_synic_control);
3487             kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP,
3488                               env->msr_hv_synic_evt_page);
3489             kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP,
3490                               env->msr_hv_synic_msg_page);
3491 
3492             for (j = 0; j < ARRAY_SIZE(env->msr_hv_synic_sint); j++) {
3493                 kvm_msr_entry_add(cpu, HV_X64_MSR_SINT0 + j,
3494                                   env->msr_hv_synic_sint[j]);
3495             }
3496         }
3497         if (has_msr_hv_stimer) {
3498             int j;
3499 
3500             for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_config); j++) {
3501                 kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_CONFIG + j * 2,
3502                                 env->msr_hv_stimer_config[j]);
3503             }
3504 
3505             for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_count); j++) {
3506                 kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_COUNT + j * 2,
3507                                 env->msr_hv_stimer_count[j]);
3508             }
3509         }
3510         if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
3511             uint64_t phys_mask = MAKE_64BIT_MASK(0, cpu->phys_bits);
3512 
3513             kvm_msr_entry_add(cpu, MSR_MTRRdefType, env->mtrr_deftype);
3514             kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, env->mtrr_fixed[0]);
3515             kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, env->mtrr_fixed[1]);
3516             kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, env->mtrr_fixed[2]);
3517             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, env->mtrr_fixed[3]);
3518             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, env->mtrr_fixed[4]);
3519             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, env->mtrr_fixed[5]);
3520             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, env->mtrr_fixed[6]);
3521             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, env->mtrr_fixed[7]);
3522             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, env->mtrr_fixed[8]);
3523             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, env->mtrr_fixed[9]);
3524             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, env->mtrr_fixed[10]);
3525             for (i = 0; i < MSR_MTRRcap_VCNT; i++) {
3526                 /* The CPU GPs if we write to a bit above the physical limit of
3527                  * the host CPU (and KVM emulates that)
3528                  */
3529                 uint64_t mask = env->mtrr_var[i].mask;
3530                 mask &= phys_mask;
3531 
3532                 kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i),
3533                                   env->mtrr_var[i].base);
3534                 kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), mask);
3535             }
3536         }
3537         if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
3538             int addr_num = kvm_arch_get_supported_cpuid(kvm_state,
3539                                                     0x14, 1, R_EAX) & 0x7;
3540 
3541             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL,
3542                             env->msr_rtit_ctrl);
3543             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS,
3544                             env->msr_rtit_status);
3545             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE,
3546                             env->msr_rtit_output_base);
3547             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK,
3548                             env->msr_rtit_output_mask);
3549             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH,
3550                             env->msr_rtit_cr3_match);
3551             for (i = 0; i < addr_num; i++) {
3552                 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i,
3553                             env->msr_rtit_addrs[i]);
3554             }
3555         }
3556 
3557         if (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_SGX_LC) {
3558             kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH0,
3559                               env->msr_ia32_sgxlepubkeyhash[0]);
3560             kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH1,
3561                               env->msr_ia32_sgxlepubkeyhash[1]);
3562             kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH2,
3563                               env->msr_ia32_sgxlepubkeyhash[2]);
3564             kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3,
3565                               env->msr_ia32_sgxlepubkeyhash[3]);
3566         }
3567 
3568         if (env->features[FEAT_XSAVE] & CPUID_D_1_EAX_XFD) {
3569             kvm_msr_entry_add(cpu, MSR_IA32_XFD,
3570                               env->msr_xfd);
3571             kvm_msr_entry_add(cpu, MSR_IA32_XFD_ERR,
3572                               env->msr_xfd_err);
3573         }
3574 
3575         if (kvm_enabled() && cpu->enable_pmu &&
3576             (env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_ARCH_LBR)) {
3577             uint64_t depth;
3578             int ret;
3579 
3580             /*
3581              * Only migrate Arch LBR states when the host Arch LBR depth
3582              * equals that of source guest's, this is to avoid mismatch
3583              * of guest/host config for the msr hence avoid unexpected
3584              * misbehavior.
3585              */
3586             ret = kvm_get_one_msr(cpu, MSR_ARCH_LBR_DEPTH, &depth);
3587 
3588             if (ret == 1 && !!depth && depth == env->msr_lbr_depth) {
3589                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_CTL, env->msr_lbr_ctl);
3590                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_DEPTH, env->msr_lbr_depth);
3591 
3592                 for (i = 0; i < ARCH_LBR_NR_ENTRIES; i++) {
3593                     if (!env->lbr_records[i].from) {
3594                         continue;
3595                     }
3596                     kvm_msr_entry_add(cpu, MSR_ARCH_LBR_FROM_0 + i,
3597                                       env->lbr_records[i].from);
3598                     kvm_msr_entry_add(cpu, MSR_ARCH_LBR_TO_0 + i,
3599                                       env->lbr_records[i].to);
3600                     kvm_msr_entry_add(cpu, MSR_ARCH_LBR_INFO_0 + i,
3601                                       env->lbr_records[i].info);
3602                 }
3603             }
3604         }
3605 
3606         /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see
3607          *       kvm_put_msr_feature_control. */
3608     }
3609 
3610     if (env->mcg_cap) {
3611         kvm_msr_entry_add(cpu, MSR_MCG_STATUS, env->mcg_status);
3612         kvm_msr_entry_add(cpu, MSR_MCG_CTL, env->mcg_ctl);
3613         if (has_msr_mcg_ext_ctl) {
3614             kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, env->mcg_ext_ctl);
3615         }
3616         for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
3617             kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, env->mce_banks[i]);
3618         }
3619     }
3620 
3621     return kvm_buf_set_msrs(cpu);
3622 }
3623 
3624 
3625 static int kvm_get_xsave(X86CPU *cpu)
3626 {
3627     CPUX86State *env = &cpu->env;
3628     void *xsave = env->xsave_buf;
3629     int type, ret;
3630 
3631     type = has_xsave2 ? KVM_GET_XSAVE2 : KVM_GET_XSAVE;
3632     ret = kvm_vcpu_ioctl(CPU(cpu), type, xsave);
3633     if (ret < 0) {
3634         return ret;
3635     }
3636     x86_cpu_xrstor_all_areas(cpu, xsave, env->xsave_buf_len);
3637 
3638     return 0;
3639 }
3640 
3641 static int kvm_get_xcrs(X86CPU *cpu)
3642 {
3643     CPUX86State *env = &cpu->env;
3644     int i, ret;
3645     struct kvm_xcrs xcrs;
3646 
3647     if (!has_xcrs) {
3648         return 0;
3649     }
3650 
3651     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XCRS, &xcrs);
3652     if (ret < 0) {
3653         return ret;
3654     }
3655 
3656     for (i = 0; i < xcrs.nr_xcrs; i++) {
3657         /* Only support xcr0 now */
3658         if (xcrs.xcrs[i].xcr == 0) {
3659             env->xcr0 = xcrs.xcrs[i].value;
3660             break;
3661         }
3662     }
3663     return 0;
3664 }
3665 
3666 static int kvm_get_sregs(X86CPU *cpu)
3667 {
3668     CPUX86State *env = &cpu->env;
3669     struct kvm_sregs sregs;
3670     int ret;
3671 
3672     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
3673     if (ret < 0) {
3674         return ret;
3675     }
3676 
3677     /*
3678      * The interrupt_bitmap is ignored because KVM_GET_SREGS is
3679      * always preceded by KVM_GET_VCPU_EVENTS.
3680      */
3681 
3682     get_seg(&env->segs[R_CS], &sregs.cs);
3683     get_seg(&env->segs[R_DS], &sregs.ds);
3684     get_seg(&env->segs[R_ES], &sregs.es);
3685     get_seg(&env->segs[R_FS], &sregs.fs);
3686     get_seg(&env->segs[R_GS], &sregs.gs);
3687     get_seg(&env->segs[R_SS], &sregs.ss);
3688 
3689     get_seg(&env->tr, &sregs.tr);
3690     get_seg(&env->ldt, &sregs.ldt);
3691 
3692     env->idt.limit = sregs.idt.limit;
3693     env->idt.base = sregs.idt.base;
3694     env->gdt.limit = sregs.gdt.limit;
3695     env->gdt.base = sregs.gdt.base;
3696 
3697     env->cr[0] = sregs.cr0;
3698     env->cr[2] = sregs.cr2;
3699     env->cr[3] = sregs.cr3;
3700     env->cr[4] = sregs.cr4;
3701 
3702     env->efer = sregs.efer;
3703     if (sev_es_enabled() && env->efer & MSR_EFER_LME &&
3704         env->cr[0] & CR0_PG_MASK) {
3705         env->efer |= MSR_EFER_LMA;
3706     }
3707 
3708     /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
3709     x86_update_hflags(env);
3710 
3711     return 0;
3712 }
3713 
3714 static int kvm_get_sregs2(X86CPU *cpu)
3715 {
3716     CPUX86State *env = &cpu->env;
3717     struct kvm_sregs2 sregs;
3718     int i, ret;
3719 
3720     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS2, &sregs);
3721     if (ret < 0) {
3722         return ret;
3723     }
3724 
3725     get_seg(&env->segs[R_CS], &sregs.cs);
3726     get_seg(&env->segs[R_DS], &sregs.ds);
3727     get_seg(&env->segs[R_ES], &sregs.es);
3728     get_seg(&env->segs[R_FS], &sregs.fs);
3729     get_seg(&env->segs[R_GS], &sregs.gs);
3730     get_seg(&env->segs[R_SS], &sregs.ss);
3731 
3732     get_seg(&env->tr, &sregs.tr);
3733     get_seg(&env->ldt, &sregs.ldt);
3734 
3735     env->idt.limit = sregs.idt.limit;
3736     env->idt.base = sregs.idt.base;
3737     env->gdt.limit = sregs.gdt.limit;
3738     env->gdt.base = sregs.gdt.base;
3739 
3740     env->cr[0] = sregs.cr0;
3741     env->cr[2] = sregs.cr2;
3742     env->cr[3] = sregs.cr3;
3743     env->cr[4] = sregs.cr4;
3744 
3745     env->efer = sregs.efer;
3746     if (sev_es_enabled() && env->efer & MSR_EFER_LME &&
3747         env->cr[0] & CR0_PG_MASK) {
3748         env->efer |= MSR_EFER_LMA;
3749     }
3750 
3751     env->pdptrs_valid = sregs.flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
3752 
3753     if (env->pdptrs_valid) {
3754         for (i = 0; i < 4; i++) {
3755             env->pdptrs[i] = sregs.pdptrs[i];
3756         }
3757     }
3758 
3759     /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
3760     x86_update_hflags(env);
3761 
3762     return 0;
3763 }
3764 
3765 static int kvm_get_msrs(X86CPU *cpu)
3766 {
3767     CPUX86State *env = &cpu->env;
3768     struct kvm_msr_entry *msrs = cpu->kvm_msr_buf->entries;
3769     int ret, i;
3770     uint64_t mtrr_top_bits;
3771 
3772     kvm_msr_buf_reset(cpu);
3773 
3774     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, 0);
3775     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, 0);
3776     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, 0);
3777     kvm_msr_entry_add(cpu, MSR_PAT, 0);
3778     if (has_msr_star) {
3779         kvm_msr_entry_add(cpu, MSR_STAR, 0);
3780     }
3781     if (has_msr_hsave_pa) {
3782         kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, 0);
3783     }
3784     if (has_msr_tsc_aux) {
3785         kvm_msr_entry_add(cpu, MSR_TSC_AUX, 0);
3786     }
3787     if (has_msr_tsc_adjust) {
3788         kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, 0);
3789     }
3790     if (has_msr_tsc_deadline) {
3791         kvm_msr_entry_add(cpu, MSR_IA32_TSCDEADLINE, 0);
3792     }
3793     if (has_msr_misc_enable) {
3794         kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE, 0);
3795     }
3796     if (has_msr_smbase) {
3797         kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, 0);
3798     }
3799     if (has_msr_smi_count) {
3800         kvm_msr_entry_add(cpu, MSR_SMI_COUNT, 0);
3801     }
3802     if (has_msr_feature_control) {
3803         kvm_msr_entry_add(cpu, MSR_IA32_FEATURE_CONTROL, 0);
3804     }
3805     if (has_msr_pkrs) {
3806         kvm_msr_entry_add(cpu, MSR_IA32_PKRS, 0);
3807     }
3808     if (has_msr_bndcfgs) {
3809         kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, 0);
3810     }
3811     if (has_msr_xss) {
3812         kvm_msr_entry_add(cpu, MSR_IA32_XSS, 0);
3813     }
3814     if (has_msr_umwait) {
3815         kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, 0);
3816     }
3817     if (has_msr_spec_ctrl) {
3818         kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, 0);
3819     }
3820     if (has_tsc_scale_msr) {
3821         kvm_msr_entry_add(cpu, MSR_AMD64_TSC_RATIO, 0);
3822     }
3823 
3824     if (has_msr_tsx_ctrl) {
3825         kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, 0);
3826     }
3827     if (has_msr_virt_ssbd) {
3828         kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, 0);
3829     }
3830     if (!env->tsc_valid) {
3831         kvm_msr_entry_add(cpu, MSR_IA32_TSC, 0);
3832         env->tsc_valid = !runstate_is_running();
3833     }
3834 
3835 #ifdef TARGET_X86_64
3836     if (lm_capable_kernel) {
3837         kvm_msr_entry_add(cpu, MSR_CSTAR, 0);
3838         kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, 0);
3839         kvm_msr_entry_add(cpu, MSR_FMASK, 0);
3840         kvm_msr_entry_add(cpu, MSR_LSTAR, 0);
3841     }
3842 #endif
3843     kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, 0);
3844     kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, 0);
3845     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) {
3846         kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, 0);
3847     }
3848     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
3849         kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, 0);
3850     }
3851     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
3852         kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, 0);
3853     }
3854     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
3855         kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, 0);
3856     }
3857     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) {
3858         kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, 1);
3859     }
3860     if (has_architectural_pmu_version > 0) {
3861         if (has_architectural_pmu_version > 1) {
3862             kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
3863             kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
3864             kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS, 0);
3865             kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 0);
3866         }
3867         for (i = 0; i < num_architectural_pmu_fixed_counters; i++) {
3868             kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i, 0);
3869         }
3870         for (i = 0; i < num_architectural_pmu_gp_counters; i++) {
3871             kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i, 0);
3872             kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i, 0);
3873         }
3874     }
3875 
3876     if (env->mcg_cap) {
3877         kvm_msr_entry_add(cpu, MSR_MCG_STATUS, 0);
3878         kvm_msr_entry_add(cpu, MSR_MCG_CTL, 0);
3879         if (has_msr_mcg_ext_ctl) {
3880             kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, 0);
3881         }
3882         for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
3883             kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, 0);
3884         }
3885     }
3886 
3887     if (has_msr_hv_hypercall) {
3888         kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL, 0);
3889         kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID, 0);
3890     }
3891     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) {
3892         kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE, 0);
3893     }
3894     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) {
3895         kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC, 0);
3896     }
3897     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) {
3898         kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0);
3899         kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL, 0);
3900         kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS, 0);
3901     }
3902     if (has_msr_hv_syndbg_options) {
3903         kvm_msr_entry_add(cpu, HV_X64_MSR_SYNDBG_OPTIONS, 0);
3904     }
3905     if (has_msr_hv_crash) {
3906         int j;
3907 
3908         for (j = 0; j < HV_CRASH_PARAMS; j++) {
3909             kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j, 0);
3910         }
3911     }
3912     if (has_msr_hv_runtime) {
3913         kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, 0);
3914     }
3915     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
3916         uint32_t msr;
3917 
3918         kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL, 0);
3919         kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP, 0);
3920         kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP, 0);
3921         for (msr = HV_X64_MSR_SINT0; msr <= HV_X64_MSR_SINT15; msr++) {
3922             kvm_msr_entry_add(cpu, msr, 0);
3923         }
3924     }
3925     if (has_msr_hv_stimer) {
3926         uint32_t msr;
3927 
3928         for (msr = HV_X64_MSR_STIMER0_CONFIG; msr <= HV_X64_MSR_STIMER3_COUNT;
3929              msr++) {
3930             kvm_msr_entry_add(cpu, msr, 0);
3931         }
3932     }
3933     if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
3934         kvm_msr_entry_add(cpu, MSR_MTRRdefType, 0);
3935         kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, 0);
3936         kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, 0);
3937         kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, 0);
3938         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, 0);
3939         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, 0);
3940         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, 0);
3941         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, 0);
3942         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, 0);
3943         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, 0);
3944         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, 0);
3945         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, 0);
3946         for (i = 0; i < MSR_MTRRcap_VCNT; i++) {
3947             kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i), 0);
3948             kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), 0);
3949         }
3950     }
3951 
3952     if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
3953         int addr_num =
3954             kvm_arch_get_supported_cpuid(kvm_state, 0x14, 1, R_EAX) & 0x7;
3955 
3956         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL, 0);
3957         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS, 0);
3958         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE, 0);
3959         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK, 0);
3960         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH, 0);
3961         for (i = 0; i < addr_num; i++) {
3962             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i, 0);
3963         }
3964     }
3965 
3966     if (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_SGX_LC) {
3967         kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH0, 0);
3968         kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH1, 0);
3969         kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH2, 0);
3970         kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3, 0);
3971     }
3972 
3973     if (env->features[FEAT_XSAVE] & CPUID_D_1_EAX_XFD) {
3974         kvm_msr_entry_add(cpu, MSR_IA32_XFD, 0);
3975         kvm_msr_entry_add(cpu, MSR_IA32_XFD_ERR, 0);
3976     }
3977 
3978     if (kvm_enabled() && cpu->enable_pmu &&
3979         (env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_ARCH_LBR)) {
3980         uint64_t depth;
3981 
3982         ret = kvm_get_one_msr(cpu, MSR_ARCH_LBR_DEPTH, &depth);
3983         if (ret == 1 && depth == ARCH_LBR_NR_ENTRIES) {
3984             kvm_msr_entry_add(cpu, MSR_ARCH_LBR_CTL, 0);
3985             kvm_msr_entry_add(cpu, MSR_ARCH_LBR_DEPTH, 0);
3986 
3987             for (i = 0; i < ARCH_LBR_NR_ENTRIES; i++) {
3988                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_FROM_0 + i, 0);
3989                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_TO_0 + i, 0);
3990                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_INFO_0 + i, 0);
3991             }
3992         }
3993     }
3994 
3995     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, cpu->kvm_msr_buf);
3996     if (ret < 0) {
3997         return ret;
3998     }
3999 
4000     if (ret < cpu->kvm_msr_buf->nmsrs) {
4001         struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
4002         error_report("error: failed to get MSR 0x%" PRIx32,
4003                      (uint32_t)e->index);
4004     }
4005 
4006     assert(ret == cpu->kvm_msr_buf->nmsrs);
4007     /*
4008      * MTRR masks: Each mask consists of 5 parts
4009      * a  10..0: must be zero
4010      * b  11   : valid bit
4011      * c n-1.12: actual mask bits
4012      * d  51..n: reserved must be zero
4013      * e  63.52: reserved must be zero
4014      *
4015      * 'n' is the number of physical bits supported by the CPU and is
4016      * apparently always <= 52.   We know our 'n' but don't know what
4017      * the destinations 'n' is; it might be smaller, in which case
4018      * it masks (c) on loading. It might be larger, in which case
4019      * we fill 'd' so that d..c is consistent irrespetive of the 'n'
4020      * we're migrating to.
4021      */
4022 
4023     if (cpu->fill_mtrr_mask) {
4024         QEMU_BUILD_BUG_ON(TARGET_PHYS_ADDR_SPACE_BITS > 52);
4025         assert(cpu->phys_bits <= TARGET_PHYS_ADDR_SPACE_BITS);
4026         mtrr_top_bits = MAKE_64BIT_MASK(cpu->phys_bits, 52 - cpu->phys_bits);
4027     } else {
4028         mtrr_top_bits = 0;
4029     }
4030 
4031     for (i = 0; i < ret; i++) {
4032         uint32_t index = msrs[i].index;
4033         switch (index) {
4034         case MSR_IA32_SYSENTER_CS:
4035             env->sysenter_cs = msrs[i].data;
4036             break;
4037         case MSR_IA32_SYSENTER_ESP:
4038             env->sysenter_esp = msrs[i].data;
4039             break;
4040         case MSR_IA32_SYSENTER_EIP:
4041             env->sysenter_eip = msrs[i].data;
4042             break;
4043         case MSR_PAT:
4044             env->pat = msrs[i].data;
4045             break;
4046         case MSR_STAR:
4047             env->star = msrs[i].data;
4048             break;
4049 #ifdef TARGET_X86_64
4050         case MSR_CSTAR:
4051             env->cstar = msrs[i].data;
4052             break;
4053         case MSR_KERNELGSBASE:
4054             env->kernelgsbase = msrs[i].data;
4055             break;
4056         case MSR_FMASK:
4057             env->fmask = msrs[i].data;
4058             break;
4059         case MSR_LSTAR:
4060             env->lstar = msrs[i].data;
4061             break;
4062 #endif
4063         case MSR_IA32_TSC:
4064             env->tsc = msrs[i].data;
4065             break;
4066         case MSR_TSC_AUX:
4067             env->tsc_aux = msrs[i].data;
4068             break;
4069         case MSR_TSC_ADJUST:
4070             env->tsc_adjust = msrs[i].data;
4071             break;
4072         case MSR_IA32_TSCDEADLINE:
4073             env->tsc_deadline = msrs[i].data;
4074             break;
4075         case MSR_VM_HSAVE_PA:
4076             env->vm_hsave = msrs[i].data;
4077             break;
4078         case MSR_KVM_SYSTEM_TIME:
4079             env->system_time_msr = msrs[i].data;
4080             break;
4081         case MSR_KVM_WALL_CLOCK:
4082             env->wall_clock_msr = msrs[i].data;
4083             break;
4084         case MSR_MCG_STATUS:
4085             env->mcg_status = msrs[i].data;
4086             break;
4087         case MSR_MCG_CTL:
4088             env->mcg_ctl = msrs[i].data;
4089             break;
4090         case MSR_MCG_EXT_CTL:
4091             env->mcg_ext_ctl = msrs[i].data;
4092             break;
4093         case MSR_IA32_MISC_ENABLE:
4094             env->msr_ia32_misc_enable = msrs[i].data;
4095             break;
4096         case MSR_IA32_SMBASE:
4097             env->smbase = msrs[i].data;
4098             break;
4099         case MSR_SMI_COUNT:
4100             env->msr_smi_count = msrs[i].data;
4101             break;
4102         case MSR_IA32_FEATURE_CONTROL:
4103             env->msr_ia32_feature_control = msrs[i].data;
4104             break;
4105         case MSR_IA32_BNDCFGS:
4106             env->msr_bndcfgs = msrs[i].data;
4107             break;
4108         case MSR_IA32_XSS:
4109             env->xss = msrs[i].data;
4110             break;
4111         case MSR_IA32_UMWAIT_CONTROL:
4112             env->umwait = msrs[i].data;
4113             break;
4114         case MSR_IA32_PKRS:
4115             env->pkrs = msrs[i].data;
4116             break;
4117         default:
4118             if (msrs[i].index >= MSR_MC0_CTL &&
4119                 msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
4120                 env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data;
4121             }
4122             break;
4123         case MSR_KVM_ASYNC_PF_EN:
4124             env->async_pf_en_msr = msrs[i].data;
4125             break;
4126         case MSR_KVM_ASYNC_PF_INT:
4127             env->async_pf_int_msr = msrs[i].data;
4128             break;
4129         case MSR_KVM_PV_EOI_EN:
4130             env->pv_eoi_en_msr = msrs[i].data;
4131             break;
4132         case MSR_KVM_STEAL_TIME:
4133             env->steal_time_msr = msrs[i].data;
4134             break;
4135         case MSR_KVM_POLL_CONTROL: {
4136             env->poll_control_msr = msrs[i].data;
4137             break;
4138         }
4139         case MSR_CORE_PERF_FIXED_CTR_CTRL:
4140             env->msr_fixed_ctr_ctrl = msrs[i].data;
4141             break;
4142         case MSR_CORE_PERF_GLOBAL_CTRL:
4143             env->msr_global_ctrl = msrs[i].data;
4144             break;
4145         case MSR_CORE_PERF_GLOBAL_STATUS:
4146             env->msr_global_status = msrs[i].data;
4147             break;
4148         case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
4149             env->msr_global_ovf_ctrl = msrs[i].data;
4150             break;
4151         case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR0 + MAX_FIXED_COUNTERS - 1:
4152             env->msr_fixed_counters[index - MSR_CORE_PERF_FIXED_CTR0] = msrs[i].data;
4153             break;
4154         case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR0 + MAX_GP_COUNTERS - 1:
4155             env->msr_gp_counters[index - MSR_P6_PERFCTR0] = msrs[i].data;
4156             break;
4157         case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL0 + MAX_GP_COUNTERS - 1:
4158             env->msr_gp_evtsel[index - MSR_P6_EVNTSEL0] = msrs[i].data;
4159             break;
4160         case HV_X64_MSR_HYPERCALL:
4161             env->msr_hv_hypercall = msrs[i].data;
4162             break;
4163         case HV_X64_MSR_GUEST_OS_ID:
4164             env->msr_hv_guest_os_id = msrs[i].data;
4165             break;
4166         case HV_X64_MSR_APIC_ASSIST_PAGE:
4167             env->msr_hv_vapic = msrs[i].data;
4168             break;
4169         case HV_X64_MSR_REFERENCE_TSC:
4170             env->msr_hv_tsc = msrs[i].data;
4171             break;
4172         case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
4173             env->msr_hv_crash_params[index - HV_X64_MSR_CRASH_P0] = msrs[i].data;
4174             break;
4175         case HV_X64_MSR_VP_RUNTIME:
4176             env->msr_hv_runtime = msrs[i].data;
4177             break;
4178         case HV_X64_MSR_SCONTROL:
4179             env->msr_hv_synic_control = msrs[i].data;
4180             break;
4181         case HV_X64_MSR_SIEFP:
4182             env->msr_hv_synic_evt_page = msrs[i].data;
4183             break;
4184         case HV_X64_MSR_SIMP:
4185             env->msr_hv_synic_msg_page = msrs[i].data;
4186             break;
4187         case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
4188             env->msr_hv_synic_sint[index - HV_X64_MSR_SINT0] = msrs[i].data;
4189             break;
4190         case HV_X64_MSR_STIMER0_CONFIG:
4191         case HV_X64_MSR_STIMER1_CONFIG:
4192         case HV_X64_MSR_STIMER2_CONFIG:
4193         case HV_X64_MSR_STIMER3_CONFIG:
4194             env->msr_hv_stimer_config[(index - HV_X64_MSR_STIMER0_CONFIG)/2] =
4195                                 msrs[i].data;
4196             break;
4197         case HV_X64_MSR_STIMER0_COUNT:
4198         case HV_X64_MSR_STIMER1_COUNT:
4199         case HV_X64_MSR_STIMER2_COUNT:
4200         case HV_X64_MSR_STIMER3_COUNT:
4201             env->msr_hv_stimer_count[(index - HV_X64_MSR_STIMER0_COUNT)/2] =
4202                                 msrs[i].data;
4203             break;
4204         case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
4205             env->msr_hv_reenlightenment_control = msrs[i].data;
4206             break;
4207         case HV_X64_MSR_TSC_EMULATION_CONTROL:
4208             env->msr_hv_tsc_emulation_control = msrs[i].data;
4209             break;
4210         case HV_X64_MSR_TSC_EMULATION_STATUS:
4211             env->msr_hv_tsc_emulation_status = msrs[i].data;
4212             break;
4213         case HV_X64_MSR_SYNDBG_OPTIONS:
4214             env->msr_hv_syndbg_options = msrs[i].data;
4215             break;
4216         case MSR_MTRRdefType:
4217             env->mtrr_deftype = msrs[i].data;
4218             break;
4219         case MSR_MTRRfix64K_00000:
4220             env->mtrr_fixed[0] = msrs[i].data;
4221             break;
4222         case MSR_MTRRfix16K_80000:
4223             env->mtrr_fixed[1] = msrs[i].data;
4224             break;
4225         case MSR_MTRRfix16K_A0000:
4226             env->mtrr_fixed[2] = msrs[i].data;
4227             break;
4228         case MSR_MTRRfix4K_C0000:
4229             env->mtrr_fixed[3] = msrs[i].data;
4230             break;
4231         case MSR_MTRRfix4K_C8000:
4232             env->mtrr_fixed[4] = msrs[i].data;
4233             break;
4234         case MSR_MTRRfix4K_D0000:
4235             env->mtrr_fixed[5] = msrs[i].data;
4236             break;
4237         case MSR_MTRRfix4K_D8000:
4238             env->mtrr_fixed[6] = msrs[i].data;
4239             break;
4240         case MSR_MTRRfix4K_E0000:
4241             env->mtrr_fixed[7] = msrs[i].data;
4242             break;
4243         case MSR_MTRRfix4K_E8000:
4244             env->mtrr_fixed[8] = msrs[i].data;
4245             break;
4246         case MSR_MTRRfix4K_F0000:
4247             env->mtrr_fixed[9] = msrs[i].data;
4248             break;
4249         case MSR_MTRRfix4K_F8000:
4250             env->mtrr_fixed[10] = msrs[i].data;
4251             break;
4252         case MSR_MTRRphysBase(0) ... MSR_MTRRphysMask(MSR_MTRRcap_VCNT - 1):
4253             if (index & 1) {
4254                 env->mtrr_var[MSR_MTRRphysIndex(index)].mask = msrs[i].data |
4255                                                                mtrr_top_bits;
4256             } else {
4257                 env->mtrr_var[MSR_MTRRphysIndex(index)].base = msrs[i].data;
4258             }
4259             break;
4260         case MSR_IA32_SPEC_CTRL:
4261             env->spec_ctrl = msrs[i].data;
4262             break;
4263         case MSR_AMD64_TSC_RATIO:
4264             env->amd_tsc_scale_msr = msrs[i].data;
4265             break;
4266         case MSR_IA32_TSX_CTRL:
4267             env->tsx_ctrl = msrs[i].data;
4268             break;
4269         case MSR_VIRT_SSBD:
4270             env->virt_ssbd = msrs[i].data;
4271             break;
4272         case MSR_IA32_RTIT_CTL:
4273             env->msr_rtit_ctrl = msrs[i].data;
4274             break;
4275         case MSR_IA32_RTIT_STATUS:
4276             env->msr_rtit_status = msrs[i].data;
4277             break;
4278         case MSR_IA32_RTIT_OUTPUT_BASE:
4279             env->msr_rtit_output_base = msrs[i].data;
4280             break;
4281         case MSR_IA32_RTIT_OUTPUT_MASK:
4282             env->msr_rtit_output_mask = msrs[i].data;
4283             break;
4284         case MSR_IA32_RTIT_CR3_MATCH:
4285             env->msr_rtit_cr3_match = msrs[i].data;
4286             break;
4287         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
4288             env->msr_rtit_addrs[index - MSR_IA32_RTIT_ADDR0_A] = msrs[i].data;
4289             break;
4290         case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
4291             env->msr_ia32_sgxlepubkeyhash[index - MSR_IA32_SGXLEPUBKEYHASH0] =
4292                            msrs[i].data;
4293             break;
4294         case MSR_IA32_XFD:
4295             env->msr_xfd = msrs[i].data;
4296             break;
4297         case MSR_IA32_XFD_ERR:
4298             env->msr_xfd_err = msrs[i].data;
4299             break;
4300         case MSR_ARCH_LBR_CTL:
4301             env->msr_lbr_ctl = msrs[i].data;
4302             break;
4303         case MSR_ARCH_LBR_DEPTH:
4304             env->msr_lbr_depth = msrs[i].data;
4305             break;
4306         case MSR_ARCH_LBR_FROM_0 ... MSR_ARCH_LBR_FROM_0 + 31:
4307             env->lbr_records[index - MSR_ARCH_LBR_FROM_0].from = msrs[i].data;
4308             break;
4309         case MSR_ARCH_LBR_TO_0 ... MSR_ARCH_LBR_TO_0 + 31:
4310             env->lbr_records[index - MSR_ARCH_LBR_TO_0].to = msrs[i].data;
4311             break;
4312         case MSR_ARCH_LBR_INFO_0 ... MSR_ARCH_LBR_INFO_0 + 31:
4313             env->lbr_records[index - MSR_ARCH_LBR_INFO_0].info = msrs[i].data;
4314             break;
4315         }
4316     }
4317 
4318     return 0;
4319 }
4320 
4321 static int kvm_put_mp_state(X86CPU *cpu)
4322 {
4323     struct kvm_mp_state mp_state = { .mp_state = cpu->env.mp_state };
4324 
4325     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MP_STATE, &mp_state);
4326 }
4327 
4328 static int kvm_get_mp_state(X86CPU *cpu)
4329 {
4330     CPUState *cs = CPU(cpu);
4331     CPUX86State *env = &cpu->env;
4332     struct kvm_mp_state mp_state;
4333     int ret;
4334 
4335     ret = kvm_vcpu_ioctl(cs, KVM_GET_MP_STATE, &mp_state);
4336     if (ret < 0) {
4337         return ret;
4338     }
4339     env->mp_state = mp_state.mp_state;
4340     if (kvm_irqchip_in_kernel()) {
4341         cs->halted = (mp_state.mp_state == KVM_MP_STATE_HALTED);
4342     }
4343     return 0;
4344 }
4345 
4346 static int kvm_get_apic(X86CPU *cpu)
4347 {
4348     DeviceState *apic = cpu->apic_state;
4349     struct kvm_lapic_state kapic;
4350     int ret;
4351 
4352     if (apic && kvm_irqchip_in_kernel()) {
4353         ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_LAPIC, &kapic);
4354         if (ret < 0) {
4355             return ret;
4356         }
4357 
4358         kvm_get_apic_state(apic, &kapic);
4359     }
4360     return 0;
4361 }
4362 
4363 static int kvm_put_vcpu_events(X86CPU *cpu, int level)
4364 {
4365     CPUState *cs = CPU(cpu);
4366     CPUX86State *env = &cpu->env;
4367     struct kvm_vcpu_events events = {};
4368 
4369     events.flags = 0;
4370 
4371     if (has_exception_payload) {
4372         events.flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
4373         events.exception.pending = env->exception_pending;
4374         events.exception_has_payload = env->exception_has_payload;
4375         events.exception_payload = env->exception_payload;
4376     }
4377     events.exception.nr = env->exception_nr;
4378     events.exception.injected = env->exception_injected;
4379     events.exception.has_error_code = env->has_error_code;
4380     events.exception.error_code = env->error_code;
4381 
4382     events.interrupt.injected = (env->interrupt_injected >= 0);
4383     events.interrupt.nr = env->interrupt_injected;
4384     events.interrupt.soft = env->soft_interrupt;
4385 
4386     events.nmi.injected = env->nmi_injected;
4387     events.nmi.pending = env->nmi_pending;
4388     events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK);
4389 
4390     events.sipi_vector = env->sipi_vector;
4391 
4392     if (has_msr_smbase) {
4393         events.smi.smm = !!(env->hflags & HF_SMM_MASK);
4394         events.smi.smm_inside_nmi = !!(env->hflags2 & HF2_SMM_INSIDE_NMI_MASK);
4395         if (kvm_irqchip_in_kernel()) {
4396             /* As soon as these are moved to the kernel, remove them
4397              * from cs->interrupt_request.
4398              */
4399             events.smi.pending = cs->interrupt_request & CPU_INTERRUPT_SMI;
4400             events.smi.latched_init = cs->interrupt_request & CPU_INTERRUPT_INIT;
4401             cs->interrupt_request &= ~(CPU_INTERRUPT_INIT | CPU_INTERRUPT_SMI);
4402         } else {
4403             /* Keep these in cs->interrupt_request.  */
4404             events.smi.pending = 0;
4405             events.smi.latched_init = 0;
4406         }
4407         /* Stop SMI delivery on old machine types to avoid a reboot
4408          * on an inward migration of an old VM.
4409          */
4410         if (!cpu->kvm_no_smi_migration) {
4411             events.flags |= KVM_VCPUEVENT_VALID_SMM;
4412         }
4413     }
4414 
4415     if (level >= KVM_PUT_RESET_STATE) {
4416         events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING;
4417         if (env->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
4418             events.flags |= KVM_VCPUEVENT_VALID_SIPI_VECTOR;
4419         }
4420     }
4421 
4422     if (has_triple_fault_event) {
4423         events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
4424         events.triple_fault.pending = env->triple_fault_pending;
4425     }
4426 
4427     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_VCPU_EVENTS, &events);
4428 }
4429 
4430 static int kvm_get_vcpu_events(X86CPU *cpu)
4431 {
4432     CPUX86State *env = &cpu->env;
4433     struct kvm_vcpu_events events;
4434     int ret;
4435 
4436     memset(&events, 0, sizeof(events));
4437     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_VCPU_EVENTS, &events);
4438     if (ret < 0) {
4439        return ret;
4440     }
4441 
4442     if (events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
4443         env->exception_pending = events.exception.pending;
4444         env->exception_has_payload = events.exception_has_payload;
4445         env->exception_payload = events.exception_payload;
4446     } else {
4447         env->exception_pending = 0;
4448         env->exception_has_payload = false;
4449     }
4450     env->exception_injected = events.exception.injected;
4451     env->exception_nr =
4452         (env->exception_pending || env->exception_injected) ?
4453         events.exception.nr : -1;
4454     env->has_error_code = events.exception.has_error_code;
4455     env->error_code = events.exception.error_code;
4456 
4457     env->interrupt_injected =
4458         events.interrupt.injected ? events.interrupt.nr : -1;
4459     env->soft_interrupt = events.interrupt.soft;
4460 
4461     env->nmi_injected = events.nmi.injected;
4462     env->nmi_pending = events.nmi.pending;
4463     if (events.nmi.masked) {
4464         env->hflags2 |= HF2_NMI_MASK;
4465     } else {
4466         env->hflags2 &= ~HF2_NMI_MASK;
4467     }
4468 
4469     if (events.flags & KVM_VCPUEVENT_VALID_SMM) {
4470         if (events.smi.smm) {
4471             env->hflags |= HF_SMM_MASK;
4472         } else {
4473             env->hflags &= ~HF_SMM_MASK;
4474         }
4475         if (events.smi.pending) {
4476             cpu_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
4477         } else {
4478             cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
4479         }
4480         if (events.smi.smm_inside_nmi) {
4481             env->hflags2 |= HF2_SMM_INSIDE_NMI_MASK;
4482         } else {
4483             env->hflags2 &= ~HF2_SMM_INSIDE_NMI_MASK;
4484         }
4485         if (events.smi.latched_init) {
4486             cpu_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
4487         } else {
4488             cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
4489         }
4490     }
4491 
4492     if (events.flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) {
4493         env->triple_fault_pending = events.triple_fault.pending;
4494     }
4495 
4496     env->sipi_vector = events.sipi_vector;
4497 
4498     return 0;
4499 }
4500 
4501 static int kvm_put_debugregs(X86CPU *cpu)
4502 {
4503     CPUX86State *env = &cpu->env;
4504     struct kvm_debugregs dbgregs;
4505     int i;
4506 
4507     memset(&dbgregs, 0, sizeof(dbgregs));
4508     for (i = 0; i < 4; i++) {
4509         dbgregs.db[i] = env->dr[i];
4510     }
4511     dbgregs.dr6 = env->dr[6];
4512     dbgregs.dr7 = env->dr[7];
4513     dbgregs.flags = 0;
4514 
4515     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_DEBUGREGS, &dbgregs);
4516 }
4517 
4518 static int kvm_get_debugregs(X86CPU *cpu)
4519 {
4520     CPUX86State *env = &cpu->env;
4521     struct kvm_debugregs dbgregs;
4522     int i, ret;
4523 
4524     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_DEBUGREGS, &dbgregs);
4525     if (ret < 0) {
4526         return ret;
4527     }
4528     for (i = 0; i < 4; i++) {
4529         env->dr[i] = dbgregs.db[i];
4530     }
4531     env->dr[4] = env->dr[6] = dbgregs.dr6;
4532     env->dr[5] = env->dr[7] = dbgregs.dr7;
4533 
4534     return 0;
4535 }
4536 
4537 static int kvm_put_nested_state(X86CPU *cpu)
4538 {
4539     CPUX86State *env = &cpu->env;
4540     int max_nested_state_len = kvm_max_nested_state_length();
4541 
4542     if (!env->nested_state) {
4543         return 0;
4544     }
4545 
4546     /*
4547      * Copy flags that are affected by reset from env->hflags and env->hflags2.
4548      */
4549     if (env->hflags & HF_GUEST_MASK) {
4550         env->nested_state->flags |= KVM_STATE_NESTED_GUEST_MODE;
4551     } else {
4552         env->nested_state->flags &= ~KVM_STATE_NESTED_GUEST_MODE;
4553     }
4554 
4555     /* Don't set KVM_STATE_NESTED_GIF_SET on VMX as it is illegal */
4556     if (cpu_has_svm(env) && (env->hflags2 & HF2_GIF_MASK)) {
4557         env->nested_state->flags |= KVM_STATE_NESTED_GIF_SET;
4558     } else {
4559         env->nested_state->flags &= ~KVM_STATE_NESTED_GIF_SET;
4560     }
4561 
4562     assert(env->nested_state->size <= max_nested_state_len);
4563     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state);
4564 }
4565 
4566 static int kvm_get_nested_state(X86CPU *cpu)
4567 {
4568     CPUX86State *env = &cpu->env;
4569     int max_nested_state_len = kvm_max_nested_state_length();
4570     int ret;
4571 
4572     if (!env->nested_state) {
4573         return 0;
4574     }
4575 
4576     /*
4577      * It is possible that migration restored a smaller size into
4578      * nested_state->hdr.size than what our kernel support.
4579      * We preserve migration origin nested_state->hdr.size for
4580      * call to KVM_SET_NESTED_STATE but wish that our next call
4581      * to KVM_GET_NESTED_STATE will use max size our kernel support.
4582      */
4583     env->nested_state->size = max_nested_state_len;
4584 
4585     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_NESTED_STATE, env->nested_state);
4586     if (ret < 0) {
4587         return ret;
4588     }
4589 
4590     /*
4591      * Copy flags that are affected by reset to env->hflags and env->hflags2.
4592      */
4593     if (env->nested_state->flags & KVM_STATE_NESTED_GUEST_MODE) {
4594         env->hflags |= HF_GUEST_MASK;
4595     } else {
4596         env->hflags &= ~HF_GUEST_MASK;
4597     }
4598 
4599     /* Keep HF2_GIF_MASK set on !SVM as x86_cpu_pending_interrupt() needs it */
4600     if (cpu_has_svm(env)) {
4601         if (env->nested_state->flags & KVM_STATE_NESTED_GIF_SET) {
4602             env->hflags2 |= HF2_GIF_MASK;
4603         } else {
4604             env->hflags2 &= ~HF2_GIF_MASK;
4605         }
4606     }
4607 
4608     return ret;
4609 }
4610 
4611 int kvm_arch_put_registers(CPUState *cpu, int level)
4612 {
4613     X86CPU *x86_cpu = X86_CPU(cpu);
4614     int ret;
4615 
4616     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
4617 
4618     /*
4619      * Put MSR_IA32_FEATURE_CONTROL first, this ensures the VM gets out of VMX
4620      * root operation upon vCPU reset. kvm_put_msr_feature_control() should also
4621      * precede kvm_put_nested_state() when 'real' nested state is set.
4622      */
4623     if (level >= KVM_PUT_RESET_STATE) {
4624         ret = kvm_put_msr_feature_control(x86_cpu);
4625         if (ret < 0) {
4626             return ret;
4627         }
4628     }
4629 
4630     /* must be before kvm_put_nested_state so that EFER.SVME is set */
4631     ret = has_sregs2 ? kvm_put_sregs2(x86_cpu) : kvm_put_sregs(x86_cpu);
4632     if (ret < 0) {
4633         return ret;
4634     }
4635 
4636     if (level >= KVM_PUT_RESET_STATE) {
4637         ret = kvm_put_nested_state(x86_cpu);
4638         if (ret < 0) {
4639             return ret;
4640         }
4641     }
4642 
4643     if (level == KVM_PUT_FULL_STATE) {
4644         /* We don't check for kvm_arch_set_tsc_khz() errors here,
4645          * because TSC frequency mismatch shouldn't abort migration,
4646          * unless the user explicitly asked for a more strict TSC
4647          * setting (e.g. using an explicit "tsc-freq" option).
4648          */
4649         kvm_arch_set_tsc_khz(cpu);
4650     }
4651 
4652 #ifdef CONFIG_XEN_EMU
4653     if (xen_mode == XEN_EMULATE && level == KVM_PUT_FULL_STATE) {
4654         ret = kvm_put_xen_state(cpu);
4655         if (ret < 0) {
4656             return ret;
4657         }
4658     }
4659 #endif
4660 
4661     ret = kvm_getput_regs(x86_cpu, 1);
4662     if (ret < 0) {
4663         return ret;
4664     }
4665     ret = kvm_put_xsave(x86_cpu);
4666     if (ret < 0) {
4667         return ret;
4668     }
4669     ret = kvm_put_xcrs(x86_cpu);
4670     if (ret < 0) {
4671         return ret;
4672     }
4673     ret = kvm_put_msrs(x86_cpu, level);
4674     if (ret < 0) {
4675         return ret;
4676     }
4677     ret = kvm_put_vcpu_events(x86_cpu, level);
4678     if (ret < 0) {
4679         return ret;
4680     }
4681     if (level >= KVM_PUT_RESET_STATE) {
4682         ret = kvm_put_mp_state(x86_cpu);
4683         if (ret < 0) {
4684             return ret;
4685         }
4686     }
4687 
4688     ret = kvm_put_tscdeadline_msr(x86_cpu);
4689     if (ret < 0) {
4690         return ret;
4691     }
4692     ret = kvm_put_debugregs(x86_cpu);
4693     if (ret < 0) {
4694         return ret;
4695     }
4696     return 0;
4697 }
4698 
4699 int kvm_arch_get_registers(CPUState *cs)
4700 {
4701     X86CPU *cpu = X86_CPU(cs);
4702     int ret;
4703 
4704     assert(cpu_is_stopped(cs) || qemu_cpu_is_self(cs));
4705 
4706     ret = kvm_get_vcpu_events(cpu);
4707     if (ret < 0) {
4708         goto out;
4709     }
4710     /*
4711      * KVM_GET_MPSTATE can modify CS and RIP, call it before
4712      * KVM_GET_REGS and KVM_GET_SREGS.
4713      */
4714     ret = kvm_get_mp_state(cpu);
4715     if (ret < 0) {
4716         goto out;
4717     }
4718     ret = kvm_getput_regs(cpu, 0);
4719     if (ret < 0) {
4720         goto out;
4721     }
4722     ret = kvm_get_xsave(cpu);
4723     if (ret < 0) {
4724         goto out;
4725     }
4726     ret = kvm_get_xcrs(cpu);
4727     if (ret < 0) {
4728         goto out;
4729     }
4730     ret = has_sregs2 ? kvm_get_sregs2(cpu) : kvm_get_sregs(cpu);
4731     if (ret < 0) {
4732         goto out;
4733     }
4734     ret = kvm_get_msrs(cpu);
4735     if (ret < 0) {
4736         goto out;
4737     }
4738     ret = kvm_get_apic(cpu);
4739     if (ret < 0) {
4740         goto out;
4741     }
4742     ret = kvm_get_debugregs(cpu);
4743     if (ret < 0) {
4744         goto out;
4745     }
4746     ret = kvm_get_nested_state(cpu);
4747     if (ret < 0) {
4748         goto out;
4749     }
4750 #ifdef CONFIG_XEN_EMU
4751     if (xen_mode == XEN_EMULATE) {
4752         ret = kvm_get_xen_state(cs);
4753         if (ret < 0) {
4754             goto out;
4755         }
4756     }
4757 #endif
4758     ret = 0;
4759  out:
4760     cpu_sync_bndcs_hflags(&cpu->env);
4761     return ret;
4762 }
4763 
4764 void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run)
4765 {
4766     X86CPU *x86_cpu = X86_CPU(cpu);
4767     CPUX86State *env = &x86_cpu->env;
4768     int ret;
4769 
4770     /* Inject NMI */
4771     if (cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
4772         if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
4773             bql_lock();
4774             cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
4775             bql_unlock();
4776             DPRINTF("injected NMI\n");
4777             ret = kvm_vcpu_ioctl(cpu, KVM_NMI);
4778             if (ret < 0) {
4779                 fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n",
4780                         strerror(-ret));
4781             }
4782         }
4783         if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
4784             bql_lock();
4785             cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
4786             bql_unlock();
4787             DPRINTF("injected SMI\n");
4788             ret = kvm_vcpu_ioctl(cpu, KVM_SMI);
4789             if (ret < 0) {
4790                 fprintf(stderr, "KVM: injection failed, SMI lost (%s)\n",
4791                         strerror(-ret));
4792             }
4793         }
4794     }
4795 
4796     if (!kvm_pic_in_kernel()) {
4797         bql_lock();
4798     }
4799 
4800     /* Force the VCPU out of its inner loop to process any INIT requests
4801      * or (for userspace APIC, but it is cheap to combine the checks here)
4802      * pending TPR access reports.
4803      */
4804     if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
4805         if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
4806             !(env->hflags & HF_SMM_MASK)) {
4807             cpu->exit_request = 1;
4808         }
4809         if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
4810             cpu->exit_request = 1;
4811         }
4812     }
4813 
4814     if (!kvm_pic_in_kernel()) {
4815         /* Try to inject an interrupt if the guest can accept it */
4816         if (run->ready_for_interrupt_injection &&
4817             (cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
4818             (env->eflags & IF_MASK)) {
4819             int irq;
4820 
4821             cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
4822             irq = cpu_get_pic_interrupt(env);
4823             if (irq >= 0) {
4824                 struct kvm_interrupt intr;
4825 
4826                 intr.irq = irq;
4827                 DPRINTF("injected interrupt %d\n", irq);
4828                 ret = kvm_vcpu_ioctl(cpu, KVM_INTERRUPT, &intr);
4829                 if (ret < 0) {
4830                     fprintf(stderr,
4831                             "KVM: injection failed, interrupt lost (%s)\n",
4832                             strerror(-ret));
4833                 }
4834             }
4835         }
4836 
4837         /* If we have an interrupt but the guest is not ready to receive an
4838          * interrupt, request an interrupt window exit.  This will
4839          * cause a return to userspace as soon as the guest is ready to
4840          * receive interrupts. */
4841         if ((cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
4842             run->request_interrupt_window = 1;
4843         } else {
4844             run->request_interrupt_window = 0;
4845         }
4846 
4847         DPRINTF("setting tpr\n");
4848         run->cr8 = cpu_get_apic_tpr(x86_cpu->apic_state);
4849 
4850         bql_unlock();
4851     }
4852 }
4853 
4854 static void kvm_rate_limit_on_bus_lock(void)
4855 {
4856     uint64_t delay_ns = ratelimit_calculate_delay(&bus_lock_ratelimit_ctrl, 1);
4857 
4858     if (delay_ns) {
4859         g_usleep(delay_ns / SCALE_US);
4860     }
4861 }
4862 
4863 MemTxAttrs kvm_arch_post_run(CPUState *cpu, struct kvm_run *run)
4864 {
4865     X86CPU *x86_cpu = X86_CPU(cpu);
4866     CPUX86State *env = &x86_cpu->env;
4867 
4868     if (run->flags & KVM_RUN_X86_SMM) {
4869         env->hflags |= HF_SMM_MASK;
4870     } else {
4871         env->hflags &= ~HF_SMM_MASK;
4872     }
4873     if (run->if_flag) {
4874         env->eflags |= IF_MASK;
4875     } else {
4876         env->eflags &= ~IF_MASK;
4877     }
4878     if (run->flags & KVM_RUN_X86_BUS_LOCK) {
4879         kvm_rate_limit_on_bus_lock();
4880     }
4881 
4882 #ifdef CONFIG_XEN_EMU
4883     /*
4884      * If the callback is asserted as a GSI (or PCI INTx) then check if
4885      * vcpu_info->evtchn_upcall_pending has been cleared, and deassert
4886      * the callback IRQ if so. Ideally we could hook into the PIC/IOAPIC
4887      * EOI and only resample then, exactly how the VFIO eventfd pairs
4888      * are designed to work for level triggered interrupts.
4889      */
4890     if (x86_cpu->env.xen_callback_asserted) {
4891         kvm_xen_maybe_deassert_callback(cpu);
4892     }
4893 #endif
4894 
4895     /* We need to protect the apic state against concurrent accesses from
4896      * different threads in case the userspace irqchip is used. */
4897     if (!kvm_irqchip_in_kernel()) {
4898         bql_lock();
4899     }
4900     cpu_set_apic_tpr(x86_cpu->apic_state, run->cr8);
4901     cpu_set_apic_base(x86_cpu->apic_state, run->apic_base);
4902     if (!kvm_irqchip_in_kernel()) {
4903         bql_unlock();
4904     }
4905     return cpu_get_mem_attrs(env);
4906 }
4907 
4908 int kvm_arch_process_async_events(CPUState *cs)
4909 {
4910     X86CPU *cpu = X86_CPU(cs);
4911     CPUX86State *env = &cpu->env;
4912 
4913     if (cs->interrupt_request & CPU_INTERRUPT_MCE) {
4914         /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
4915         assert(env->mcg_cap);
4916 
4917         cs->interrupt_request &= ~CPU_INTERRUPT_MCE;
4918 
4919         kvm_cpu_synchronize_state(cs);
4920 
4921         if (env->exception_nr == EXCP08_DBLE) {
4922             /* this means triple fault */
4923             qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
4924             cs->exit_request = 1;
4925             return 0;
4926         }
4927         kvm_queue_exception(env, EXCP12_MCHK, 0, 0);
4928         env->has_error_code = 0;
4929 
4930         cs->halted = 0;
4931         if (kvm_irqchip_in_kernel() && env->mp_state == KVM_MP_STATE_HALTED) {
4932             env->mp_state = KVM_MP_STATE_RUNNABLE;
4933         }
4934     }
4935 
4936     if ((cs->interrupt_request & CPU_INTERRUPT_INIT) &&
4937         !(env->hflags & HF_SMM_MASK)) {
4938         kvm_cpu_synchronize_state(cs);
4939         do_cpu_init(cpu);
4940     }
4941 
4942     if (kvm_irqchip_in_kernel()) {
4943         return 0;
4944     }
4945 
4946     if (cs->interrupt_request & CPU_INTERRUPT_POLL) {
4947         cs->interrupt_request &= ~CPU_INTERRUPT_POLL;
4948         apic_poll_irq(cpu->apic_state);
4949     }
4950     if (((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
4951          (env->eflags & IF_MASK)) ||
4952         (cs->interrupt_request & CPU_INTERRUPT_NMI)) {
4953         cs->halted = 0;
4954     }
4955     if (cs->interrupt_request & CPU_INTERRUPT_SIPI) {
4956         kvm_cpu_synchronize_state(cs);
4957         do_cpu_sipi(cpu);
4958     }
4959     if (cs->interrupt_request & CPU_INTERRUPT_TPR) {
4960         cs->interrupt_request &= ~CPU_INTERRUPT_TPR;
4961         kvm_cpu_synchronize_state(cs);
4962         apic_handle_tpr_access_report(cpu->apic_state, env->eip,
4963                                       env->tpr_access_type);
4964     }
4965 
4966     return cs->halted;
4967 }
4968 
4969 static int kvm_handle_halt(X86CPU *cpu)
4970 {
4971     CPUState *cs = CPU(cpu);
4972     CPUX86State *env = &cpu->env;
4973 
4974     if (!((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
4975           (env->eflags & IF_MASK)) &&
4976         !(cs->interrupt_request & CPU_INTERRUPT_NMI)) {
4977         cs->halted = 1;
4978         return EXCP_HLT;
4979     }
4980 
4981     return 0;
4982 }
4983 
4984 static int kvm_handle_tpr_access(X86CPU *cpu)
4985 {
4986     CPUState *cs = CPU(cpu);
4987     struct kvm_run *run = cs->kvm_run;
4988 
4989     apic_handle_tpr_access_report(cpu->apic_state, run->tpr_access.rip,
4990                                   run->tpr_access.is_write ? TPR_ACCESS_WRITE
4991                                                            : TPR_ACCESS_READ);
4992     return 1;
4993 }
4994 
4995 int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
4996 {
4997     static const uint8_t int3 = 0xcc;
4998 
4999     if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 0) ||
5000         cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&int3, 1, 1)) {
5001         return -EINVAL;
5002     }
5003     return 0;
5004 }
5005 
5006 int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
5007 {
5008     uint8_t int3;
5009 
5010     if (cpu_memory_rw_debug(cs, bp->pc, &int3, 1, 0)) {
5011         return -EINVAL;
5012     }
5013     if (int3 != 0xcc) {
5014         return 0;
5015     }
5016     if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 1)) {
5017         return -EINVAL;
5018     }
5019     return 0;
5020 }
5021 
5022 static struct {
5023     target_ulong addr;
5024     int len;
5025     int type;
5026 } hw_breakpoint[4];
5027 
5028 static int nb_hw_breakpoint;
5029 
5030 static int find_hw_breakpoint(target_ulong addr, int len, int type)
5031 {
5032     int n;
5033 
5034     for (n = 0; n < nb_hw_breakpoint; n++) {
5035         if (hw_breakpoint[n].addr == addr && hw_breakpoint[n].type == type &&
5036             (hw_breakpoint[n].len == len || len == -1)) {
5037             return n;
5038         }
5039     }
5040     return -1;
5041 }
5042 
5043 int kvm_arch_insert_hw_breakpoint(vaddr addr, vaddr len, int type)
5044 {
5045     switch (type) {
5046     case GDB_BREAKPOINT_HW:
5047         len = 1;
5048         break;
5049     case GDB_WATCHPOINT_WRITE:
5050     case GDB_WATCHPOINT_ACCESS:
5051         switch (len) {
5052         case 1:
5053             break;
5054         case 2:
5055         case 4:
5056         case 8:
5057             if (addr & (len - 1)) {
5058                 return -EINVAL;
5059             }
5060             break;
5061         default:
5062             return -EINVAL;
5063         }
5064         break;
5065     default:
5066         return -ENOSYS;
5067     }
5068 
5069     if (nb_hw_breakpoint == 4) {
5070         return -ENOBUFS;
5071     }
5072     if (find_hw_breakpoint(addr, len, type) >= 0) {
5073         return -EEXIST;
5074     }
5075     hw_breakpoint[nb_hw_breakpoint].addr = addr;
5076     hw_breakpoint[nb_hw_breakpoint].len = len;
5077     hw_breakpoint[nb_hw_breakpoint].type = type;
5078     nb_hw_breakpoint++;
5079 
5080     return 0;
5081 }
5082 
5083 int kvm_arch_remove_hw_breakpoint(vaddr addr, vaddr len, int type)
5084 {
5085     int n;
5086 
5087     n = find_hw_breakpoint(addr, (type == GDB_BREAKPOINT_HW) ? 1 : len, type);
5088     if (n < 0) {
5089         return -ENOENT;
5090     }
5091     nb_hw_breakpoint--;
5092     hw_breakpoint[n] = hw_breakpoint[nb_hw_breakpoint];
5093 
5094     return 0;
5095 }
5096 
5097 void kvm_arch_remove_all_hw_breakpoints(void)
5098 {
5099     nb_hw_breakpoint = 0;
5100 }
5101 
5102 static CPUWatchpoint hw_watchpoint;
5103 
5104 static int kvm_handle_debug(X86CPU *cpu,
5105                             struct kvm_debug_exit_arch *arch_info)
5106 {
5107     CPUState *cs = CPU(cpu);
5108     CPUX86State *env = &cpu->env;
5109     int ret = 0;
5110     int n;
5111 
5112     if (arch_info->exception == EXCP01_DB) {
5113         if (arch_info->dr6 & DR6_BS) {
5114             if (cs->singlestep_enabled) {
5115                 ret = EXCP_DEBUG;
5116             }
5117         } else {
5118             for (n = 0; n < 4; n++) {
5119                 if (arch_info->dr6 & (1 << n)) {
5120                     switch ((arch_info->dr7 >> (16 + n*4)) & 0x3) {
5121                     case 0x0:
5122                         ret = EXCP_DEBUG;
5123                         break;
5124                     case 0x1:
5125                         ret = EXCP_DEBUG;
5126                         cs->watchpoint_hit = &hw_watchpoint;
5127                         hw_watchpoint.vaddr = hw_breakpoint[n].addr;
5128                         hw_watchpoint.flags = BP_MEM_WRITE;
5129                         break;
5130                     case 0x3:
5131                         ret = EXCP_DEBUG;
5132                         cs->watchpoint_hit = &hw_watchpoint;
5133                         hw_watchpoint.vaddr = hw_breakpoint[n].addr;
5134                         hw_watchpoint.flags = BP_MEM_ACCESS;
5135                         break;
5136                     }
5137                 }
5138             }
5139         }
5140     } else if (kvm_find_sw_breakpoint(cs, arch_info->pc)) {
5141         ret = EXCP_DEBUG;
5142     }
5143     if (ret == 0) {
5144         cpu_synchronize_state(cs);
5145         assert(env->exception_nr == -1);
5146 
5147         /* pass to guest */
5148         kvm_queue_exception(env, arch_info->exception,
5149                             arch_info->exception == EXCP01_DB,
5150                             arch_info->dr6);
5151         env->has_error_code = 0;
5152     }
5153 
5154     return ret;
5155 }
5156 
5157 void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg)
5158 {
5159     const uint8_t type_code[] = {
5160         [GDB_BREAKPOINT_HW] = 0x0,
5161         [GDB_WATCHPOINT_WRITE] = 0x1,
5162         [GDB_WATCHPOINT_ACCESS] = 0x3
5163     };
5164     const uint8_t len_code[] = {
5165         [1] = 0x0, [2] = 0x1, [4] = 0x3, [8] = 0x2
5166     };
5167     int n;
5168 
5169     if (kvm_sw_breakpoints_active(cpu)) {
5170         dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
5171     }
5172     if (nb_hw_breakpoint > 0) {
5173         dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
5174         dbg->arch.debugreg[7] = 0x0600;
5175         for (n = 0; n < nb_hw_breakpoint; n++) {
5176             dbg->arch.debugreg[n] = hw_breakpoint[n].addr;
5177             dbg->arch.debugreg[7] |= (2 << (n * 2)) |
5178                 (type_code[hw_breakpoint[n].type] << (16 + n*4)) |
5179                 ((uint32_t)len_code[hw_breakpoint[n].len] << (18 + n*4));
5180         }
5181     }
5182 }
5183 
5184 static bool kvm_install_msr_filters(KVMState *s)
5185 {
5186     uint64_t zero = 0;
5187     struct kvm_msr_filter filter = {
5188         .flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
5189     };
5190     int r, i, j = 0;
5191 
5192     for (i = 0; i < KVM_MSR_FILTER_MAX_RANGES; i++) {
5193         KVMMSRHandlers *handler = &msr_handlers[i];
5194         if (handler->msr) {
5195             struct kvm_msr_filter_range *range = &filter.ranges[j++];
5196 
5197             *range = (struct kvm_msr_filter_range) {
5198                 .flags = 0,
5199                 .nmsrs = 1,
5200                 .base = handler->msr,
5201                 .bitmap = (__u8 *)&zero,
5202             };
5203 
5204             if (handler->rdmsr) {
5205                 range->flags |= KVM_MSR_FILTER_READ;
5206             }
5207 
5208             if (handler->wrmsr) {
5209                 range->flags |= KVM_MSR_FILTER_WRITE;
5210             }
5211         }
5212     }
5213 
5214     r = kvm_vm_ioctl(s, KVM_X86_SET_MSR_FILTER, &filter);
5215     if (r) {
5216         return false;
5217     }
5218 
5219     return true;
5220 }
5221 
5222 bool kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr,
5223                     QEMUWRMSRHandler *wrmsr)
5224 {
5225     int i;
5226 
5227     for (i = 0; i < ARRAY_SIZE(msr_handlers); i++) {
5228         if (!msr_handlers[i].msr) {
5229             msr_handlers[i] = (KVMMSRHandlers) {
5230                 .msr = msr,
5231                 .rdmsr = rdmsr,
5232                 .wrmsr = wrmsr,
5233             };
5234 
5235             if (!kvm_install_msr_filters(s)) {
5236                 msr_handlers[i] = (KVMMSRHandlers) { };
5237                 return false;
5238             }
5239 
5240             return true;
5241         }
5242     }
5243 
5244     return false;
5245 }
5246 
5247 static int kvm_handle_rdmsr(X86CPU *cpu, struct kvm_run *run)
5248 {
5249     int i;
5250     bool r;
5251 
5252     for (i = 0; i < ARRAY_SIZE(msr_handlers); i++) {
5253         KVMMSRHandlers *handler = &msr_handlers[i];
5254         if (run->msr.index == handler->msr) {
5255             if (handler->rdmsr) {
5256                 r = handler->rdmsr(cpu, handler->msr,
5257                                    (uint64_t *)&run->msr.data);
5258                 run->msr.error = r ? 0 : 1;
5259                 return 0;
5260             }
5261         }
5262     }
5263 
5264     assert(false);
5265 }
5266 
5267 static int kvm_handle_wrmsr(X86CPU *cpu, struct kvm_run *run)
5268 {
5269     int i;
5270     bool r;
5271 
5272     for (i = 0; i < ARRAY_SIZE(msr_handlers); i++) {
5273         KVMMSRHandlers *handler = &msr_handlers[i];
5274         if (run->msr.index == handler->msr) {
5275             if (handler->wrmsr) {
5276                 r = handler->wrmsr(cpu, handler->msr, run->msr.data);
5277                 run->msr.error = r ? 0 : 1;
5278                 return 0;
5279             }
5280         }
5281     }
5282 
5283     assert(false);
5284 }
5285 
5286 static bool has_sgx_provisioning;
5287 
5288 static bool __kvm_enable_sgx_provisioning(KVMState *s)
5289 {
5290     int fd, ret;
5291 
5292     if (!kvm_vm_check_extension(s, KVM_CAP_SGX_ATTRIBUTE)) {
5293         return false;
5294     }
5295 
5296     fd = qemu_open_old("/dev/sgx_provision", O_RDONLY);
5297     if (fd < 0) {
5298         return false;
5299     }
5300 
5301     ret = kvm_vm_enable_cap(s, KVM_CAP_SGX_ATTRIBUTE, 0, fd);
5302     if (ret) {
5303         error_report("Could not enable SGX PROVISIONKEY: %s", strerror(-ret));
5304         exit(1);
5305     }
5306     close(fd);
5307     return true;
5308 }
5309 
5310 bool kvm_enable_sgx_provisioning(KVMState *s)
5311 {
5312     return MEMORIZE(__kvm_enable_sgx_provisioning(s), has_sgx_provisioning);
5313 }
5314 
5315 static bool host_supports_vmx(void)
5316 {
5317     uint32_t ecx, unused;
5318 
5319     host_cpuid(1, 0, &unused, &unused, &ecx, &unused);
5320     return ecx & CPUID_EXT_VMX;
5321 }
5322 
5323 #define VMX_INVALID_GUEST_STATE 0x80000021
5324 
5325 int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
5326 {
5327     X86CPU *cpu = X86_CPU(cs);
5328     uint64_t code;
5329     int ret;
5330     bool ctx_invalid;
5331     char str[256];
5332     KVMState *state;
5333 
5334     switch (run->exit_reason) {
5335     case KVM_EXIT_HLT:
5336         DPRINTF("handle_hlt\n");
5337         bql_lock();
5338         ret = kvm_handle_halt(cpu);
5339         bql_unlock();
5340         break;
5341     case KVM_EXIT_SET_TPR:
5342         ret = 0;
5343         break;
5344     case KVM_EXIT_TPR_ACCESS:
5345         bql_lock();
5346         ret = kvm_handle_tpr_access(cpu);
5347         bql_unlock();
5348         break;
5349     case KVM_EXIT_FAIL_ENTRY:
5350         code = run->fail_entry.hardware_entry_failure_reason;
5351         fprintf(stderr, "KVM: entry failed, hardware error 0x%" PRIx64 "\n",
5352                 code);
5353         if (host_supports_vmx() && code == VMX_INVALID_GUEST_STATE) {
5354             fprintf(stderr,
5355                     "\nIf you're running a guest on an Intel machine without "
5356                         "unrestricted mode\n"
5357                     "support, the failure can be most likely due to the guest "
5358                         "entering an invalid\n"
5359                     "state for Intel VT. For example, the guest maybe running "
5360                         "in big real mode\n"
5361                     "which is not supported on less recent Intel processors."
5362                         "\n\n");
5363         }
5364         ret = -1;
5365         break;
5366     case KVM_EXIT_EXCEPTION:
5367         fprintf(stderr, "KVM: exception %d exit (error code 0x%x)\n",
5368                 run->ex.exception, run->ex.error_code);
5369         ret = -1;
5370         break;
5371     case KVM_EXIT_DEBUG:
5372         DPRINTF("kvm_exit_debug\n");
5373         bql_lock();
5374         ret = kvm_handle_debug(cpu, &run->debug.arch);
5375         bql_unlock();
5376         break;
5377     case KVM_EXIT_HYPERV:
5378         ret = kvm_hv_handle_exit(cpu, &run->hyperv);
5379         break;
5380     case KVM_EXIT_IOAPIC_EOI:
5381         ioapic_eoi_broadcast(run->eoi.vector);
5382         ret = 0;
5383         break;
5384     case KVM_EXIT_X86_BUS_LOCK:
5385         /* already handled in kvm_arch_post_run */
5386         ret = 0;
5387         break;
5388     case KVM_EXIT_NOTIFY:
5389         ctx_invalid = !!(run->notify.flags & KVM_NOTIFY_CONTEXT_INVALID);
5390         state = KVM_STATE(current_accel());
5391         sprintf(str, "Encounter a notify exit with %svalid context in"
5392                      " guest. There can be possible misbehaves in guest."
5393                      " Please have a look.", ctx_invalid ? "in" : "");
5394         if (ctx_invalid ||
5395             state->notify_vmexit == NOTIFY_VMEXIT_OPTION_INTERNAL_ERROR) {
5396             warn_report("KVM internal error: %s", str);
5397             ret = -1;
5398         } else {
5399             warn_report_once("KVM: %s", str);
5400             ret = 0;
5401         }
5402         break;
5403     case KVM_EXIT_X86_RDMSR:
5404         /* We only enable MSR filtering, any other exit is bogus */
5405         assert(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER);
5406         ret = kvm_handle_rdmsr(cpu, run);
5407         break;
5408     case KVM_EXIT_X86_WRMSR:
5409         /* We only enable MSR filtering, any other exit is bogus */
5410         assert(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER);
5411         ret = kvm_handle_wrmsr(cpu, run);
5412         break;
5413 #ifdef CONFIG_XEN_EMU
5414     case KVM_EXIT_XEN:
5415         ret = kvm_xen_handle_exit(cpu, &run->xen);
5416         break;
5417 #endif
5418     default:
5419         fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
5420         ret = -1;
5421         break;
5422     }
5423 
5424     return ret;
5425 }
5426 
5427 bool kvm_arch_stop_on_emulation_error(CPUState *cs)
5428 {
5429     X86CPU *cpu = X86_CPU(cs);
5430     CPUX86State *env = &cpu->env;
5431 
5432     kvm_cpu_synchronize_state(cs);
5433     return !(env->cr[0] & CR0_PE_MASK) ||
5434            ((env->segs[R_CS].selector  & 3) != 3);
5435 }
5436 
5437 void kvm_arch_init_irq_routing(KVMState *s)
5438 {
5439     /* We know at this point that we're using the in-kernel
5440      * irqchip, so we can use irqfds, and on x86 we know
5441      * we can use msi via irqfd and GSI routing.
5442      */
5443     kvm_msi_via_irqfd_allowed = true;
5444     kvm_gsi_routing_allowed = true;
5445 
5446     if (kvm_irqchip_is_split()) {
5447         KVMRouteChange c = kvm_irqchip_begin_route_changes(s);
5448         int i;
5449 
5450         /* If the ioapic is in QEMU and the lapics are in KVM, reserve
5451            MSI routes for signaling interrupts to the local apics. */
5452         for (i = 0; i < IOAPIC_NUM_PINS; i++) {
5453             if (kvm_irqchip_add_msi_route(&c, 0, NULL) < 0) {
5454                 error_report("Could not enable split IRQ mode.");
5455                 exit(1);
5456             }
5457         }
5458         kvm_irqchip_commit_route_changes(&c);
5459     }
5460 }
5461 
5462 int kvm_arch_irqchip_create(KVMState *s)
5463 {
5464     int ret;
5465     if (kvm_kernel_irqchip_split()) {
5466         ret = kvm_vm_enable_cap(s, KVM_CAP_SPLIT_IRQCHIP, 0, 24);
5467         if (ret) {
5468             error_report("Could not enable split irqchip mode: %s",
5469                          strerror(-ret));
5470             exit(1);
5471         } else {
5472             DPRINTF("Enabled KVM_CAP_SPLIT_IRQCHIP\n");
5473             kvm_split_irqchip = true;
5474             return 1;
5475         }
5476     } else {
5477         return 0;
5478     }
5479 }
5480 
5481 uint64_t kvm_swizzle_msi_ext_dest_id(uint64_t address)
5482 {
5483     CPUX86State *env;
5484     uint64_t ext_id;
5485 
5486     if (!first_cpu) {
5487         return address;
5488     }
5489     env = &X86_CPU(first_cpu)->env;
5490     if (!(env->features[FEAT_KVM] & (1 << KVM_FEATURE_MSI_EXT_DEST_ID))) {
5491         return address;
5492     }
5493 
5494     /*
5495      * If the remappable format bit is set, or the upper bits are
5496      * already set in address_hi, or the low extended bits aren't
5497      * there anyway, do nothing.
5498      */
5499     ext_id = address & (0xff << MSI_ADDR_DEST_IDX_SHIFT);
5500     if (!ext_id || (ext_id & (1 << MSI_ADDR_DEST_IDX_SHIFT)) || (address >> 32)) {
5501         return address;
5502     }
5503 
5504     address &= ~ext_id;
5505     address |= ext_id << 35;
5506     return address;
5507 }
5508 
5509 int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
5510                              uint64_t address, uint32_t data, PCIDevice *dev)
5511 {
5512     X86IOMMUState *iommu = x86_iommu_get_default();
5513 
5514     if (iommu) {
5515         X86IOMMUClass *class = X86_IOMMU_DEVICE_GET_CLASS(iommu);
5516 
5517         if (class->int_remap) {
5518             int ret;
5519             MSIMessage src, dst;
5520 
5521             src.address = route->u.msi.address_hi;
5522             src.address <<= VTD_MSI_ADDR_HI_SHIFT;
5523             src.address |= route->u.msi.address_lo;
5524             src.data = route->u.msi.data;
5525 
5526             ret = class->int_remap(iommu, &src, &dst, dev ?     \
5527                                    pci_requester_id(dev) :      \
5528                                    X86_IOMMU_SID_INVALID);
5529             if (ret) {
5530                 trace_kvm_x86_fixup_msi_error(route->gsi);
5531                 return 1;
5532             }
5533 
5534             /*
5535              * Handled untranslated compatibility format interrupt with
5536              * extended destination ID in the low bits 11-5. */
5537             dst.address = kvm_swizzle_msi_ext_dest_id(dst.address);
5538 
5539             route->u.msi.address_hi = dst.address >> VTD_MSI_ADDR_HI_SHIFT;
5540             route->u.msi.address_lo = dst.address & VTD_MSI_ADDR_LO_MASK;
5541             route->u.msi.data = dst.data;
5542             return 0;
5543         }
5544     }
5545 
5546 #ifdef CONFIG_XEN_EMU
5547     if (xen_mode == XEN_EMULATE) {
5548         int handled = xen_evtchn_translate_pirq_msi(route, address, data);
5549 
5550         /*
5551          * If it was a PIRQ and successfully routed (handled == 0) or it was
5552          * an error (handled < 0), return. If it wasn't a PIRQ, keep going.
5553          */
5554         if (handled <= 0) {
5555             return handled;
5556         }
5557     }
5558 #endif
5559 
5560     address = kvm_swizzle_msi_ext_dest_id(address);
5561     route->u.msi.address_hi = address >> VTD_MSI_ADDR_HI_SHIFT;
5562     route->u.msi.address_lo = address & VTD_MSI_ADDR_LO_MASK;
5563     return 0;
5564 }
5565 
5566 typedef struct MSIRouteEntry MSIRouteEntry;
5567 
5568 struct MSIRouteEntry {
5569     PCIDevice *dev;             /* Device pointer */
5570     int vector;                 /* MSI/MSIX vector index */
5571     int virq;                   /* Virtual IRQ index */
5572     QLIST_ENTRY(MSIRouteEntry) list;
5573 };
5574 
5575 /* List of used GSI routes */
5576 static QLIST_HEAD(, MSIRouteEntry) msi_route_list = \
5577     QLIST_HEAD_INITIALIZER(msi_route_list);
5578 
5579 void kvm_update_msi_routes_all(void *private, bool global,
5580                                uint32_t index, uint32_t mask)
5581 {
5582     int cnt = 0, vector;
5583     MSIRouteEntry *entry;
5584     MSIMessage msg;
5585     PCIDevice *dev;
5586 
5587     /* TODO: explicit route update */
5588     QLIST_FOREACH(entry, &msi_route_list, list) {
5589         cnt++;
5590         vector = entry->vector;
5591         dev = entry->dev;
5592         if (msix_enabled(dev) && !msix_is_masked(dev, vector)) {
5593             msg = msix_get_message(dev, vector);
5594         } else if (msi_enabled(dev) && !msi_is_masked(dev, vector)) {
5595             msg = msi_get_message(dev, vector);
5596         } else {
5597             /*
5598              * Either MSI/MSIX is disabled for the device, or the
5599              * specific message was masked out.  Skip this one.
5600              */
5601             continue;
5602         }
5603         kvm_irqchip_update_msi_route(kvm_state, entry->virq, msg, dev);
5604     }
5605     kvm_irqchip_commit_routes(kvm_state);
5606     trace_kvm_x86_update_msi_routes(cnt);
5607 }
5608 
5609 int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route,
5610                                 int vector, PCIDevice *dev)
5611 {
5612     static bool notify_list_inited = false;
5613     MSIRouteEntry *entry;
5614 
5615     if (!dev) {
5616         /* These are (possibly) IOAPIC routes only used for split
5617          * kernel irqchip mode, while what we are housekeeping are
5618          * PCI devices only. */
5619         return 0;
5620     }
5621 
5622     entry = g_new0(MSIRouteEntry, 1);
5623     entry->dev = dev;
5624     entry->vector = vector;
5625     entry->virq = route->gsi;
5626     QLIST_INSERT_HEAD(&msi_route_list, entry, list);
5627 
5628     trace_kvm_x86_add_msi_route(route->gsi);
5629 
5630     if (!notify_list_inited) {
5631         /* For the first time we do add route, add ourselves into
5632          * IOMMU's IEC notify list if needed. */
5633         X86IOMMUState *iommu = x86_iommu_get_default();
5634         if (iommu) {
5635             x86_iommu_iec_register_notifier(iommu,
5636                                             kvm_update_msi_routes_all,
5637                                             NULL);
5638         }
5639         notify_list_inited = true;
5640     }
5641     return 0;
5642 }
5643 
5644 int kvm_arch_release_virq_post(int virq)
5645 {
5646     MSIRouteEntry *entry, *next;
5647     QLIST_FOREACH_SAFE(entry, &msi_route_list, list, next) {
5648         if (entry->virq == virq) {
5649             trace_kvm_x86_remove_msi_route(virq);
5650             QLIST_REMOVE(entry, list);
5651             g_free(entry);
5652             break;
5653         }
5654     }
5655     return 0;
5656 }
5657 
5658 int kvm_arch_msi_data_to_gsi(uint32_t data)
5659 {
5660     abort();
5661 }
5662 
5663 bool kvm_has_waitpkg(void)
5664 {
5665     return has_msr_umwait;
5666 }
5667 
5668 #define ARCH_REQ_XCOMP_GUEST_PERM       0x1025
5669 
5670 void kvm_request_xsave_components(X86CPU *cpu, uint64_t mask)
5671 {
5672     KVMState *s = kvm_state;
5673     uint64_t supported;
5674 
5675     mask &= XSTATE_DYNAMIC_MASK;
5676     if (!mask) {
5677         return;
5678     }
5679     /*
5680      * Just ignore bits that are not in CPUID[EAX=0xD,ECX=0].
5681      * ARCH_REQ_XCOMP_GUEST_PERM would fail, and QEMU has warned
5682      * about them already because they are not supported features.
5683      */
5684     supported = kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EAX);
5685     supported |= (uint64_t)kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EDX) << 32;
5686     mask &= supported;
5687 
5688     while (mask) {
5689         int bit = ctz64(mask);
5690         int rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit);
5691         if (rc) {
5692             /*
5693              * Older kernel version (<5.17) do not support
5694              * ARCH_REQ_XCOMP_GUEST_PERM, but also do not return
5695              * any dynamic feature from kvm_arch_get_supported_cpuid.
5696              */
5697             warn_report("prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure "
5698                         "for feature bit %d", bit);
5699         }
5700         mask &= ~BIT_ULL(bit);
5701     }
5702 }
5703 
5704 static int kvm_arch_get_notify_vmexit(Object *obj, Error **errp)
5705 {
5706     KVMState *s = KVM_STATE(obj);
5707     return s->notify_vmexit;
5708 }
5709 
5710 static void kvm_arch_set_notify_vmexit(Object *obj, int value, Error **errp)
5711 {
5712     KVMState *s = KVM_STATE(obj);
5713 
5714     if (s->fd != -1) {
5715         error_setg(errp, "Cannot set properties after the accelerator has been initialized");
5716         return;
5717     }
5718 
5719     s->notify_vmexit = value;
5720 }
5721 
5722 static void kvm_arch_get_notify_window(Object *obj, Visitor *v,
5723                                        const char *name, void *opaque,
5724                                        Error **errp)
5725 {
5726     KVMState *s = KVM_STATE(obj);
5727     uint32_t value = s->notify_window;
5728 
5729     visit_type_uint32(v, name, &value, errp);
5730 }
5731 
5732 static void kvm_arch_set_notify_window(Object *obj, Visitor *v,
5733                                        const char *name, void *opaque,
5734                                        Error **errp)
5735 {
5736     KVMState *s = KVM_STATE(obj);
5737     uint32_t value;
5738 
5739     if (s->fd != -1) {
5740         error_setg(errp, "Cannot set properties after the accelerator has been initialized");
5741         return;
5742     }
5743 
5744     if (!visit_type_uint32(v, name, &value, errp)) {
5745         return;
5746     }
5747 
5748     s->notify_window = value;
5749 }
5750 
5751 static void kvm_arch_get_xen_version(Object *obj, Visitor *v,
5752                                      const char *name, void *opaque,
5753                                      Error **errp)
5754 {
5755     KVMState *s = KVM_STATE(obj);
5756     uint32_t value = s->xen_version;
5757 
5758     visit_type_uint32(v, name, &value, errp);
5759 }
5760 
5761 static void kvm_arch_set_xen_version(Object *obj, Visitor *v,
5762                                      const char *name, void *opaque,
5763                                      Error **errp)
5764 {
5765     KVMState *s = KVM_STATE(obj);
5766     Error *error = NULL;
5767     uint32_t value;
5768 
5769     visit_type_uint32(v, name, &value, &error);
5770     if (error) {
5771         error_propagate(errp, error);
5772         return;
5773     }
5774 
5775     s->xen_version = value;
5776     if (value && xen_mode == XEN_DISABLED) {
5777         xen_mode = XEN_EMULATE;
5778     }
5779 }
5780 
5781 static void kvm_arch_get_xen_gnttab_max_frames(Object *obj, Visitor *v,
5782                                                const char *name, void *opaque,
5783                                                Error **errp)
5784 {
5785     KVMState *s = KVM_STATE(obj);
5786     uint16_t value = s->xen_gnttab_max_frames;
5787 
5788     visit_type_uint16(v, name, &value, errp);
5789 }
5790 
5791 static void kvm_arch_set_xen_gnttab_max_frames(Object *obj, Visitor *v,
5792                                                const char *name, void *opaque,
5793                                                Error **errp)
5794 {
5795     KVMState *s = KVM_STATE(obj);
5796     Error *error = NULL;
5797     uint16_t value;
5798 
5799     visit_type_uint16(v, name, &value, &error);
5800     if (error) {
5801         error_propagate(errp, error);
5802         return;
5803     }
5804 
5805     s->xen_gnttab_max_frames = value;
5806 }
5807 
5808 static void kvm_arch_get_xen_evtchn_max_pirq(Object *obj, Visitor *v,
5809                                              const char *name, void *opaque,
5810                                              Error **errp)
5811 {
5812     KVMState *s = KVM_STATE(obj);
5813     uint16_t value = s->xen_evtchn_max_pirq;
5814 
5815     visit_type_uint16(v, name, &value, errp);
5816 }
5817 
5818 static void kvm_arch_set_xen_evtchn_max_pirq(Object *obj, Visitor *v,
5819                                              const char *name, void *opaque,
5820                                              Error **errp)
5821 {
5822     KVMState *s = KVM_STATE(obj);
5823     Error *error = NULL;
5824     uint16_t value;
5825 
5826     visit_type_uint16(v, name, &value, &error);
5827     if (error) {
5828         error_propagate(errp, error);
5829         return;
5830     }
5831 
5832     s->xen_evtchn_max_pirq = value;
5833 }
5834 
5835 void kvm_arch_accel_class_init(ObjectClass *oc)
5836 {
5837     object_class_property_add_enum(oc, "notify-vmexit", "NotifyVMexitOption",
5838                                    &NotifyVmexitOption_lookup,
5839                                    kvm_arch_get_notify_vmexit,
5840                                    kvm_arch_set_notify_vmexit);
5841     object_class_property_set_description(oc, "notify-vmexit",
5842                                           "Enable notify VM exit");
5843 
5844     object_class_property_add(oc, "notify-window", "uint32",
5845                               kvm_arch_get_notify_window,
5846                               kvm_arch_set_notify_window,
5847                               NULL, NULL);
5848     object_class_property_set_description(oc, "notify-window",
5849                                           "Clock cycles without an event window "
5850                                           "after which a notification VM exit occurs");
5851 
5852     object_class_property_add(oc, "xen-version", "uint32",
5853                               kvm_arch_get_xen_version,
5854                               kvm_arch_set_xen_version,
5855                               NULL, NULL);
5856     object_class_property_set_description(oc, "xen-version",
5857                                           "Xen version to be emulated "
5858                                           "(in XENVER_version form "
5859                                           "e.g. 0x4000a for 4.10)");
5860 
5861     object_class_property_add(oc, "xen-gnttab-max-frames", "uint16",
5862                               kvm_arch_get_xen_gnttab_max_frames,
5863                               kvm_arch_set_xen_gnttab_max_frames,
5864                               NULL, NULL);
5865     object_class_property_set_description(oc, "xen-gnttab-max-frames",
5866                                           "Maximum number of grant table frames");
5867 
5868     object_class_property_add(oc, "xen-evtchn-max-pirq", "uint16",
5869                               kvm_arch_get_xen_evtchn_max_pirq,
5870                               kvm_arch_set_xen_evtchn_max_pirq,
5871                               NULL, NULL);
5872     object_class_property_set_description(oc, "xen-evtchn-max-pirq",
5873                                           "Maximum number of Xen PIRQs");
5874 }
5875 
5876 void kvm_set_max_apic_id(uint32_t max_apic_id)
5877 {
5878     kvm_vm_enable_cap(kvm_state, KVM_CAP_MAX_VCPU_ID, 0, max_apic_id);
5879 }
5880