xref: /qemu/target/i386/whpx/whpx-all.c (revision 4a1babe5)
1 /*
2  * QEMU Windows Hypervisor Platform accelerator (WHPX)
3  *
4  * Copyright Microsoft Corp. 2017
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2 or later.
7  * See the COPYING file in the top-level directory.
8  *
9  */
10 
11 #include "qemu/osdep.h"
12 #include "cpu.h"
13 #include "exec/address-spaces.h"
14 #include "exec/ioport.h"
15 #include "gdbstub/helpers.h"
16 #include "qemu/accel.h"
17 #include "sysemu/whpx.h"
18 #include "sysemu/cpus.h"
19 #include "sysemu/runstate.h"
20 #include "qemu/main-loop.h"
21 #include "hw/boards.h"
22 #include "hw/intc/ioapic.h"
23 #include "hw/i386/apic_internal.h"
24 #include "qemu/error-report.h"
25 #include "qapi/error.h"
26 #include "qapi/qapi-types-common.h"
27 #include "qapi/qapi-visit-common.h"
28 #include "migration/blocker.h"
29 #include <winerror.h>
30 
31 #include "whpx-internal.h"
32 #include "whpx-accel-ops.h"
33 
34 #include <winhvplatform.h>
35 #include <winhvemulation.h>
36 
37 #define HYPERV_APIC_BUS_FREQUENCY      (200000000ULL)
38 
39 static const WHV_REGISTER_NAME whpx_register_names[] = {
40 
41     /* X64 General purpose registers */
42     WHvX64RegisterRax,
43     WHvX64RegisterRcx,
44     WHvX64RegisterRdx,
45     WHvX64RegisterRbx,
46     WHvX64RegisterRsp,
47     WHvX64RegisterRbp,
48     WHvX64RegisterRsi,
49     WHvX64RegisterRdi,
50     WHvX64RegisterR8,
51     WHvX64RegisterR9,
52     WHvX64RegisterR10,
53     WHvX64RegisterR11,
54     WHvX64RegisterR12,
55     WHvX64RegisterR13,
56     WHvX64RegisterR14,
57     WHvX64RegisterR15,
58     WHvX64RegisterRip,
59     WHvX64RegisterRflags,
60 
61     /* X64 Segment registers */
62     WHvX64RegisterEs,
63     WHvX64RegisterCs,
64     WHvX64RegisterSs,
65     WHvX64RegisterDs,
66     WHvX64RegisterFs,
67     WHvX64RegisterGs,
68     WHvX64RegisterLdtr,
69     WHvX64RegisterTr,
70 
71     /* X64 Table registers */
72     WHvX64RegisterIdtr,
73     WHvX64RegisterGdtr,
74 
75     /* X64 Control Registers */
76     WHvX64RegisterCr0,
77     WHvX64RegisterCr2,
78     WHvX64RegisterCr3,
79     WHvX64RegisterCr4,
80     WHvX64RegisterCr8,
81 
82     /* X64 Debug Registers */
83     /*
84      * WHvX64RegisterDr0,
85      * WHvX64RegisterDr1,
86      * WHvX64RegisterDr2,
87      * WHvX64RegisterDr3,
88      * WHvX64RegisterDr6,
89      * WHvX64RegisterDr7,
90      */
91 
92     /* X64 Floating Point and Vector Registers */
93     WHvX64RegisterXmm0,
94     WHvX64RegisterXmm1,
95     WHvX64RegisterXmm2,
96     WHvX64RegisterXmm3,
97     WHvX64RegisterXmm4,
98     WHvX64RegisterXmm5,
99     WHvX64RegisterXmm6,
100     WHvX64RegisterXmm7,
101     WHvX64RegisterXmm8,
102     WHvX64RegisterXmm9,
103     WHvX64RegisterXmm10,
104     WHvX64RegisterXmm11,
105     WHvX64RegisterXmm12,
106     WHvX64RegisterXmm13,
107     WHvX64RegisterXmm14,
108     WHvX64RegisterXmm15,
109     WHvX64RegisterFpMmx0,
110     WHvX64RegisterFpMmx1,
111     WHvX64RegisterFpMmx2,
112     WHvX64RegisterFpMmx3,
113     WHvX64RegisterFpMmx4,
114     WHvX64RegisterFpMmx5,
115     WHvX64RegisterFpMmx6,
116     WHvX64RegisterFpMmx7,
117     WHvX64RegisterFpControlStatus,
118     WHvX64RegisterXmmControlStatus,
119 
120     /* X64 MSRs */
121     WHvX64RegisterEfer,
122 #ifdef TARGET_X86_64
123     WHvX64RegisterKernelGsBase,
124 #endif
125     WHvX64RegisterApicBase,
126     /* WHvX64RegisterPat, */
127     WHvX64RegisterSysenterCs,
128     WHvX64RegisterSysenterEip,
129     WHvX64RegisterSysenterEsp,
130     WHvX64RegisterStar,
131 #ifdef TARGET_X86_64
132     WHvX64RegisterLstar,
133     WHvX64RegisterCstar,
134     WHvX64RegisterSfmask,
135 #endif
136 
137     /* Interrupt / Event Registers */
138     /*
139      * WHvRegisterPendingInterruption,
140      * WHvRegisterInterruptState,
141      * WHvRegisterPendingEvent0,
142      * WHvRegisterPendingEvent1
143      * WHvX64RegisterDeliverabilityNotifications,
144      */
145 };
146 
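/*
 * Values are stored in the same order as whpx_register_names above:
 * values[i] holds the contents of whpx_register_names[i].
 */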
147 struct whpx_register_set {
148     WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
149 };
150 
151 /*
152  * The current implementation of instruction stepping sets the TF flag
153  * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
154  * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
155  *
156  * This approach has a few limitations:
157  *     1. Stepping over a PUSHF/SAHF instruction will save the TF flag
158  *        along with the other flags, possibly restoring it later. It would
159  *        result in another INT1 when the flags are restored, triggering
160  *        a stop in gdb that could be cleared by doing another step.
161  *
162  *        Stepping over a POPF/LAHF instruction will let it overwrite the
 163  *        TF flag, ending the stepping mode.
164  *
165  *     2. Stepping over an instruction raising an exception (e.g. INT, DIV,
166  *        or anything that could result in a page fault) will save the flags
167  *        to the stack, clear the TF flag, and let the guest execute the
168  *        handler. Normally, the guest will restore the original flags,
 169  *        which will resume single-stepping.
170  *
 171  *     3. Debuggers running inside the guest may wish to set TF to do
 172  *        instruction stepping. The INT1 events they generate would be
 173  *        intercepted by us, as long as gdb is connected to QEMU.
174  *
175  * In practice this means that:
176  *     1. Stepping through flags-modifying instructions may cause gdb to
177  *        continue or stop in unexpected places. This will be fully recoverable
178  *        and will not crash the target.
179  *
180  *     2. Stepping over an instruction that triggers an exception will step
181  *        over the exception handler, not into it.
182  *
 183  *     3. Debugging the guest via gdb while simultaneously running a
 184  *        debugger inside the guest may lead to unexpected effects. Removing
 185  *        all breakpoints set via QEMU will prevent any further interference
 186  *        with the guest-level debuggers.
187  *
188  * The limitations can be addressed as shown below:
189  *     1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
 190  *        stepping through them. The exact semantics of the instructions are
191  *        defined in the "Combined Volume Set of Intel 64 and IA-32
192  *        Architectures Software Developer's Manuals", however it involves a
193  *        fair amount of corner cases due to compatibility with real mode,
194  *        virtual 8086 mode, and differences between 64-bit and 32-bit modes.
195  *
196  *     2. We could step into the guest's exception handlers using the following
197  *        sequence:
198  *          a. Temporarily enable catching of all exception types via
199  *             whpx_set_exception_exit_bitmap().
200  *          b. Once an exception is intercepted, read the IDT/GDT and locate
201  *             the original handler.
202  *          c. Patch the original handler, injecting an INT3 at the beginning.
203  *          d. Update the exception exit bitmap to only catch the
204  *             WHvX64ExceptionTypeBreakpointTrap exception.
 205  *          e. Let the affected CPU run in exclusive mode.
206  *          f. Restore the original handler and the exception exit bitmap.
207  *        Note that handling all corner cases related to IDT/GDT is harder
208  *        than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
209  *        rough idea.
210  *
211  *     3. In order to properly support guest-level debugging in parallel with
 212  *        QEMU-level debugging, we would need to be able to pass some INT1
213  *        events to the guest. This could be done via the following methods:
214  *          a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
215  *             it seems to only work for interrupts and not software
216  *             exceptions.
217  *          b. Locating and patching the original handler by parsing IDT/GDT.
218  *             This involves relatively complex logic outlined in the previous
219  *             paragraph.
220  *          c. Emulating the exception invocation (i.e. manually updating RIP,
 221  *             RFLAGS, and pushing the old values to the stack). This is even more
222  *             complicated than the previous option, since it involves checking
223  *             CPL, gate attributes, and doing various adjustments depending
224  *             on the current CPU mode, whether the CPL is changing, etc.
225  */
226 typedef enum WhpxStepMode {
227     WHPX_STEP_NONE = 0,
228     /* Halt other VCPUs */
229     WHPX_STEP_EXCLUSIVE,
230 } WhpxStepMode;
231 
232 struct AccelCPUState {
233     WHV_EMULATOR_HANDLE emulator;
234     bool window_registered;
235     bool interruptable;
236     bool ready_for_pic_interrupt;
237     uint64_t tpr;
238     uint64_t apic_base;
239     bool interruption_pending;
240 
241     /* Must be the last field as it may have a tail */
242     WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
243 };
244 
245 static bool whpx_allowed;
246 static bool whp_dispatch_initialized;
247 static HMODULE hWinHvPlatform, hWinHvEmulation;
248 static uint32_t max_vcpu_index;
249 static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;
250 
251 struct whpx_state whpx_global;
252 struct WHPDispatch whp_dispatch;
253 
254 static bool whpx_has_xsave(void)
255 {
256     return whpx_xsave_cap.XsaveSupport;
257 }
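
/*
 * A minimal sketch (not part of the original file) of how a single 64-bit
 * register can be read through the dispatch table; the helper name below is
 * hypothetical. The bulk helpers whpx_get_registers()/whpx_set_registers()
 * further down follow the same pattern, using the whole whpx_register_names
 * array in one call.
 */
static inline uint64_t whpx_read_reg64_example(CPUState *cpu,
                                               WHV_REGISTER_NAME name)
{
    WHV_REGISTER_VALUE value = { 0 };
    HRESULT hr;

    /* Ask the hypervisor for exactly one register of this vCPU. */
    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx_global.partition, cpu->cpu_index, &name, 1, &value);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get register, hr=%08lx", hr);
        return 0;
    }

    return value.Reg64;
}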
258 
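/* Convert a QEMU segment cache entry to the WHV segment register format. */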
259 static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
260                                              int r86)
261 {
262     WHV_X64_SEGMENT_REGISTER hs;
263     unsigned flags = qs->flags;
264 
265     hs.Base = qs->base;
266     hs.Limit = qs->limit;
267     hs.Selector = qs->selector;
268 
269     if (v86) {
270         hs.Attributes = 0;
271         hs.SegmentType = 3;
272         hs.Present = 1;
273         hs.DescriptorPrivilegeLevel = 3;
274         hs.NonSystemSegment = 1;
275 
276     } else {
277         hs.Attributes = (flags >> DESC_TYPE_SHIFT);
278 
279         if (r86) {
280             /* hs.Base &= 0xfffff; */
281         }
282     }
283 
284     return hs;
285 }
286 
287 static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
288 {
289     SegmentCache qs;
290 
291     qs.base = hs->Base;
292     qs.limit = hs->Limit;
293     qs.selector = hs->Selector;
294 
295     qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;
296 
297     return qs;
298 }
299 
300 /* X64 Extended Control Registers */
301 static void whpx_set_xcrs(CPUState *cpu)
302 {
303     HRESULT hr;
304     struct whpx_state *whpx = &whpx_global;
305     WHV_REGISTER_VALUE xcr0;
306     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
307 
308     if (!whpx_has_xsave()) {
309         return;
310     }
311 
312     /* Only xcr0 is supported by the hypervisor currently */
313     xcr0.Reg64 = cpu_env(cpu)->xcr0;
314     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
315         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
316     if (FAILED(hr)) {
317         error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
318     }
319 }
320 
321 static int whpx_set_tsc(CPUState *cpu)
322 {
323     WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
324     WHV_REGISTER_VALUE tsc_val;
325     HRESULT hr;
326     struct whpx_state *whpx = &whpx_global;
327 
328     /*
329      * Suspend the partition prior to setting the TSC to reduce the variance
330      * in TSC across vCPUs. When the first vCPU runs post suspend, the
331      * partition is automatically resumed.
332      */
333     if (whp_dispatch.WHvSuspendPartitionTime) {
334 
335         /*
 336          * Failing to suspend the partition while setting the TSC is not a
 337          * fatal error. It just increases the likelihood of TSC variance
 338          * between vCPUs, and some guest OSes can handle that just fine.
339          */
340         hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
341         if (FAILED(hr)) {
342             warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
343         }
344     }
345 
346     tsc_val.Reg64 = cpu_env(cpu)->tsc;
347     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
348         whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
349     if (FAILED(hr)) {
350         error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
351         return -1;
352     }
353 
354     return 0;
355 }
356 
357 /*
 358  * The CR8 register in the CPU is mapped to the TPR register of the APIC;
359  * however, they use a slightly different encoding. Specifically:
360  *
361  *     APIC.TPR[bits 7:4] = CR8[bits 3:0]
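 *     e.g. CR8 = 0x3 corresponds to TPR = 0x30 (interrupt priority class 3).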
362  *
363  * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
364  * and IA-32 Architectures Software Developer's Manual.
365  *
366  * The functions below translate the value of CR8 to TPR and vice versa.
367  */
368 
369 static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
370 {
371     return tpr >> 4;
372 }
373 
374 static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
375 {
376     return cr8 << 4;
377 }
378 
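/*
 * Push the QEMU-side CPU state (general purpose, segment, control, FPU/XMM
 * registers and MSRs) to the hypervisor for this vCPU.
 */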
379 static void whpx_set_registers(CPUState *cpu, int level)
380 {
381     struct whpx_state *whpx = &whpx_global;
382     AccelCPUState *vcpu = cpu->accel;
383     X86CPU *x86_cpu = X86_CPU(cpu);
384     CPUX86State *env = &x86_cpu->env;
385     struct whpx_register_set vcxt;
386     HRESULT hr;
387     int idx;
388     int idx_next;
389     int i;
390     int v86, r86;
391 
392     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
393 
394     /*
 395      * The following MSRs have side effects on the guest or are too heavy
 396      * to set at runtime. Limit them to the full state update.
397      */
398     if (level >= WHPX_SET_RESET_STATE) {
399         whpx_set_tsc(cpu);
400     }
401 
402     memset(&vcxt, 0, sizeof(struct whpx_register_set));
403 
404     v86 = (env->eflags & VM_MASK);
405     r86 = !(env->cr[0] & CR0_PE_MASK);
406 
407     vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
408     vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);
409 
410     idx = 0;
411 
412     /* Indexes for first 16 registers match between HV and QEMU definitions */
413     idx_next = 16;
414     for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
415         vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
416     }
417     idx = idx_next;
418 
419     /* Same goes for RIP and RFLAGS */
420     assert(whpx_register_names[idx] == WHvX64RegisterRip);
421     vcxt.values[idx++].Reg64 = env->eip;
422 
423     assert(whpx_register_names[idx] == WHvX64RegisterRflags);
424     vcxt.values[idx++].Reg64 = env->eflags;
425 
426     /* Translate 6+4 segment registers. HV and QEMU order matches  */
427     assert(idx == WHvX64RegisterEs);
428     for (i = 0; i < 6; i += 1, idx += 1) {
429         vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
430     }
431 
432     assert(idx == WHvX64RegisterLdtr);
433     vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);
434 
435     assert(idx == WHvX64RegisterTr);
436     vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);
437 
438     assert(idx == WHvX64RegisterIdtr);
439     vcxt.values[idx].Table.Base = env->idt.base;
440     vcxt.values[idx].Table.Limit = env->idt.limit;
441     idx += 1;
442 
443     assert(idx == WHvX64RegisterGdtr);
444     vcxt.values[idx].Table.Base = env->gdt.base;
445     vcxt.values[idx].Table.Limit = env->gdt.limit;
446     idx += 1;
447 
448     /* CR0, 2, 3, 4, 8 */
449     assert(whpx_register_names[idx] == WHvX64RegisterCr0);
450     vcxt.values[idx++].Reg64 = env->cr[0];
451     assert(whpx_register_names[idx] == WHvX64RegisterCr2);
452     vcxt.values[idx++].Reg64 = env->cr[2];
453     assert(whpx_register_names[idx] == WHvX64RegisterCr3);
454     vcxt.values[idx++].Reg64 = env->cr[3];
455     assert(whpx_register_names[idx] == WHvX64RegisterCr4);
456     vcxt.values[idx++].Reg64 = env->cr[4];
457     assert(whpx_register_names[idx] == WHvX64RegisterCr8);
458     vcxt.values[idx++].Reg64 = vcpu->tpr;
459 
460     /* 8 Debug Registers - Skipped */
461 
462     /*
 463      * Extended control registers need to be handled separately depending
464      * on whether xsave is supported/enabled or not.
465      */
466     whpx_set_xcrs(cpu);
467 
468     /* 16 XMM registers */
469     assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
470     idx_next = idx + 16;
471     for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
472         vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
473         vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
474     }
475     idx = idx_next;
476 
477     /* 8 FP registers */
478     assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
479     for (i = 0; i < 8; i += 1, idx += 1) {
480         vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
481         /* vcxt.values[idx].Fp.AsUINT128.High64 =
482                env->fpregs[i].mmx.MMX_Q(1);
483         */
484     }
485 
486     /* FP control status register */
487     assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
488     vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
489     vcxt.values[idx].FpControlStatus.FpStatus =
490         (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
491     vcxt.values[idx].FpControlStatus.FpTag = 0;
492     for (i = 0; i < 8; ++i) {
493         vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
494     }
495     vcxt.values[idx].FpControlStatus.Reserved = 0;
496     vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
497     vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
498     idx += 1;
499 
500     /* XMM control status register */
501     assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
502     vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
503     vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
504     vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
505     idx += 1;
506 
507     /* MSRs */
508     assert(whpx_register_names[idx] == WHvX64RegisterEfer);
509     vcxt.values[idx++].Reg64 = env->efer;
510 #ifdef TARGET_X86_64
511     assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
512     vcxt.values[idx++].Reg64 = env->kernelgsbase;
513 #endif
514 
515     assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
516     vcxt.values[idx++].Reg64 = vcpu->apic_base;
517 
518     /* WHvX64RegisterPat - Skipped */
519 
520     assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
521     vcxt.values[idx++].Reg64 = env->sysenter_cs;
522     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
523     vcxt.values[idx++].Reg64 = env->sysenter_eip;
524     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
525     vcxt.values[idx++].Reg64 = env->sysenter_esp;
526     assert(whpx_register_names[idx] == WHvX64RegisterStar);
527     vcxt.values[idx++].Reg64 = env->star;
528 #ifdef TARGET_X86_64
529     assert(whpx_register_names[idx] == WHvX64RegisterLstar);
530     vcxt.values[idx++].Reg64 = env->lstar;
531     assert(whpx_register_names[idx] == WHvX64RegisterCstar);
532     vcxt.values[idx++].Reg64 = env->cstar;
533     assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
534     vcxt.values[idx++].Reg64 = env->fmask;
535 #endif
536 
537     /* Interrupt / Event Registers - Skipped */
538 
539     assert(idx == RTL_NUMBER_OF(whpx_register_names));
540 
541     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
542         whpx->partition, cpu->cpu_index,
543         whpx_register_names,
544         RTL_NUMBER_OF(whpx_register_names),
545         &vcxt.values[0]);
546 
547     if (FAILED(hr)) {
548         error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
549                      hr);
550     }
551 
552     return;
553 }
554 
555 static int whpx_get_tsc(CPUState *cpu)
556 {
557     WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
558     WHV_REGISTER_VALUE tsc_val;
559     HRESULT hr;
560     struct whpx_state *whpx = &whpx_global;
561 
562     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
563         whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
564     if (FAILED(hr)) {
565         error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
566         return -1;
567     }
568 
569     cpu_env(cpu)->tsc = tsc_val.Reg64;
570     return 0;
571 }
572 
573 /* X64 Extended Control Registers */
574 static void whpx_get_xcrs(CPUState *cpu)
575 {
576     HRESULT hr;
577     struct whpx_state *whpx = &whpx_global;
578     WHV_REGISTER_VALUE xcr0;
579     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
580 
581     if (!whpx_has_xsave()) {
582         return;
583     }
584 
585     /* Only xcr0 is supported by the hypervisor currently */
586     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
587         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
588     if (FAILED(hr)) {
589         error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
590         return;
591     }
592 
593     cpu_env(cpu)->xcr0 = xcr0.Reg64;
594 }
595 
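/*
 * Pull the vCPU state from the hypervisor back into the QEMU-side
 * CPUX86State.
 */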
596 static void whpx_get_registers(CPUState *cpu)
597 {
598     struct whpx_state *whpx = &whpx_global;
599     AccelCPUState *vcpu = cpu->accel;
600     X86CPU *x86_cpu = X86_CPU(cpu);
601     CPUX86State *env = &x86_cpu->env;
602     struct whpx_register_set vcxt;
603     uint64_t tpr, apic_base;
604     HRESULT hr;
605     int idx;
606     int idx_next;
607     int i;
608 
609     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
610 
611     if (!env->tsc_valid) {
612         whpx_get_tsc(cpu);
613         env->tsc_valid = !runstate_is_running();
614     }
615 
616     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
617         whpx->partition, cpu->cpu_index,
618         whpx_register_names,
619         RTL_NUMBER_OF(whpx_register_names),
620         &vcxt.values[0]);
621     if (FAILED(hr)) {
622         error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
623                      hr);
624     }
625 
626     if (whpx_apic_in_platform()) {
627         /*
628          * Fetch the TPR value from the emulated APIC. It may get overwritten
629          * below with the value from CR8 returned by
630          * WHvGetVirtualProcessorRegisters().
631          */
632         whpx_apic_get(x86_cpu->apic_state);
633         vcpu->tpr = whpx_apic_tpr_to_cr8(
634             cpu_get_apic_tpr(x86_cpu->apic_state));
635     }
636 
637     idx = 0;
638 
639     /* Indexes for first 16 registers match between HV and QEMU definitions */
640     idx_next = 16;
641     for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
642         env->regs[idx] = vcxt.values[idx].Reg64;
643     }
644     idx = idx_next;
645 
646     /* Same goes for RIP and RFLAGS */
647     assert(whpx_register_names[idx] == WHvX64RegisterRip);
648     env->eip = vcxt.values[idx++].Reg64;
649     assert(whpx_register_names[idx] == WHvX64RegisterRflags);
650     env->eflags = vcxt.values[idx++].Reg64;
651 
652     /* Translate 6+4 segment registers. HV and QEMU order matches  */
653     assert(idx == WHvX64RegisterEs);
654     for (i = 0; i < 6; i += 1, idx += 1) {
655         env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
656     }
657 
658     assert(idx == WHvX64RegisterLdtr);
659     env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
660     assert(idx == WHvX64RegisterTr);
661     env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
662     assert(idx == WHvX64RegisterIdtr);
663     env->idt.base = vcxt.values[idx].Table.Base;
664     env->idt.limit = vcxt.values[idx].Table.Limit;
665     idx += 1;
666     assert(idx == WHvX64RegisterGdtr);
667     env->gdt.base = vcxt.values[idx].Table.Base;
668     env->gdt.limit = vcxt.values[idx].Table.Limit;
669     idx += 1;
670 
671     /* CR0, 2, 3, 4, 8 */
672     assert(whpx_register_names[idx] == WHvX64RegisterCr0);
673     env->cr[0] = vcxt.values[idx++].Reg64;
674     assert(whpx_register_names[idx] == WHvX64RegisterCr2);
675     env->cr[2] = vcxt.values[idx++].Reg64;
676     assert(whpx_register_names[idx] == WHvX64RegisterCr3);
677     env->cr[3] = vcxt.values[idx++].Reg64;
678     assert(whpx_register_names[idx] == WHvX64RegisterCr4);
679     env->cr[4] = vcxt.values[idx++].Reg64;
680     assert(whpx_register_names[idx] == WHvX64RegisterCr8);
681     tpr = vcxt.values[idx++].Reg64;
682     if (tpr != vcpu->tpr) {
683         vcpu->tpr = tpr;
684         cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
685     }
686 
687     /* 8 Debug Registers - Skipped */
688 
689     /*
 690      * Extended control registers need to be handled separately depending
691      * on whether xsave is supported/enabled or not.
692      */
693     whpx_get_xcrs(cpu);
694 
695     /* 16 XMM registers */
696     assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
697     idx_next = idx + 16;
698     for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
699         env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
700         env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
701     }
702     idx = idx_next;
703 
704     /* 8 FP registers */
705     assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
706     for (i = 0; i < 8; i += 1, idx += 1) {
707         env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
708         /* env->fpregs[i].mmx.MMX_Q(1) =
709                vcxt.values[idx].Fp.AsUINT128.High64;
710         */
711     }
712 
713     /* FP control status register */
714     assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
715     env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
716     env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
717     env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
718     for (i = 0; i < 8; ++i) {
719         env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
720     }
721     env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
722     env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
723     idx += 1;
724 
725     /* XMM control status register */
726     assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
727     env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
728     idx += 1;
729 
730     /* MSRs */
731     assert(whpx_register_names[idx] == WHvX64RegisterEfer);
732     env->efer = vcxt.values[idx++].Reg64;
733 #ifdef TARGET_X86_64
734     assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
735     env->kernelgsbase = vcxt.values[idx++].Reg64;
736 #endif
737 
738     assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
739     apic_base = vcxt.values[idx++].Reg64;
740     if (apic_base != vcpu->apic_base) {
741         vcpu->apic_base = apic_base;
742         cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
743     }
744 
745     /* WHvX64RegisterPat - Skipped */
746 
747     assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
748     env->sysenter_cs = vcxt.values[idx++].Reg64;
749     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
750     env->sysenter_eip = vcxt.values[idx++].Reg64;
751     assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
752     env->sysenter_esp = vcxt.values[idx++].Reg64;
753     assert(whpx_register_names[idx] == WHvX64RegisterStar);
754     env->star = vcxt.values[idx++].Reg64;
755 #ifdef TARGET_X86_64
756     assert(whpx_register_names[idx] == WHvX64RegisterLstar);
757     env->lstar = vcxt.values[idx++].Reg64;
758     assert(whpx_register_names[idx] == WHvX64RegisterCstar);
759     env->cstar = vcxt.values[idx++].Reg64;
760     assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
761     env->fmask = vcxt.values[idx++].Reg64;
762 #endif
763 
764     /* Interrupt / Event Registers - Skipped */
765 
766     assert(idx == RTL_NUMBER_OF(whpx_register_names));
767 
768     if (whpx_apic_in_platform()) {
769         whpx_apic_get(x86_cpu->apic_state);
770     }
771 
772     x86_update_hflags(env);
773 
774     return;
775 }
776 
777 static HRESULT CALLBACK whpx_emu_ioport_callback(
778     void *ctx,
779     WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
780 {
781     MemTxAttrs attrs = { 0 };
782     address_space_rw(&address_space_io, IoAccess->Port, attrs,
783                      &IoAccess->Data, IoAccess->AccessSize,
784                      IoAccess->Direction);
785     return S_OK;
786 }
787 
788 static HRESULT CALLBACK whpx_emu_mmio_callback(
789     void *ctx,
790     WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
791 {
792     cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
793                            ma->Direction);
794     return S_OK;
795 }
796 
797 static HRESULT CALLBACK whpx_emu_getreg_callback(
798     void *ctx,
799     const WHV_REGISTER_NAME *RegisterNames,
800     UINT32 RegisterCount,
801     WHV_REGISTER_VALUE *RegisterValues)
802 {
803     HRESULT hr;
804     struct whpx_state *whpx = &whpx_global;
805     CPUState *cpu = (CPUState *)ctx;
806 
807     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
808         whpx->partition, cpu->cpu_index,
809         RegisterNames, RegisterCount,
810         RegisterValues);
811     if (FAILED(hr)) {
812         error_report("WHPX: Failed to get virtual processor registers,"
813                      " hr=%08lx", hr);
814     }
815 
816     return hr;
817 }
818 
819 static HRESULT CALLBACK whpx_emu_setreg_callback(
820     void *ctx,
821     const WHV_REGISTER_NAME *RegisterNames,
822     UINT32 RegisterCount,
823     const WHV_REGISTER_VALUE *RegisterValues)
824 {
825     HRESULT hr;
826     struct whpx_state *whpx = &whpx_global;
827     CPUState *cpu = (CPUState *)ctx;
828 
829     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
830         whpx->partition, cpu->cpu_index,
831         RegisterNames, RegisterCount,
832         RegisterValues);
833     if (FAILED(hr)) {
834         error_report("WHPX: Failed to set virtual processor registers,"
835                      " hr=%08lx", hr);
836     }
837 
838     /*
839      * The emulator just successfully wrote the register state. We clear the
 840      * dirty state to avoid a double write when the VP resumes.
841      */
842     cpu->vcpu_dirty = false;
843 
844     return hr;
845 }
846 
847 static HRESULT CALLBACK whpx_emu_translate_callback(
848     void *ctx,
849     WHV_GUEST_VIRTUAL_ADDRESS Gva,
850     WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
851     WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
852     WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
853 {
854     HRESULT hr;
855     struct whpx_state *whpx = &whpx_global;
856     CPUState *cpu = (CPUState *)ctx;
857     WHV_TRANSLATE_GVA_RESULT res;
858 
859     hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
860                                       Gva, TranslateFlags, &res, Gpa);
861     if (FAILED(hr)) {
862         error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
863     } else {
864         *TranslationResult = res.ResultCode;
865     }
866 
867     return hr;
868 }
869 
870 static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
871     .Size = sizeof(WHV_EMULATOR_CALLBACKS),
872     .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
873     .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
874     .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
875     .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
876     .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
877 };
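
/*
 * These callbacks are registered with the WHv instruction emulator when the
 * per-vCPU emulator instance (vcpu->emulator) is created (not shown in this
 * excerpt); the emulator invokes them while completing MMIO and port I/O
 * accesses below.
 */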
878 
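/* Complete an MMIO access that caused a VM exit, using the WHv emulator. */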
879 static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
880 {
881     HRESULT hr;
882     AccelCPUState *vcpu = cpu->accel;
883     WHV_EMULATOR_STATUS emu_status;
884 
885     hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
886         vcpu->emulator, cpu,
887         &vcpu->exit_ctx.VpContext, ctx,
888         &emu_status);
889     if (FAILED(hr)) {
890         error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
891         return -1;
892     }
893 
894     if (!emu_status.EmulationSuccessful) {
895         error_report("WHPX: Failed to emulate MMIO access with"
896                      " EmulatorReturnStatus: %u", emu_status.AsUINT32);
897         return -1;
898     }
899 
900     return 0;
901 }
902 
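/* Complete a port I/O access that caused a VM exit, using the WHv emulator. */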
903 static int whpx_handle_portio(CPUState *cpu,
904                               WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
905 {
906     HRESULT hr;
907     AccelCPUState *vcpu = cpu->accel;
908     WHV_EMULATOR_STATUS emu_status;
909 
910     hr = whp_dispatch.WHvEmulatorTryIoEmulation(
911         vcpu->emulator, cpu,
912         &vcpu->exit_ctx.VpContext, ctx,
913         &emu_status);
914     if (FAILED(hr)) {
915         error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
916         return -1;
917     }
918 
919     if (!emu_status.EmulationSuccessful) {
920         error_report("WHPX: Failed to emulate PortIO access with"
921                      " EmulatorReturnStatus: %u", emu_status.AsUINT32);
922         return -1;
923     }
924 
925     return 0;
926 }
927 
928 /*
929  * Controls whether we should intercept various exceptions on the guest,
930  * namely breakpoint/single-step events.
931  *
 932  * The 'exceptions' argument accepts a bitmask, e.g.:
933  * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
934  */
935 static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
936 {
937     struct whpx_state *whpx = &whpx_global;
938     WHV_PARTITION_PROPERTY prop = { 0, };
939     HRESULT hr;
940 
941     if (exceptions == whpx->exception_exit_bitmap) {
942         return S_OK;
943     }
944 
945     prop.ExceptionExitBitmap = exceptions;
946 
947     hr = whp_dispatch.WHvSetPartitionProperty(
948         whpx->partition,
949         WHvPartitionPropertyCodeExceptionExitBitmap,
950         &prop,
951         sizeof(WHV_PARTITION_PROPERTY));
952 
953     if (SUCCEEDED(hr)) {
954         whpx->exception_exit_bitmap = exceptions;
955     }
956 
957     return hr;
958 }
959 
960 
961 /*
962  * This function is called before/after stepping over a single instruction.
963  * It will update the CPU registers to arm/disarm the instruction stepping
964  * accordingly.
965  */
966 static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
967     bool set,
968     uint64_t *exit_context_rflags)
969 {
970     WHV_REGISTER_NAME reg_name;
971     WHV_REGISTER_VALUE reg_value;
972     HRESULT hr;
973     struct whpx_state *whpx = &whpx_global;
974 
975     /*
976      * If we are trying to step over a single instruction, we need to set the
977      * TF bit in rflags. Otherwise, clear it.
978      */
979     reg_name = WHvX64RegisterRflags;
980     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
981         whpx->partition,
982         cpu->cpu_index,
983         &reg_name,
984         1,
985         &reg_value);
986 
987     if (FAILED(hr)) {
988         error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
989         return hr;
990     }
991 
992     if (exit_context_rflags) {
993         assert(*exit_context_rflags == reg_value.Reg64);
994     }
995 
996     if (set) {
997         /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
998         reg_value.Reg64 |= TF_MASK;
999     } else {
1000         reg_value.Reg64 &= ~TF_MASK;
1001     }
1002 
1003     if (exit_context_rflags) {
1004         *exit_context_rflags = reg_value.Reg64;
1005     }
1006 
1007     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1008         whpx->partition,
1009         cpu->cpu_index,
1010         &reg_name,
1011         1,
1012         &reg_value);
1013 
1014     if (FAILED(hr)) {
1015         error_report("WHPX: Failed to set rflags,"
1016             " hr=%08lx",
1017             hr);
1018         return hr;
1019     }
1020 
1021     reg_name = WHvRegisterInterruptState;
1022     reg_value.Reg64 = 0;
1023 
1024     /* Suspend delivery of hardware interrupts during single-stepping. */
1025     reg_value.InterruptState.InterruptShadow = set != 0;
1026 
1027     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1028         whpx->partition,
1029         cpu->cpu_index,
1030         &reg_name,
1031         1,
1032         &reg_value);
1033 
1034     if (FAILED(hr)) {
1035         error_report("WHPX: Failed to set InterruptState,"
1036             " hr=%08lx",
1037             hr);
1038         return hr;
1039     }
1040 
1041     if (!set) {
1042         /*
1043          * We have just finished stepping over a single instruction,
1044          * and intercepted the INT1 generated by it.
1045          * We need to now hide the INT1 from the guest,
1046          * as it would not be expecting it.
1047          */
1048 
1049         reg_name = WHvX64RegisterPendingDebugException;
1050         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1051             whpx->partition,
1052             cpu->cpu_index,
1053             &reg_name,
1054             1,
1055             &reg_value);
1056 
1057         if (FAILED(hr)) {
1058             error_report("WHPX: Failed to get pending debug exceptions,"
1059                          " hr=%08lx", hr);
1060             return hr;
1061         }
1062 
1063         if (reg_value.PendingDebugException.SingleStep) {
1064             reg_value.PendingDebugException.SingleStep = 0;
1065 
1066             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1067                 whpx->partition,
1068                 cpu->cpu_index,
1069                 &reg_name,
1070                 1,
1071                 &reg_value);
1072 
1073             if (FAILED(hr)) {
1074                 error_report("WHPX: Failed to clear pending debug exceptions,"
1075                              " hr=%08lx", hr);
1076                 return hr;
1077             }
1078         }
1079 
1080     }
1081 
1082     return S_OK;
1083 }
1084 
1085 /* Tries to find a breakpoint at the specified address. */
1086 static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
1087 {
1088     struct whpx_state *whpx = &whpx_global;
1089     int i;
1090 
1091     if (whpx->breakpoints.breakpoints) {
1092         for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) {
1093             if (address == whpx->breakpoints.breakpoints->data[i].address) {
1094                 return &whpx->breakpoints.breakpoints->data[i];
1095             }
1096         }
1097     }
1098 
1099     return NULL;
1100 }
1101 
1102 /*
1103  * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
1104  * debugging user-mode applications. Since the WHPX API does not offer
1105  * an easy way to pass the intercepted exception back to the guest, we
1106  * resort to using INT1 instead, and let the guest always handle INT3.
1107  */
1108 static const uint8_t whpx_breakpoint_instruction = 0xF1;
1109 
1110 /*
1111  * The WHPX QEMU backend implements breakpoints by writing the INT1
1112  * instruction into memory (ignoring the DRx registers). This raises a few
1113  * issues that need to be carefully handled:
1114  *
1115  * 1. Although unlikely, other parts of QEMU may set multiple breakpoints
1116  *    at the same location, and later remove them in arbitrary order.
1117  *    This should not cause memory corruption, and should only remove the
1118  *    physical breakpoint instruction when the last QEMU breakpoint is gone.
1119  *
1120  * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
1121  *    physical location. Hence, physically adding/removing a breakpoint can
1122  *    theoretically fail at any time. We need to keep track of it.
1123  *
1124  * The function below rebuilds a list of low-level breakpoints (one per
1125  * address, tracking the original instruction and any errors) from the list of
1126  * high-level breakpoints (set via cpu_breakpoint_insert()).
1127  *
1128  * In order to optimize performance, this function stores the list of
1129  * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
1130  * low-level ones, so that it won't be re-invoked until these breakpoints
1131  * change.
1132  *
1133  * Note that this function decides which breakpoints should be inserted into
1134  * memory, but doesn't actually do it. The actual memory access is done in
1135  * whpx_apply_breakpoints().
1136  */
1137 static void whpx_translate_cpu_breakpoints(
1138     struct whpx_breakpoints *breakpoints,
1139     CPUState *cpu,
1140     int cpu_breakpoint_count)
1141 {
1142     CPUBreakpoint *bp;
1143     int cpu_bp_index = 0;
1144 
1145     breakpoints->original_addresses =
1146         g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);
1147 
1148     breakpoints->original_address_count = cpu_breakpoint_count;
1149 
1150     int max_breakpoints = cpu_breakpoint_count +
1151         (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);
1152 
1153     struct whpx_breakpoint_collection *new_breakpoints =
1154         g_malloc0(sizeof(struct whpx_breakpoint_collection)
1155                   + max_breakpoints * sizeof(struct whpx_breakpoint));
1156 
1157     new_breakpoints->allocated = max_breakpoints;
1158     new_breakpoints->used = 0;
1159 
1160     /*
1161      * 1. Preserve all old breakpoints that could not be automatically
1162      * cleared when the CPU got stopped.
1163      */
1164     if (breakpoints->breakpoints) {
1165         int i;
1166         for (i = 0; i < breakpoints->breakpoints->used; i++) {
1167             if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
1168                 new_breakpoints->data[new_breakpoints->used++] =
1169                     breakpoints->breakpoints->data[i];
1170             }
1171         }
1172     }
1173 
1174     /* 2. Map all CPU breakpoints to WHPX breakpoints */
1175     QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1176         int i;
1177         bool found = false;
1178 
1179         /* This will be used to detect changed CPU breakpoints later. */
1180         breakpoints->original_addresses[cpu_bp_index++] = bp->pc;
1181 
1182         for (i = 0; i < new_breakpoints->used; i++) {
1183             /*
1184              * WARNING: This loop has O(N^2) complexity, where N is the
1185              * number of breakpoints. It should not be a bottleneck in
1186              * real-world scenarios, since it only needs to run once after
1187              * the breakpoints have been modified.
1188              * If this ever becomes a concern, it can be optimized by storing
1189              * high-level breakpoint objects in a tree or hash map.
1190              */
1191 
1192             if (new_breakpoints->data[i].address == bp->pc) {
1193                 /* There was already a breakpoint at this address. */
1194                 if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
1195                     new_breakpoints->data[i].state = WHPX_BP_SET;
1196                 } else if (new_breakpoints->data[i].state == WHPX_BP_SET) {
1197                     new_breakpoints->data[i].state = WHPX_BP_SET_PENDING;
1198                 }
1199 
1200                 found = true;
1201                 break;
1202             }
1203         }
1204 
1205         if (!found && new_breakpoints->used < new_breakpoints->allocated) {
1206             /* No WHPX breakpoint at this address. Create one. */
1207             new_breakpoints->data[new_breakpoints->used].address = bp->pc;
1208             new_breakpoints->data[new_breakpoints->used].state =
1209                 WHPX_BP_SET_PENDING;
1210             new_breakpoints->used++;
1211         }
1212     }
1213 
1214     /*
1215      * Free the previous breakpoint list. This can be optimized by keeping
1216      * it as a shadow buffer for the next computation instead of freeing
1217      * it immediately.
1218      */
1219     g_free(breakpoints->breakpoints);
1220 
1221     breakpoints->breakpoints = new_breakpoints;
1222 }
1223 
1224 /*
1225  * Physically inserts/removes the breakpoints by reading and writing the
1226  * physical memory, keeping track of failed attempts.
1227  *
1228  * Passing resuming=true  will try to set all previously unset breakpoints.
1229  * Passing resuming=false will remove all inserted ones.
1230  */
1231 static void whpx_apply_breakpoints(
1232     struct whpx_breakpoint_collection *breakpoints,
1233     CPUState *cpu,
1234     bool resuming)
1235 {
1236     int i, rc;
1237     if (!breakpoints) {
1238         return;
1239     }
1240 
1241     for (i = 0; i < breakpoints->used; i++) {
1242         /* Decide what to do right now based on the last known state. */
1243         WhpxBreakpointState state = breakpoints->data[i].state;
1244         switch (state) {
1245         case WHPX_BP_CLEARED:
1246             if (resuming) {
1247                 state = WHPX_BP_SET_PENDING;
1248             }
1249             break;
1250         case WHPX_BP_SET_PENDING:
1251             if (!resuming) {
1252                 state = WHPX_BP_CLEARED;
1253             }
1254             break;
1255         case WHPX_BP_SET:
1256             if (!resuming) {
1257                 state = WHPX_BP_CLEAR_PENDING;
1258             }
1259             break;
1260         case WHPX_BP_CLEAR_PENDING:
1261             if (resuming) {
1262                 state = WHPX_BP_SET;
1263             }
1264             break;
1265         }
1266 
1267         if (state == WHPX_BP_SET_PENDING) {
1268             /* Remember the original instruction. */
1269             rc = cpu_memory_rw_debug(cpu,
1270                 breakpoints->data[i].address,
1271                 &breakpoints->data[i].original_instruction,
1272                 1,
1273                 false);
1274 
1275             if (!rc) {
1276                 /* Write the breakpoint instruction. */
1277                 rc = cpu_memory_rw_debug(cpu,
1278                     breakpoints->data[i].address,
1279                     (void *)&whpx_breakpoint_instruction,
1280                     1,
1281                     true);
1282             }
1283 
1284             if (!rc) {
1285                 state = WHPX_BP_SET;
1286             }
1287 
1288         }
1289 
1290         if (state == WHPX_BP_CLEAR_PENDING) {
1291             /* Restore the original instruction. */
1292             rc = cpu_memory_rw_debug(cpu,
1293                 breakpoints->data[i].address,
1294                 &breakpoints->data[i].original_instruction,
1295                 1,
1296                 true);
1297 
1298             if (!rc) {
1299                 state = WHPX_BP_CLEARED;
1300             }
1301         }
1302 
1303         breakpoints->data[i].state = state;
1304     }
1305 }
1306 
1307 /*
1308  * This function is called when the a VCPU is about to start and no other
1309  * VCPUs have been started so far. Since the VCPU start order could be
1310  * arbitrary, it doesn't have to be VCPU#0.
1311  *
1312  * It is used to commit the breakpoints into memory, and configure WHPX
1313  * to intercept debug exceptions.
1314  *
1315  * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
1316  * more VCPUs are already running, so this is the best place to do it.
1317  */
1318 static int whpx_first_vcpu_starting(CPUState *cpu)
1319 {
1320     struct whpx_state *whpx = &whpx_global;
1321     HRESULT hr;
1322 
1323     g_assert(bql_locked());
1324 
1325     if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
1326             (whpx->breakpoints.breakpoints &&
1327              whpx->breakpoints.breakpoints->used)) {
1328         CPUBreakpoint *bp;
1329         int i = 0;
1330         bool update_pending = false;
1331 
1332         QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1333             if (i >= whpx->breakpoints.original_address_count ||
1334                 bp->pc != whpx->breakpoints.original_addresses[i]) {
1335                 update_pending = true;
1336             }
1337 
1338             i++;
1339         }
1340 
1341         if (i != whpx->breakpoints.original_address_count) {
1342             update_pending = true;
1343         }
1344 
1345         if (update_pending) {
1346             /*
1347              * The CPU breakpoints have changed since the last call to
1348              * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
1349              * now be recomputed.
1350              */
1351             whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
1352         }
1353 
1354         /* Actually insert the breakpoints into the memory. */
1355         whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
1356     }
1357 
1358     uint64_t exception_mask;
1359     if (whpx->step_pending ||
1360         (whpx->breakpoints.breakpoints &&
1361          whpx->breakpoints.breakpoints->used)) {
1362         /*
1363          * We are either attempting to single-step one or more CPUs, or
1364          * have one or more breakpoints enabled. Both require intercepting
1365          * the WHvX64ExceptionTypeDebugTrapOrFault exception.
1366          */
1367 
1368         exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault;
1369     } else {
1370         /* Let the guest handle all exceptions. */
1371         exception_mask = 0;
1372     }
1373 
1374     hr = whpx_set_exception_exit_bitmap(exception_mask);
1375     if (!SUCCEEDED(hr)) {
1376         error_report("WHPX: Failed to update exception exit mask,"
1377                      " hr=%08lx.", hr);
1378         return 1;
1379     }
1380 
1381     return 0;
1382 }
1383 
1384 /*
1385  * This function is called when the last VCPU has finished running.
1386  * It is used to remove any previously set breakpoints from memory.
1387  */
1388 static int whpx_last_vcpu_stopping(CPUState *cpu)
1389 {
1390     whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false);
1391     return 0;
1392 }
1393 
1394 /* Returns the address of the next instruction that is about to be executed. */
1395 static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
1396 {
1397     if (cpu->vcpu_dirty) {
1398         /* The CPU registers have been modified by other parts of QEMU. */
1399         return cpu_env(cpu)->eip;
1400     } else if (exit_context_valid) {
1401         /*
1402          * The CPU registers have been modified neither by other parts
1403          * of QEMU nor by this port calling WHvSetVirtualProcessorRegisters().
1404          * This is the most common case.
1405          */
1406         AccelCPUState *vcpu = cpu->accel;
1407         return vcpu->exit_ctx.VpContext.Rip;
1408     } else {
1409         /*
1410          * The CPU registers have been modified by a call to
1411          * WHvSetVirtualProcessorRegisters() and must be re-queried from
1412          * the target.
1413          */
1414         WHV_REGISTER_VALUE reg_value;
1415         WHV_REGISTER_NAME reg_name = WHvX64RegisterRip;
1416         HRESULT hr;
1417         struct whpx_state *whpx = &whpx_global;
1418 
1419         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1420             whpx->partition,
1421             cpu->cpu_index,
1422             &reg_name,
1423             1,
1424             &reg_value);
1425 
1426         if (FAILED(hr)) {
1427             error_report("WHPX: Failed to get PC, hr=%08lx", hr);
1428             return 0;
1429         }
1430 
1431         return reg_value.Reg64;
1432     }
1433 }
1434 
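/*
 * Emulate HLT: halt the vCPU unless a maskable interrupt (with IF set) or
 * an NMI is already pending. Returns non-zero if the vCPU was halted.
 */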
1435 static int whpx_handle_halt(CPUState *cpu)
1436 {
1437     int ret = 0;
1438 
1439     bql_lock();
1440     if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
1441           (cpu_env(cpu)->eflags & IF_MASK)) &&
1442         !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
1443         cpu->exception_index = EXCP_HLT;
1444         cpu->halted = true;
1445         ret = 1;
1446     }
1447     bql_unlock();
1448 
1449     return ret;
1450 }
1451 
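/*
 * Prepare the vCPU for the next run: inject pending NMIs and PIC interrupts,
 * sync the TPR to CR8, and request an interrupt-window notification when an
 * interrupt is pending but cannot be delivered yet.
 */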
1452 static void whpx_vcpu_pre_run(CPUState *cpu)
1453 {
1454     HRESULT hr;
1455     struct whpx_state *whpx = &whpx_global;
1456     AccelCPUState *vcpu = cpu->accel;
1457     X86CPU *x86_cpu = X86_CPU(cpu);
1458     CPUX86State *env = &x86_cpu->env;
1459     int irq;
1460     uint8_t tpr;
1461     WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
1462     UINT32 reg_count = 0;
1463     WHV_REGISTER_VALUE reg_values[3];
1464     WHV_REGISTER_NAME reg_names[3];
1465 
1466     memset(&new_int, 0, sizeof(new_int));
1467     memset(reg_values, 0, sizeof(reg_values));
1468 
1469     bql_lock();
1470 
1471     /* Inject NMI */
1472     if (!vcpu->interruption_pending &&
1473         cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
1474         if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
1475             cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
1476             vcpu->interruptable = false;
1477             new_int.InterruptionType = WHvX64PendingNmi;
1478             new_int.InterruptionPending = 1;
1479             new_int.InterruptionVector = 2;
1480         }
1481         if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
1482             cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
1483         }
1484     }
1485 
1486     /*
1487      * Force the VCPU out of its inner loop to process any INIT requests or
1488      * commit pending TPR access.
1489      */
1490     if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
1491         if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
1492             !(env->hflags & HF_SMM_MASK)) {
1493             cpu->exit_request = 1;
1494         }
1495         if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
1496             cpu->exit_request = 1;
1497         }
1498     }
1499 
1500     /* Get pending hard interruption or replay one that was overwritten */
1501     if (!whpx_apic_in_platform()) {
1502         if (!vcpu->interruption_pending &&
1503             vcpu->interruptable && (env->eflags & IF_MASK)) {
1504             assert(!new_int.InterruptionPending);
1505             if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
1506                 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
1507                 irq = cpu_get_pic_interrupt(env);
1508                 if (irq >= 0) {
1509                     new_int.InterruptionType = WHvX64PendingInterrupt;
1510                     new_int.InterruptionPending = 1;
1511                     new_int.InterruptionVector = irq;
1512                 }
1513             }
1514         }
1515 
1516         /* Setup interrupt state if new one was prepared */
1517         if (new_int.InterruptionPending) {
1518             reg_values[reg_count].PendingInterruption = new_int;
1519             reg_names[reg_count] = WHvRegisterPendingInterruption;
1520             reg_count += 1;
1521         }
1522     } else if (vcpu->ready_for_pic_interrupt &&
1523                (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
1524         cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
1525         irq = cpu_get_pic_interrupt(env);
1526         if (irq >= 0) {
1527             reg_names[reg_count] = WHvRegisterPendingEvent;
1528             reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
1529             {
1530                 .EventPending = 1,
1531                 .EventType = WHvX64PendingEventExtInt,
1532                 .Vector = irq,
1533             };
1534             reg_count += 1;
1535         }
1536     }
1537 
1538     /* Sync the TPR to CR8 if it was modified during the intercept */
1539     tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
1540     if (tpr != vcpu->tpr) {
1541         vcpu->tpr = tpr;
1542         reg_values[reg_count].Reg64 = tpr;
1543         cpu->exit_request = 1;
1544         reg_names[reg_count] = WHvX64RegisterCr8;
1545         reg_count += 1;
1546     }
1547 
1548     /* Update the state of the interrupt delivery notification */
1549     if (!vcpu->window_registered &&
1550         cpu->interrupt_request & CPU_INTERRUPT_HARD) {
1551         reg_values[reg_count].DeliverabilityNotifications =
1552             (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
1553                 .InterruptNotification = 1
1554             };
1555         vcpu->window_registered = 1;
1556         reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
1557         reg_count += 1;
1558     }
1559 
1560     bql_unlock();
1561     vcpu->ready_for_pic_interrupt = false;
1562 
1563     if (reg_count) {
1564         hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1565             whpx->partition, cpu->cpu_index,
1566             reg_names, reg_count, reg_values);
1567         if (FAILED(hr)) {
1568             error_report("WHPX: Failed to set interrupt state registers,"
1569                          " hr=%08lx", hr);
1570         }
1571     }
1572 
1573     return;
1574 }
1575 
1576 static void whpx_vcpu_post_run(CPUState *cpu)
1577 {
1578     AccelCPUState *vcpu = cpu->accel;
1579     X86CPU *x86_cpu = X86_CPU(cpu);
1580     CPUX86State *env = &x86_cpu->env;
1581 
1582     env->eflags = vcpu->exit_ctx.VpContext.Rflags;
1583 
1584     uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8;
1585     if (vcpu->tpr != tpr) {
1586         vcpu->tpr = tpr;
1587         bql_lock();
1588         cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr));
1589         bql_unlock();
1590     }
1591 
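         /*
          * Cache the interruptibility state reported by the exit context so
          * that whpx_vcpu_pre_run() can decide whether a new interrupt may be
          * injected.
          */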
1592     vcpu->interruption_pending =
1593         vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;
1594 
1595     vcpu->interruptable =
1596         !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
1597 
1598     return;
1599 }
1600 
1601 static void whpx_vcpu_process_async_events(CPUState *cpu)
1602 {
1603     X86CPU *x86_cpu = X86_CPU(cpu);
1604     CPUX86State *env = &x86_cpu->env;
1605     AccelCPUState *vcpu = cpu->accel;
1606 
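         /* An INIT that arrives outside of SMM resets the vCPU state. */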
1607     if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
1608         !(env->hflags & HF_SMM_MASK)) {
1609         whpx_cpu_synchronize_state(cpu);
1610         do_cpu_init(x86_cpu);
1611         vcpu->interruptable = true;
1612     }
1613 
1614     if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
1615         cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
1616         apic_poll_irq(x86_cpu->apic_state);
1617     }
1618 
1619     if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
1620          (env->eflags & IF_MASK)) ||
1621         (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
1622         cpu->halted = false;
1623     }
1624 
1625     if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
1626         whpx_cpu_synchronize_state(cpu);
1627         do_cpu_sipi(x86_cpu);
1628     }
1629 
1630     if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
1631         cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
1632         whpx_cpu_synchronize_state(cpu);
1633         apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
1634                                       env->tpr_access_type);
1635     }
1636 
1637     return;
1638 }
1639 
1640 static int whpx_vcpu_run(CPUState *cpu)
1641 {
1642     HRESULT hr;
1643     struct whpx_state *whpx = &whpx_global;
1644     AccelCPUState *vcpu = cpu->accel;
1645     struct whpx_breakpoint *stepped_over_bp = NULL;
1646     WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
1647     int ret;
1648 
1649     g_assert(bql_locked());
1650 
1651     if (whpx->running_cpus++ == 0) {
1652         /* Insert breakpoints into memory, update exception exit bitmap. */
1653         ret = whpx_first_vcpu_starting(cpu);
1654         if (ret != 0) {
1655             return ret;
1656         }
1657     }
1658 
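         /*
          * If the vCPU stopped right on one of our software breakpoints, the
          * original instruction has to be restored and stepped over before
          * execution can resume normally.
          */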
1659     if (whpx->breakpoints.breakpoints &&
1660         whpx->breakpoints.breakpoints->used > 0)
1661     {
1662         uint64_t pc = whpx_vcpu_get_pc(cpu, true);
1663         stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
1664         if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
1665             stepped_over_bp = NULL;
1666         }
1667 
1668         if (stepped_over_bp) {
1669             /*
1670              * We are trying to run the instruction overwritten by an active
1671              * breakpoint. We will temporarily disable the breakpoint, suspend
1672              * other CPUs, and step over the instruction.
1673              */
1674             exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
1675         }
1676     }
1677 
1678     if (exclusive_step_mode == WHPX_STEP_NONE) {
1679         whpx_vcpu_process_async_events(cpu);
1680         if (cpu->halted && !whpx_apic_in_platform()) {
1681             cpu->exception_index = EXCP_HLT;
1682             qatomic_set(&cpu->exit_request, false);
1683             return 0;
1684         }
1685     }
1686 
1687     bql_unlock();
1688 
1689     if (exclusive_step_mode != WHPX_STEP_NONE) {
1690         start_exclusive();
1691         g_assert(cpu == current_cpu);
1692         g_assert(!cpu->running);
1693         cpu->running = true;
1694 
1695         hr = whpx_set_exception_exit_bitmap(
1696             1UL << WHvX64ExceptionTypeDebugTrapOrFault);
1697         if (!SUCCEEDED(hr)) {
1698             error_report("WHPX: Failed to update exception exit mask, "
1699                          "hr=%08lx.", hr);
1700             return 1;
1701         }
1702 
1703         if (stepped_over_bp) {
1704             /* Temporarily disable the triggered breakpoint. */
1705             cpu_memory_rw_debug(cpu,
1706                 stepped_over_bp->address,
1707                 &stepped_over_bp->original_instruction,
1708                 1,
1709                 true);
1710         }
1711     } else {
1712         cpu_exec_start(cpu);
1713     }
1714 
1715     do {
1716         if (cpu->vcpu_dirty) {
1717             whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
1718             cpu->vcpu_dirty = false;
1719         }
1720 
1721         if (exclusive_step_mode == WHPX_STEP_NONE) {
1722             whpx_vcpu_pre_run(cpu);
1723 
1724             if (qatomic_read(&cpu->exit_request)) {
1725                 whpx_vcpu_kick(cpu);
1726             }
1727         }
1728 
1729         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1730             whpx_vcpu_configure_single_stepping(cpu, true, NULL);
1731         }
1732 
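             /*
              * Enter the guest. WHvRunVirtualProcessor blocks until the next
              * VM exit, or until whpx_vcpu_kick() cancels the run.
              */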
1733         hr = whp_dispatch.WHvRunVirtualProcessor(
1734             whpx->partition, cpu->cpu_index,
1735             &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));
1736 
1737         if (FAILED(hr)) {
1738             error_report("WHPX: Failed to exec a virtual processor,"
1739                          " hr=%08lx", hr);
1740             ret = -1;
1741             break;
1742         }
1743 
1744         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1745             whpx_vcpu_configure_single_stepping(cpu,
1746                 false,
1747                 &vcpu->exit_ctx.VpContext.Rflags);
1748         }
1749 
1750         whpx_vcpu_post_run(cpu);
1751 
1752         switch (vcpu->exit_ctx.ExitReason) {
1753         case WHvRunVpExitReasonMemoryAccess:
1754             ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
1755             break;
1756 
1757         case WHvRunVpExitReasonX64IoPortAccess:
1758             ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
1759             break;
1760 
1761         case WHvRunVpExitReasonX64InterruptWindow:
1762             vcpu->ready_for_pic_interrupt = 1;
1763             vcpu->window_registered = 0;
1764             ret = 0;
1765             break;
1766 
1767         case WHvRunVpExitReasonX64ApicEoi:
1768             assert(whpx_apic_in_platform());
1769             ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
1770             break;
1771 
1772         case WHvRunVpExitReasonX64Halt:
1773             /*
1774              * WARNING: as of build 19043.1526 (21H1), this exit reason is no
1775              * longer used.
1776              */
1777             ret = whpx_handle_halt(cpu);
1778             break;
1779 
1780         case WHvRunVpExitReasonX64ApicInitSipiTrap: {
1781             WHV_INTERRUPT_CONTROL ipi = {0};
1782             uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
1783             uint32_t delivery_mode =
1784                 (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
1785             int dest_shorthand =
1786                 (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
1787             bool broadcast = false;
1788             bool include_self = false;
1789             uint32_t i;
1790 
1791             /* We only registered for INIT and SIPI exits. */
1792             if ((delivery_mode != APIC_DM_INIT) &&
1793                 (delivery_mode != APIC_DM_SIPI)) {
1794                 error_report(
1795                     "WHPX: Unexpected APIC exit that is not a INIT or SIPI");
1796                 break;
1797             }
1798 
1799             if (delivery_mode == APIC_DM_INIT) {
1800                 ipi.Type = WHvX64InterruptTypeInit;
1801             } else {
1802                 ipi.Type = WHvX64InterruptTypeSipi;
1803             }
1804 
1805             ipi.DestinationMode =
1806                 ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
1807                     WHvX64InterruptDestinationModeLogical :
1808                     WHvX64InterruptDestinationModePhysical;
1809 
1810             ipi.TriggerMode =
1811                 ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
1812                     WHvX64InterruptTriggerModeLevel :
1813                     WHvX64InterruptTriggerModeEdge;
1814 
1815             ipi.Vector = icr & APIC_VECTOR_MASK;
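                 /*
                  * The ICR destination shorthand field (bits 18-19) selects
                  * between an explicit destination, self, or a broadcast.
                  */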
1816             switch (dest_shorthand) {
1817             /* no shorthand. Bits 56-63 contain the destination. */
1818             case 0:
1819                 ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
1820                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1821                         &ipi, sizeof(ipi));
1822                 if (FAILED(hr)) {
1823                     error_report("WHPX: Failed to request interrupt, hr=%08lx",
1824                         hr);
1825                 }
1826 
1827                 break;
1828 
1829             /* self */
1830             case 1:
1831                 include_self = true;
1832                 break;
1833 
1834             /* broadcast, including self */
1835             case 2:
1836                 broadcast = true;
1837                 include_self = true;
1838                 break;
1839 
1840             /* broadcast, excluding self */
1841             case 3:
1842                 broadcast = true;
1843                 break;
1844             }
1845 
1846             if (!broadcast && !include_self) {
1847                 break;
1848             }
1849 
1850             for (i = 0; i <= max_vcpu_index; i++) {
1851                 if (i == cpu->cpu_index && !include_self) {
1852                     continue;
1853                 }
1854 
1855                 /*
1856                  * Assuming that APIC IDs are identity-mapped, since
1857                  * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
1858                  * are not handled yet and the hypervisor doesn't allow the
1859                  * guest to modify the APIC ID.
1860                  */
1861                 ipi.Destination = i;
1862                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1863                         &ipi, sizeof(ipi));
1864                 if (FAILED(hr)) {
1865                     error_report(
1866                         "WHPX: Failed to request SIPI for %d,  hr=%08lx",
1867                         i, hr);
1868                 }
1869             }
1870 
1871             break;
1872         }
1873 
1874         case WHvRunVpExitReasonCanceled:
1875             if (exclusive_step_mode != WHPX_STEP_NONE) {
1876                 /*
1877                  * We are trying to step over a single instruction and most
1878                  * likely received a request to stop from another thread.
1879                  * Delay handling it until we are done stepping over the
1880                  * instruction.
1881                  */
1882                 ret = 0;
1883             } else {
1884                 cpu->exception_index = EXCP_INTERRUPT;
1885                 ret = 1;
1886             }
1887             break;
1888         case WHvRunVpExitReasonX64MsrAccess: {
1889             WHV_REGISTER_VALUE reg_values[3] = {0};
1890             WHV_REGISTER_NAME reg_names[3];
1891             UINT32 reg_count;
1892 
1893             reg_names[0] = WHvX64RegisterRip;
1894             reg_names[1] = WHvX64RegisterRax;
1895             reg_names[2] = WHvX64RegisterRdx;
1896 
1897             reg_values[0].Reg64 =
1898                 vcpu->exit_ctx.VpContext.Rip +
1899                 vcpu->exit_ctx.VpContext.InstructionLength;
1900 
1901             /*
1902              * For all unsupported MSR accesses: ignore writes and return 0
1903              * on reads. A write only needs RIP advanced past the
1904              * instruction; a read also clears RAX and RDX.
1905              */
1906             reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
1907                         1 : 3;
1908 
1909             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1910                 whpx->partition,
1911                 cpu->cpu_index,
1912                 reg_names, reg_count,
1913                 reg_values);
1914 
1915             if (FAILED(hr)) {
1916                 error_report("WHPX: Failed to set MsrAccess state "
1917                              " registers, hr=%08lx", hr);
1918             }
1919             ret = 0;
1920             break;
1921         }
1922         case WHvRunVpExitReasonX64Cpuid: {
1923             WHV_REGISTER_VALUE reg_values[5];
1924             WHV_REGISTER_NAME reg_names[5];
1925             UINT32 reg_count = 5;
1926             UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
1927             X86CPU *x86_cpu = X86_CPU(cpu);
1928             CPUX86State *env = &x86_cpu->env;
1929 
1930             memset(reg_values, 0, sizeof(reg_values));
1931 
1932             rip = vcpu->exit_ctx.VpContext.Rip +
1933                   vcpu->exit_ctx.VpContext.InstructionLength;
1934             cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;
1935 
1936             /*
1937              * Ideally, these should be supplied to the hypervisor during VCPU
1938              * initialization and it should be able to satisfy this request.
1939              * But, currently, WHPX doesn't support setting CPUID values in the
1940              * hypervisor once the partition has been set up, which is too
1941              * late since VCPUs are realized later. For now, use the values
1942              * from QEMU to satisfy these requests, until WHPX adds support
1943              * for setting these values in the hypervisor at runtime.
1944              */
1945             cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
1946                 (UINT32 *)&rcx, (UINT32 *)&rdx);
1947             switch (cpuid_fn) {
1948             case 0x40000000:
1949                 /* Expose the VMware CPU frequency CPUID leaf */
1950                 rax = 0x40000010;
1951                 rbx = rcx = rdx = 0;
1952                 break;
1953 
1954             case 0x40000010:
1955                 rax = env->tsc_khz;
1956                 rbx = env->apic_bus_freq / 1000; /* Hz to kHz */
1957                 rcx = rdx = 0;
1958                 break;
1959 
1960             case 0x80000001:
1961                 /* Remove any support for OSVW */
1962                 rcx &= ~CPUID_EXT3_OSVW;
1963                 break;
1964             }
1965 
1966             reg_names[0] = WHvX64RegisterRip;
1967             reg_names[1] = WHvX64RegisterRax;
1968             reg_names[2] = WHvX64RegisterRcx;
1969             reg_names[3] = WHvX64RegisterRdx;
1970             reg_names[4] = WHvX64RegisterRbx;
1971 
1972             reg_values[0].Reg64 = rip;
1973             reg_values[1].Reg64 = rax;
1974             reg_values[2].Reg64 = rcx;
1975             reg_values[3].Reg64 = rdx;
1976             reg_values[4].Reg64 = rbx;
1977 
1978             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1979                 whpx->partition, cpu->cpu_index,
1980                 reg_names,
1981                 reg_count,
1982                 reg_values);
1983 
1984             if (FAILED(hr)) {
1985                 error_report("WHPX: Failed to set CpuidAccess state registers,"
1986                              " hr=%08lx", hr);
1987             }
1988             ret = 0;
1989             break;
1990         }
1991         case WHvRunVpExitReasonException:
1992             whpx_get_registers(cpu);
1993 
1994             if ((vcpu->exit_ctx.VpException.ExceptionType ==
1995                  WHvX64ExceptionTypeDebugTrapOrFault) &&
1996                 (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
1997                 (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
1998                  whpx_breakpoint_instruction)) {
1999                 /* Stopped at a software breakpoint. */
2000                 cpu->exception_index = EXCP_DEBUG;
2001             } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
2002                         WHvX64ExceptionTypeDebugTrapOrFault) &&
2003                        !cpu->singlestep_enabled) {
2004                 /*
2005                  * Just finished stepping over a breakpoint, but gdb does
2006                  * not expect us to do single-stepping.
2007                  * Don't do anything special.
2008                  */
2009                 cpu->exception_index = EXCP_INTERRUPT;
2010             } else {
2011                 /* Another exception or debug event. Report it to GDB. */
2012                 cpu->exception_index = EXCP_DEBUG;
2013             }
2014 
2015             ret = 1;
2016             break;
2017         case WHvRunVpExitReasonNone:
2018         case WHvRunVpExitReasonUnrecoverableException:
2019         case WHvRunVpExitReasonInvalidVpRegisterValue:
2020         case WHvRunVpExitReasonUnsupportedFeature:
2021         default:
2022             error_report("WHPX: Unexpected VP exit code %d",
2023                          vcpu->exit_ctx.ExitReason);
2024             whpx_get_registers(cpu);
2025             bql_lock();
2026             qemu_system_guest_panicked(cpu_get_crash_info(cpu));
2027             bql_unlock();
2028             break;
2029         }
2030 
2031     } while (!ret);
2032 
2033     if (stepped_over_bp) {
2034         /* Restore the breakpoint we stepped over */
2035         cpu_memory_rw_debug(cpu,
2036             stepped_over_bp->address,
2037             (void *)&whpx_breakpoint_instruction,
2038             1,
2039             true);
2040     }
2041 
2042     if (exclusive_step_mode != WHPX_STEP_NONE) {
2043         g_assert(cpu_in_exclusive_context(cpu));
2044         cpu->running = false;
2045         end_exclusive();
2046 
2047         exclusive_step_mode = WHPX_STEP_NONE;
2048     } else {
2049         cpu_exec_end(cpu);
2050     }
2051 
2052     bql_lock();
2053     current_cpu = cpu;
2054 
2055     if (--whpx->running_cpus == 0) {
2056         whpx_last_vcpu_stopping(cpu);
2057     }
2058 
2059     qatomic_set(&cpu->exit_request, false);
2060 
2061     return ret < 0;
2062 }
2063 
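     /*
      * cpu->vcpu_dirty tracks which copy of the register state is
      * authoritative: when set, QEMU's copy is current and is pushed to the
      * hypervisor before the next run; when clear, the hypervisor's copy is
      * current.
      */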
2064 static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
2065 {
2066     if (!cpu->vcpu_dirty) {
2067         whpx_get_registers(cpu);
2068         cpu->vcpu_dirty = true;
2069     }
2070 }
2071 
2072 static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
2073                                                run_on_cpu_data arg)
2074 {
2075     whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
2076     cpu->vcpu_dirty = false;
2077 }
2078 
2079 static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
2080                                               run_on_cpu_data arg)
2081 {
2082     whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
2083     cpu->vcpu_dirty = false;
2084 }
2085 
2086 static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
2087                                                run_on_cpu_data arg)
2088 {
2089     cpu->vcpu_dirty = true;
2090 }
2091 
2092 /*
2093  * CPU support.
2094  */
2095 
2096 void whpx_cpu_synchronize_state(CPUState *cpu)
2097 {
2098     if (!cpu->vcpu_dirty) {
2099         run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
2100     }
2101 }
2102 
2103 void whpx_cpu_synchronize_post_reset(CPUState *cpu)
2104 {
2105     run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
2106 }
2107 
2108 void whpx_cpu_synchronize_post_init(CPUState *cpu)
2109 {
2110     run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
2111 }
2112 
2113 void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
2114 {
2115     run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
2116 }
2117 
2118 void whpx_cpu_synchronize_pre_resume(bool step_pending)
2119 {
2120     whpx_global.step_pending = step_pending;
2121 }
2122 
2123 /*
2124  * Vcpu support.
2125  */
2126 
2127 static Error *whpx_migration_blocker;
2128 
2129 static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
2130 {
2131     CPUX86State *env = opaque;
2132 
2133     if (running) {
2134         env->tsc_valid = false;
2135     }
2136 }
2137 
2138 int whpx_init_vcpu(CPUState *cpu)
2139 {
2140     HRESULT hr;
2141     struct whpx_state *whpx = &whpx_global;
2142     AccelCPUState *vcpu = NULL;
2143     Error *local_error = NULL;
2144     X86CPU *x86_cpu = X86_CPU(cpu);
2145     CPUX86State *env = &x86_cpu->env;
2146     UINT64 freq = 0;
2147     int ret;
2148 
2149     /* Add migration blockers for all unsupported features of the
2150      * Windows Hypervisor Platform
2151      */
2152     if (whpx_migration_blocker == NULL) {
2153         error_setg(&whpx_migration_blocker,
2154                "State blocked due to non-migratable CPUID feature support,"
2155                "dirty memory tracking support, and XSAVE/XRSTOR support");
2156 
2157         if (migrate_add_blocker(&whpx_migration_blocker, &local_error) < 0) {
2158             error_report_err(local_error);
2159             ret = -EINVAL;
2160             goto error;
2161         }
2162     }
2163 
2164     vcpu = g_new0(AccelCPUState, 1);
2165 
2166     hr = whp_dispatch.WHvEmulatorCreateEmulator(
2167         &whpx_emu_callbacks,
2168         &vcpu->emulator);
2169     if (FAILED(hr)) {
2170         error_report("WHPX: Failed to setup instruction completion support,"
2171                      " hr=%08lx", hr);
2172         ret = -EINVAL;
2173         goto error;
2174     }
2175 
2176     hr = whp_dispatch.WHvCreateVirtualProcessor(
2177         whpx->partition, cpu->cpu_index, 0);
2178     if (FAILED(hr)) {
2179         error_report("WHPX: Failed to create a virtual processor,"
2180                      " hr=%08lx", hr);
2181         whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
2182         ret = -EINVAL;
2183         goto error;
2184     }
2185 
2186     /*
2187      * The vCPU's TSC frequency is either specified by the user, or taken
2188      * from Hyper-V if the former is not present. In the latter case, we
2189      * query it from Hyper-V and record it in env->tsc_khz, so that the
2190      * vCPU's TSC frequency can be migrated later via this field.
2191      */
2192     if (!env->tsc_khz) {
2193         hr = whp_dispatch.WHvGetCapability(
2194             WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
2195                 NULL);
2196         if (hr != WHV_E_UNKNOWN_CAPABILITY) {
2197             if (FAILED(hr)) {
2198                 printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr);
2199             } else {
2200                 env->tsc_khz = freq / 1000; /* Hz to kHz */
2201             }
2202         }
2203     }
2204 
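     /*
      * Default to the Hyper-V APIC bus frequency and override it with the
      * value reported by the hypervisor, when available.
      */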
2205     env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
2206     hr = whp_dispatch.WHvGetCapability(
2207         WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
2208     if (hr != WHV_E_UNKNOWN_CAPABILITY) {
2209         if (FAILED(hr)) {
2210             printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr);
2211         } else {
2212             env->apic_bus_freq = freq;
2213         }
2214     }
2215 
2216     /*
2217      * If the VMware CPUID frequency leaf option is set, and we have a valid
2218      * TSC value, trap the corresponding CPUID leaves.
2219      */
2220     if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) {
2221         UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010};
2222 
2223         hr = whp_dispatch.WHvSetPartitionProperty(
2224                 whpx->partition,
2225                 WHvPartitionPropertyCodeCpuidExitList,
2226                 cpuidExitList,
2227                 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2228 
2229         if (FAILED(hr)) {
2230             error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2231                         hr);
2232             ret = -EINVAL;
2233             goto error;
2234         }
2235     }
2236 
2237     vcpu->interruptable = true;
2238     cpu->vcpu_dirty = true;
2239     cpu->accel = vcpu;
2240     max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
2241     qemu_add_vm_change_state_handler(whpx_cpu_update_state, env);
2242 
2243     return 0;
2244 
2245 error:
2246     g_free(vcpu);
2247 
2248     return ret;
2249 }
2250 
2251 int whpx_vcpu_exec(CPUState *cpu)
2252 {
2253     int ret;
2254     int fatal;
2255 
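     /*
      * Keep re-entering the vCPU until whpx_vcpu_run() leaves a pending
      * exception index (e.g. EXCP_INTERRUPT, EXCP_DEBUG or EXCP_HLT) for the
      * caller to handle.
      */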
2256     for (;;) {
2257         if (cpu->exception_index >= EXCP_INTERRUPT) {
2258             ret = cpu->exception_index;
2259             cpu->exception_index = -1;
2260             break;
2261         }
2262 
2263         fatal = whpx_vcpu_run(cpu);
2264 
2265         if (fatal) {
2266             error_report("WHPX: Failed to exec a virtual processor");
2267             abort();
2268         }
2269     }
2270 
2271     return ret;
2272 }
2273 
2274 void whpx_destroy_vcpu(CPUState *cpu)
2275 {
2276     struct whpx_state *whpx = &whpx_global;
2277     AccelCPUState *vcpu = cpu->accel;
2278 
2279     whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
2280     whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
2281     g_free(cpu->accel);
2282     return;
2283 }
2284 
2285 void whpx_vcpu_kick(CPUState *cpu)
2286 {
2287     struct whpx_state *whpx = &whpx_global;
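     /*
      * Cancelling the in-flight run forces WHvRunVirtualProcessor to return
      * with a WHvRunVpExitReasonCanceled exit.
      */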
2288     whp_dispatch.WHvCancelRunVirtualProcessor(
2289         whpx->partition, cpu->cpu_index, 0);
2290 }
2291 
2292 /*
2293  * Memory support.
2294  */
2295 
2296 static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
2297                                 void *host_va, int add, int rom,
2298                                 const char *name)
2299 {
2300     struct whpx_state *whpx = &whpx_global;
2301     HRESULT hr;
2302 
2303     /*
2304     if (add) {
2305         printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n",
2306                (void*)start_pa, (void*)size, host_va,
2307                (rom ? "ROM" : "RAM"), name);
2308     } else {
2309         printf("WHPX: DEL PA:%p Size:%p, Host:%p,      '%s'\n",
2310                (void*)start_pa, (void*)size, host_va, name);
2311     }
2312     */
2313 
2314     if (add) {
2315         hr = whp_dispatch.WHvMapGpaRange(whpx->partition,
2316                                          host_va,
2317                                          start_pa,
2318                                          size,
2319                                          (WHvMapGpaRangeFlagRead |
2320                                           WHvMapGpaRangeFlagExecute |
2321                                           (rom ? 0 : WHvMapGpaRangeFlagWrite)));
2322     } else {
2323         hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition,
2324                                            start_pa,
2325                                            size);
2326     }
2327 
2328     if (FAILED(hr)) {
2329         error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
2330                      " Host:%p, hr=%08lx",
2331                      (add ? "MAP" : "UNMAP"), name,
2332                      (void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
2333     }
2334 }
2335 
2336 static void whpx_process_section(MemoryRegionSection *section, int add)
2337 {
2338     MemoryRegion *mr = section->mr;
2339     hwaddr start_pa = section->offset_within_address_space;
2340     ram_addr_t size = int128_get64(section->size);
2341     unsigned int delta;
2342     uint64_t host_va;
2343 
2344     if (!memory_region_is_ram(mr)) {
2345         return;
2346     }
2347 
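     /*
      * Map only whole host pages: round the start address up and trim the
      * size down to host-page granularity, and skip the section if nothing
      * page-aligned remains.
      */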
2348     delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask());
2349     delta &= ~qemu_real_host_page_mask();
2350     if (delta > size) {
2351         return;
2352     }
2353     start_pa += delta;
2354     size -= delta;
2355     size &= qemu_real_host_page_mask();
2356     if (!size || (start_pa & ~qemu_real_host_page_mask())) {
2357         return;
2358     }
2359 
2360     host_va = (uintptr_t)memory_region_get_ram_ptr(mr)
2361             + section->offset_within_region + delta;
2362 
2363     whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add,
2364                         memory_region_is_rom(mr), mr->name);
2365 }
2366 
2367 static void whpx_region_add(MemoryListener *listener,
2368                            MemoryRegionSection *section)
2369 {
2370     memory_region_ref(section->mr);
2371     whpx_process_section(section, 1);
2372 }
2373 
2374 static void whpx_region_del(MemoryListener *listener,
2375                            MemoryRegionSection *section)
2376 {
2377     whpx_process_section(section, 0);
2378     memory_region_unref(section->mr);
2379 }
2380 
2381 static void whpx_transaction_begin(MemoryListener *listener)
2382 {
2383 }
2384 
2385 static void whpx_transaction_commit(MemoryListener *listener)
2386 {
2387 }
2388 
2389 static void whpx_log_sync(MemoryListener *listener,
2390                          MemoryRegionSection *section)
2391 {
2392     MemoryRegion *mr = section->mr;
2393 
2394     if (!memory_region_is_ram(mr)) {
2395         return;
2396     }
2397 
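     /*
      * This accelerator does not use dirty-page tracking, so conservatively
      * mark memory dirty on every sync.
      */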
2398     memory_region_set_dirty(mr, 0, int128_get64(section->size));
2399 }
2400 
2401 static MemoryListener whpx_memory_listener = {
2402     .name = "whpx",
2403     .begin = whpx_transaction_begin,
2404     .commit = whpx_transaction_commit,
2405     .region_add = whpx_region_add,
2406     .region_del = whpx_region_del,
2407     .log_sync = whpx_log_sync,
2408     .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
2409 };
2410 
2411 static void whpx_memory_init(void)
2412 {
2413     memory_listener_register(&whpx_memory_listener, &address_space_memory);
2414 }
2415 
2416 /*
2417  * Load the functions from the given library, using the given handle. If a
2418  * handle is provided, it is used; otherwise the library is opened. The
2419  * handle will be updated on return with the opened one.
2420  */
2421 static bool load_whp_dispatch_fns(HMODULE *handle,
2422     WHPFunctionList function_list)
2423 {
2424     HMODULE hLib = *handle;
2425 
2426     #define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
2427     #define WINHV_EMULATION_DLL "WinHvEmulation.dll"
2428     #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
2429         whp_dispatch.function_name = \
2430             (function_name ## _t)GetProcAddress(hLib, #function_name); \
2431 
2432     #define WHP_LOAD_FIELD(return_type, function_name, signature) \
2433         whp_dispatch.function_name = \
2434             (function_name ## _t)GetProcAddress(hLib, #function_name); \
2435         if (!whp_dispatch.function_name) { \
2436             error_report("Could not load function %s", #function_name); \
2437             goto error; \
2438         } \
2439 
2440     #define WHP_LOAD_LIB(lib_name, handle_lib) \
2441     if (!handle_lib) { \
2442         handle_lib = LoadLibrary(lib_name); \
2443         if (!handle_lib) { \
2444             error_report("Could not load library %s.", lib_name); \
2445             goto error; \
2446         } \
2447     } \
2448 
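     /*
      * WHP_LOAD_FIELD treats a missing export as fatal, while the
      * supplemental entry points are resolved with WHP_LOAD_FIELD_OPTIONAL
      * and may remain NULL on Windows builds that do not provide them.
      */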
2449     switch (function_list) {
2450     case WINHV_PLATFORM_FNS_DEFAULT:
2451         WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
2452         LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
2453         break;
2454 
2455     case WINHV_EMULATION_FNS_DEFAULT:
2456         WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
2457         LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
2458         break;
2459 
2460     case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
2461         WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
2462         LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
2463         break;
2464     }
2465 
2466     *handle = hLib;
2467     return true;
2468 
2469 error:
2470     if (hLib) {
2471         FreeLibrary(hLib);
2472     }
2473 
2474     return false;
2475 }
2476 
2477 static void whpx_set_kernel_irqchip(Object *obj, Visitor *v,
2478                                    const char *name, void *opaque,
2479                                    Error **errp)
2480 {
2481     struct whpx_state *whpx = &whpx_global;
2482     OnOffSplit mode;
2483 
2484     if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
2485         return;
2486     }
2487 
2488     switch (mode) {
2489     case ON_OFF_SPLIT_ON:
2490         whpx->kernel_irqchip_allowed = true;
2491         whpx->kernel_irqchip_required = true;
2492         break;
2493 
2494     case ON_OFF_SPLIT_OFF:
2495         whpx->kernel_irqchip_allowed = false;
2496         whpx->kernel_irqchip_required = false;
2497         break;
2498 
2499     case ON_OFF_SPLIT_SPLIT:
2500         error_setg(errp, "WHPX: split irqchip currently not supported");
2501         error_append_hint(errp,
2502             "Try without kernel-irqchip or with kernel-irqchip=on|off");
2503         break;
2504 
2505     default:
2506         /*
2507          * The value was checked in visit_type_OnOffSplit() above. If
2508          * we get here, then something is wrong in QEMU.
2509          */
2510         abort();
2511     }
2512 }
2513 
2514 /*
2515  * Partition support
2516  */
2517 
2518 static int whpx_accel_init(MachineState *ms)
2519 {
2520     struct whpx_state *whpx;
2521     int ret;
2522     HRESULT hr;
2523     WHV_CAPABILITY whpx_cap;
2524     UINT32 whpx_cap_size;
2525     WHV_PARTITION_PROPERTY prop;
2526     UINT32 cpuidExitList[] = {1, 0x80000001};
2527     WHV_CAPABILITY_FEATURES features = {0};
2528 
2529     whpx = &whpx_global;
2530 
2531     if (!init_whp_dispatch()) {
2532         ret = -ENOSYS;
2533         goto error;
2534     }
2535 
2536     whpx->mem_quota = ms->ram_size;
2537 
2538     hr = whp_dispatch.WHvGetCapability(
2539         WHvCapabilityCodeHypervisorPresent, &whpx_cap,
2540         sizeof(whpx_cap), &whpx_cap_size);
2541     if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
2542         error_report("WHPX: No accelerator found, hr=%08lx", hr);
2543         ret = -ENOSPC;
2544         goto error;
2545     }
2546 
2547     hr = whp_dispatch.WHvGetCapability(
2548         WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
2549     if (FAILED(hr)) {
2550         error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
2551         ret = -EINVAL;
2552         goto error;
2553     }
2554 
2555     hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
2556     if (FAILED(hr)) {
2557         error_report("WHPX: Failed to create partition, hr=%08lx", hr);
2558         ret = -EINVAL;
2559         goto error;
2560     }
2561 
2562     /*
2563      * Query the XSAVE capability of the partition. Any error here is not
2564      * considered fatal.
2565      */
2566     hr = whp_dispatch.WHvGetPartitionProperty(
2567         whpx->partition,
2568         WHvPartitionPropertyCodeProcessorXsaveFeatures,
2569         &whpx_xsave_cap,
2570         sizeof(whpx_xsave_cap),
2571         &whpx_cap_size);
2572 
2573     /*
2574      * Windows versions which don't support this property will return a
2575      * specific error code.
2576      */
2577     if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
2578         error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
2579     }
2580 
2581     if (!whpx_has_xsave()) {
2582         printf("WHPX: Partition is not XSAVE capable\n");
2583     }
2584 
2585     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2586     prop.ProcessorCount = ms->smp.cpus;
2587     hr = whp_dispatch.WHvSetPartitionProperty(
2588         whpx->partition,
2589         WHvPartitionPropertyCodeProcessorCount,
2590         &prop,
2591         sizeof(WHV_PARTITION_PROPERTY));
2592 
2593     if (FAILED(hr)) {
2594         error_report("WHPX: Failed to set partition processor count to %u,"
2595                      " hr=%08lx", prop.ProcessorCount, hr);
2596         ret = -EINVAL;
2597         goto error;
2598     }
2599 
2600     /*
2601      * Error out if WHP doesn't support APIC emulation and the user
2602      * requires it.
2603      */
2604     if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
2605             !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
2606         error_report("WHPX: kernel irqchip requested, but unavailable. "
2607             "Try without kernel-irqchip or with kernel-irqchip=off");
2608         ret = -EINVAL;
2609         goto error;
2610     }
2611 
2612     if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
2613         whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
2614         WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
2615             WHvX64LocalApicEmulationModeXApic;
2616         printf("WHPX: setting APIC emulation mode in the hypervisor\n");
2617         hr = whp_dispatch.WHvSetPartitionProperty(
2618             whpx->partition,
2619             WHvPartitionPropertyCodeLocalApicEmulationMode,
2620             &mode,
2621             sizeof(mode));
2622         if (FAILED(hr)) {
2623             error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
2624             if (whpx->kernel_irqchip_required) {
2625                 error_report("WHPX: kernel irqchip requested, but unavailable");
2626                 ret = -EINVAL;
2627                 goto error;
2628             }
2629         } else {
2630             whpx->apic_in_platform = true;
2631         }
2632     }
2633 
2634     /* Register for MSR and CPUID exits */
2635     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2636     prop.ExtendedVmExits.X64MsrExit = 1;
2637     prop.ExtendedVmExits.X64CpuidExit = 1;
2638     prop.ExtendedVmExits.ExceptionExit = 1;
2639     if (whpx_apic_in_platform()) {
2640         prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
2641     }
2642 
2643     hr = whp_dispatch.WHvSetPartitionProperty(
2644             whpx->partition,
2645             WHvPartitionPropertyCodeExtendedVmExits,
2646             &prop,
2647             sizeof(WHV_PARTITION_PROPERTY));
2648     if (FAILED(hr)) {
2649         error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr);
2650         ret = -EINVAL;
2651         goto error;
2652     }
2653 
2654     hr = whp_dispatch.WHvSetPartitionProperty(
2655         whpx->partition,
2656         WHvPartitionPropertyCodeCpuidExitList,
2657         cpuidExitList,
2658         RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2659 
2660     if (FAILED(hr)) {
2661         error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2662                      hr);
2663         ret = -EINVAL;
2664         goto error;
2665     }
2666 
2667     /*
2668      * We do not want to intercept any exceptions from the guest,
2669      * until we actually start debugging with gdb.
2670      */
2671     whpx->exception_exit_bitmap = -1;
2672     hr = whpx_set_exception_exit_bitmap(0);
2673 
2674     if (FAILED(hr)) {
2675         error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
2676         ret = -EINVAL;
2677         goto error;
2678     }
2679 
2680     hr = whp_dispatch.WHvSetupPartition(whpx->partition);
2681     if (FAILED(hr)) {
2682         error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
2683         ret = -EINVAL;
2684         goto error;
2685     }
2686 
2687     whpx_memory_init();
2688 
2689     printf("Windows Hypervisor Platform accelerator is operational\n");
2690     return 0;
2691 
2692 error:
2693 
2694     if (whpx->partition != NULL) {
2695         whp_dispatch.WHvDeletePartition(whpx->partition);
2696         whpx->partition = NULL;
2697     }
2698 
2699     return ret;
2700 }
2701 
2702 int whpx_enabled(void)
2703 {
2704     return whpx_allowed;
2705 }
2706 
2707 bool whpx_apic_in_platform(void)
     {
2708     return whpx_global.apic_in_platform;
2709 }
2710 
2711 static void whpx_accel_class_init(ObjectClass *oc, void *data)
2712 {
2713     AccelClass *ac = ACCEL_CLASS(oc);
2714     ac->name = "WHPX";
2715     ac->init_machine = whpx_accel_init;
2716     ac->allowed = &whpx_allowed;
2717 
2718     object_class_property_add(oc, "kernel-irqchip", "on|off|split",
2719         NULL, whpx_set_kernel_irqchip,
2720         NULL, NULL);
2721     object_class_property_set_description(oc, "kernel-irqchip",
2722         "Configure WHPX in-kernel irqchip");
2723 }
2724 
2725 static void whpx_accel_instance_init(Object *obj)
2726 {
2727     struct whpx_state *whpx = &whpx_global;
2728 
2729     memset(whpx, 0, sizeof(struct whpx_state));
2730     /* Turn on kernel-irqchip by default */
2731     whpx->kernel_irqchip_allowed = true;
2732 }
2733 
2734 static const TypeInfo whpx_accel_type = {
2735     .name = ACCEL_CLASS_NAME("whpx"),
2736     .parent = TYPE_ACCEL,
2737     .instance_init = whpx_accel_instance_init,
2738     .class_init = whpx_accel_class_init,
2739 };
2740 
2741 static void whpx_type_init(void)
2742 {
2743     type_register_static(&whpx_accel_type);
2744 }
2745 
2746 bool init_whp_dispatch(void)
2747 {
2748     if (whp_dispatch_initialized) {
2749         return true;
2750     }
2751 
2752     if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
2753         goto error;
2754     }
2755 
2756     if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
2757         goto error;
2758     }
2759 
2760     assert(load_whp_dispatch_fns(&hWinHvPlatform,
2761         WINHV_PLATFORM_FNS_SUPPLEMENTAL));
2762     whp_dispatch_initialized = true;
2763 
2764     return true;
2765 error:
2766     if (hWinHvPlatform) {
2767         FreeLibrary(hWinHvPlatform);
2768     }
2769 
2770     if (hWinHvEmulation) {
2771         FreeLibrary(hWinHvEmulation);
2772     }
2773 
2774     return false;
2775 }
2776 
2777 type_init(whpx_type_init);
2778