/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2018 Joyent, Inc.
 * Copyright 2022 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/kmem.h>
#include <sys/pcpu.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/systm.h>
#include <sys/sunddi.h>
#include <sys/hma.h>

#include <machine/md_var.h>
#include <x86/psl.h>
#include <x86/apicreg.h>

#include <machine/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmparam.h>
#include <sys/vmm_instruction_emul.h>
#include <sys/vmm_vm.h>
#include <sys/vmm_gpt.h>
#include <sys/vmm_data.h>

#include "vmm_ioport.h"
#include "vmm_host.h"
#include "vmm_util.h"
#include "vatpic.h"
#include "vatpit.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vpmtmr.h"
#include "vrtc.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

/* Flags for vtc_status */
#define	VTCS_FPU_RESTORED	1 /* guest FPU restored, host FPU saved */
#define	VTCS_FPU_CTX_CRITICAL	2 /* in ctx where FPU restore cannot be lazy */

typedef struct vm_thread_ctx {
	struct vm	*vtc_vm;
	int		vtc_vcpuid;
	uint_t		vtc_status;
	enum vcpu_ustate vtc_ustate;
} vm_thread_ctx_t;

#define	VMM_MTRR_VAR_MAX 10
#define	VMM_MTRR_DEF_MASK \
	(MTRR_DEF_ENABLE | MTRR_DEF_FIXED_ENABLE | MTRR_DEF_TYPE)
#define	VMM_MTRR_PHYSBASE_MASK (MTRR_PHYSBASE_PHYSBASE | MTRR_PHYSBASE_TYPE)
#define	VMM_MTRR_PHYSMASK_MASK (MTRR_PHYSMASK_PHYSMASK | MTRR_PHYSMASK_VALID)
struct vm_mtrr {
	uint64_t def_type;
	uint64_t fixed4k[8];
	uint64_t fixed16k[2];
	uint64_t fixed64k;
	struct {
		uint64_t base;
		uint64_t mask;
	} var[VMM_MTRR_VAR_MAX];
};

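/*
 * Note on layout: the variable-range MTRR MSRs are interleaved, with
 * MSR_MTRRVarBase + 2n addressing PHYSBASEn and MSR_MTRRVarBase + 2n + 1
 * addressing PHYSMASKn.  vm_rdmtrr() and vm_wrmtrr() below rely on this
 * even/odd split when indexing 'var'.
 */
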
/*
 * Initialization:
 * (a) allocated when vcpu is created
 * (i) initialized when vcpu is created and when it is reinitialized
 * (o) initialized the first time the vcpu is created
 * (x) initialized before use
 */
struct vcpu {
	/* (o) protects state, run_state, hostcpu, sipi_vector */
	kmutex_t	lock;

	enum vcpu_state	state;		/* (o) vcpu state */
	enum vcpu_run_state run_state;	/* (i) vcpu init/sipi/run state */
	kcondvar_t	vcpu_cv;	/* (o) cpu waiter cv */
	kcondvar_t	state_cv;	/* (o) IDLE-transition cv */
	int		hostcpu;	/* (o) vcpu's current host cpu */
	int		lastloccpu;	/* (o) last host cpu localized to */
	int		reqidle;	/* (i) request vcpu to idle */
	struct vlapic	*vlapic;	/* (i) APIC device model */
	enum x2apic_state x2apic_state;	/* (i) APIC mode */
	uint64_t	exit_intinfo;	/* (i) events pending at VM exit */
	uint64_t	exc_pending;	/* (i) exception pending */
	bool		nmi_pending;	/* (i) NMI pending */
	bool		extint_pending;	/* (i) INTR pending */

	uint8_t		sipi_vector;	/* (i) SIPI vector */
	hma_fpu_t	*guestfpu;	/* (a,i) guest fpu state */
	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
	void		*stats;		/* (a,i) statistics */
	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
	uint64_t	nextrip;	/* (x) next instruction to execute */
	struct vie	*vie_ctx;	/* (x) instruction emulation context */
	vm_client_t	*vmclient;	/* (a) VM-system client */
	uint64_t	tsc_offset;	/* (x) offset from host TSC */
	struct vm_mtrr	mtrr;		/* (i) vcpu's MTRR */

	enum vcpu_ustate ustate;	/* (i) microstate for the vcpu */
	hrtime_t	ustate_when;	/* (i) time of last ustate change */
	uint64_t ustate_total[VU_MAX];	/* (o) total time spent in ustates */
	vm_thread_ctx_t	vtc;		/* (o) thread state for ctxops */
	struct ctxop	*ctxop;		/* (o) ctxop storage for vcpu */
};

#define	vcpu_lock(v)		mutex_enter(&((v)->lock))
#define	vcpu_unlock(v)		mutex_exit(&((v)->lock))
#define	vcpu_assert_locked(v)	ASSERT(MUTEX_HELD(&((v)->lock)))

struct mem_seg {
	size_t	len;
	bool	sysmem;
	vm_object_t *object;
};
#define	VM_MAX_MEMSEGS	5

struct mem_map {
	vm_paddr_t	gpa;
	size_t		len;
	vm_ooffset_t	segoff;
	int		segid;
	int		prot;
	int		flags;
};
#define	VM_MAX_MEMMAPS	8

/*
 * Initialization:
 * (o) initialized the first time the VM is created
 * (i) initialized when VM is created and when it is reinitialized
 * (x) initialized before use
 */
struct vm {
	void		*cookie;		/* (i) cpu-specific data */
	void		*iommu;			/* (x) iommu-specific data */
	struct vhpet	*vhpet;			/* (i) virtual HPET */
	struct vioapic	*vioapic;		/* (i) virtual ioapic */
	struct vatpic	*vatpic;		/* (i) virtual atpic */
	struct vatpit	*vatpit;		/* (i) virtual atpit */
	struct vpmtmr	*vpmtmr;		/* (i) virtual ACPI PM timer */
	struct vrtc	*vrtc;			/* (o) virtual RTC */
	volatile cpuset_t active_cpus;		/* (i) active vcpus */
	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for dbg */
	int		suspend;		/* (i) stop VM execution */
	volatile cpuset_t suspended_cpus;	/* (i) suspended vcpus */
	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
	struct vmspace	*vmspace;		/* (o) guest's address space */
	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
	/* The following describe the vm cpu topology */
	uint16_t	sockets;		/* (o) num of sockets */
	uint16_t	cores;			/* (o) num of cores/socket */
	uint16_t	threads;		/* (o) num of threads/core */
	uint16_t	maxcpus;		/* (o) max pluggable cpus */

	uint64_t	boot_tsc_offset;	/* (i) TSC offset at VM boot */
	hrtime_t	boot_hrtime;		/* (i) hrtime at VM boot */

	struct ioport_config ioports;		/* (o) ioport handling */

	bool		mem_transient;		/* (o) alloc transient memory */
};

static int vmm_initialized;


static void
nullop_panic(void)
{
	panic("null vmm operation call");
}

/* Do not allow use of an un-set `ops` to do anything but panic */
static struct vmm_ops vmm_ops_null = {
	.init		= (vmm_init_func_t)nullop_panic,
	.cleanup	= (vmm_cleanup_func_t)nullop_panic,
	.resume		= (vmm_resume_func_t)nullop_panic,
	.vminit		= (vmi_init_func_t)nullop_panic,
	.vmrun		= (vmi_run_func_t)nullop_panic,
	.vmcleanup	= (vmi_cleanup_func_t)nullop_panic,
	.vmgetreg	= (vmi_get_register_t)nullop_panic,
	.vmsetreg	= (vmi_set_register_t)nullop_panic,
	.vmgetdesc	= (vmi_get_desc_t)nullop_panic,
	.vmsetdesc	= (vmi_set_desc_t)nullop_panic,
	.vmgetcap	= (vmi_get_cap_t)nullop_panic,
	.vmsetcap	= (vmi_set_cap_t)nullop_panic,
	.vlapic_init	= (vmi_vlapic_init)nullop_panic,
	.vlapic_cleanup	= (vmi_vlapic_cleanup)nullop_panic,
	.vmsavectx	= (vmi_savectx)nullop_panic,
	.vmrestorectx	= (vmi_restorectx)nullop_panic,
	.vmgetmsr	= (vmi_get_msr_t)nullop_panic,
	.vmsetmsr	= (vmi_set_msr_t)nullop_panic,
};

static struct vmm_ops *ops = &vmm_ops_null;
static vmm_pte_ops_t *pte_ops = NULL;

#define	VMM_INIT()			((*ops->init)())
#define	VMM_CLEANUP()			((*ops->cleanup)())
#define	VMM_RESUME()			((*ops->resume)())

#define	VMINIT(vm)			((*ops->vminit)(vm))
#define	VMRUN(vmi, vcpu, rip)		((*ops->vmrun)(vmi, vcpu, rip))
#define	VMCLEANUP(vmi)			((*ops->vmcleanup)(vmi))

#define	VMGETREG(vmi, vcpu, num, rv)	((*ops->vmgetreg)(vmi, vcpu, num, rv))
#define	VMSETREG(vmi, vcpu, num, val)	((*ops->vmsetreg)(vmi, vcpu, num, val))
#define	VMGETDESC(vmi, vcpu, num, dsc)	((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
#define	VMSETDESC(vmi, vcpu, num, dsc)	((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
#define	VMGETCAP(vmi, vcpu, num, rv)	((*ops->vmgetcap)(vmi, vcpu, num, rv))
#define	VMSETCAP(vmi, vcpu, num, val)	((*ops->vmsetcap)(vmi, vcpu, num, val))
#define	VLAPIC_INIT(vmi, vcpu)		((*ops->vlapic_init)(vmi, vcpu))
#define	VLAPIC_CLEANUP(vmi, vlapic)	((*ops->vlapic_cleanup)(vmi, vlapic))

#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

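/*
 * With CR0.TS set, any FPU instruction traps with #NM, so the macro above
 * is used to fence off stray host FPU use while guest state is loaded;
 * fpu_stop_emulating() clears TS (via clts) before guest state is
 * installed.  See restore_guest_fpustate() and save_guest_fpustate().
 */
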
SDT_PROVIDER_DEFINE(vmm);

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    NULL);

/*
 * Halt the guest if all vcpus are executing a HLT instruction with
 * interrupts disabled.
 */
static int halt_detection_enabled = 1;

/* Trap into hypervisor on all guest exceptions and reflect them back */
static int trace_guest_exceptions;

static void vm_free_memmap(struct vm *vm, int ident);
static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);

static void vmm_savectx(void *);
static void vmm_restorectx(void *);
static const struct ctxop_template vmm_ctxop_tpl = {
	.ct_rev		= CTXOP_TPL_REV,
	.ct_save	= vmm_savectx,
	.ct_restore	= vmm_restorectx,
};

#ifdef KTR
static const char *
vcpu_state2str(enum vcpu_state state)
{

	switch (state) {
	case VCPU_IDLE:
		return ("idle");
	case VCPU_FROZEN:
		return ("frozen");
	case VCPU_RUNNING:
		return ("running");
	case VCPU_SLEEPING:
		return ("sleeping");
	default:
		return ("unknown");
	}
}
#endif

static void
vcpu_cleanup(struct vm *vm, int i, bool destroy)
{
	struct vcpu *vcpu = &vm->vcpu[i];

	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
	if (destroy) {
		vmm_stat_free(vcpu->stats);

		hma_fpu_free(vcpu->guestfpu);
		vcpu->guestfpu = NULL;

		vie_free(vcpu->vie_ctx);
		vcpu->vie_ctx = NULL;

		vmc_destroy(vcpu->vmclient);
		vcpu->vmclient = NULL;

		ctxop_free(vcpu->ctxop);
		mutex_destroy(&vcpu->lock);
	}
}

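/*
 * Initialize (or reinitialize) the state of a vCPU.  With 'create' set,
 * allocation-time resources (lock, guest FPU state, stats, instruction
 * emulation context) are set up as well; otherwise only the per-boot state
 * is reset.
 */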
static void
vcpu_init(struct vm *vm, int vcpu_id, bool create)
{
	struct vcpu *vcpu;

	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
	    ("vcpu_init: invalid vcpu %d", vcpu_id));

	vcpu = &vm->vcpu[vcpu_id];

	if (create) {
		mutex_init(&vcpu->lock, NULL, MUTEX_ADAPTIVE, NULL);

		vcpu->state = VCPU_IDLE;
		vcpu->hostcpu = NOCPU;
		vcpu->lastloccpu = NOCPU;
		vcpu->guestfpu = hma_fpu_alloc(KM_SLEEP);
		vcpu->stats = vmm_stat_alloc();
		vcpu->vie_ctx = vie_alloc();

		vcpu->ustate = VU_INIT;
		vcpu->ustate_when = gethrtime();

		vcpu->vtc.vtc_vm = vm;
		vcpu->vtc.vtc_vcpuid = vcpu_id;
		vcpu->ctxop = ctxop_allocate(&vmm_ctxop_tpl, &vcpu->vtc);
	} else {
		vie_reset(vcpu->vie_ctx);
		bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
		if (vcpu->ustate != VU_INIT) {
			vcpu_ustate_change(vm, vcpu_id, VU_INIT);
		}
		bzero(&vcpu->mtrr, sizeof (vcpu->mtrr));
	}

	vcpu->run_state = VRS_HALT;
	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
	(void) vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
	vcpu->reqidle = 0;
	vcpu->exit_intinfo = 0;
	vcpu->nmi_pending = false;
	vcpu->extint_pending = false;
	vcpu->exc_pending = 0;
	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
	(void) hma_fpu_init(vcpu->guestfpu);
	vmm_stat_init(vcpu->stats);
	vcpu->tsc_offset = 0;
}

int
vcpu_trace_exceptions(struct vm *vm, int vcpuid)
{

	return (trace_guest_exceptions);
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= vm->maxcpus)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

struct vie *
vm_vie_ctx(struct vm *vm, int cpuid)
{
	if (cpuid < 0 || cpuid >= vm->maxcpus)
		panic("vm_vie_ctx: invalid cpuid %d", cpuid);

	return (vm->vcpu[cpuid].vie_ctx);
}

static int
vmm_init(void)
{
	vmm_host_state_init();

	if (vmm_is_intel()) {
		ops = &vmm_ops_intel;
		pte_ops = &ept_pte_ops;
	} else if (vmm_is_svm()) {
		ops = &vmm_ops_amd;
		pte_ops = &rvi_pte_ops;
	} else {
		return (ENXIO);
	}

	return (VMM_INIT());
}

int
vmm_mod_load()
{
	int error;

	VERIFY(vmm_initialized == 0);

	error = vmm_init();
	if (error == 0)
		vmm_initialized = 1;

	return (error);
}

int
vmm_mod_unload()
{
	int error;

	VERIFY(vmm_initialized == 1);

	error = VMM_CLEANUP();
	if (error)
		return (error);
	vmm_initialized = 0;

	return (0);
}

/*
 * Create a test IOMMU domain to see if the host system has the necessary
 * hardware and drivers to do so.
 */
bool
vmm_check_iommu(void)
{
	void *domain;
	const size_t arb_test_sz = (1UL << 32);

	domain = iommu_create_domain(arb_test_sz);
	if (domain == NULL) {
		return (false);
	}
	iommu_destroy_domain(domain);
	return (true);
}

static void
vm_init(struct vm *vm, bool create)
{
	int i;

	vm->cookie = VMINIT(vm);
	vm->iommu = NULL;
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);
	vm->vatpic = vatpic_init(vm);
	vm->vatpit = vatpit_init(vm);
	vm->vpmtmr = vpmtmr_init(vm);
	if (create)
		vm->vrtc = vrtc_init(vm);

	vm_inout_init(vm, &vm->ioports);

	CPU_ZERO(&vm->active_cpus);
	CPU_ZERO(&vm->debug_cpus);

	vm->suspend = 0;
	CPU_ZERO(&vm->suspended_cpus);

	for (i = 0; i < vm->maxcpus; i++)
		vcpu_init(vm, i, create);

	/*
	 * Configure the VM-wide TSC offset so that the call to vm_init()
	 * represents the boot time (when the TSC(s) read 0).  Each vCPU will
	 * have its own offset from this, which is altered if/when the guest
	 * writes to MSR_TSC.
	 *
	 * The TSC offsetting math is all unsigned, using overflow for
	 * negative offsets.  A reading of the TSC is negated to form the
	 * boot offset.
	 */
	const uint64_t boot_tsc = rdtsc_offset();
	vm->boot_tsc_offset = (uint64_t)(-(int64_t)boot_tsc);

	/* Convert the boot TSC reading to hrtime */
	vm->boot_hrtime = (hrtime_t)boot_tsc;
	scalehrtime(&vm->boot_hrtime);
}

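/*
 * Example of the offset math above: if the host TSC read T at vm_init()
 * time, boot_tsc_offset is stored as (uint64_t)-T, so a guest computing
 * (host TSC + boot_tsc_offset) shortly after boot observes a value near 0.
 */
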
/*
 * The default CPU topology is a single thread per package.
 */
uint_t cores_per_package = 1;
uint_t threads_per_core = 1;

/*
 * Debugging tunable to enable dirty-page-tracking.
 * (Remains off by default for now)
 */
bool gpt_track_dirty = false;

int
vm_create(uint64_t flags, struct vm **retvm)
{
	struct vm *vm;
	struct vmspace *vmspace;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	vmspace = vmspace_alloc(VM_MAXUSER_ADDRESS, pte_ops, gpt_track_dirty);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = kmem_zalloc(sizeof (struct vm), KM_SLEEP);

	vm->vmspace = vmspace;
	vm->mem_transient = (flags & VCF_RESERVOIR_MEM) == 0;
	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		vm->vcpu[i].vmclient = vmspace_client_alloc(vmspace);
	}

	vm->sockets = 1;
	vm->cores = cores_per_package;	/* XXX backwards compatibility */
	vm->threads = threads_per_core;	/* XXX backwards compatibility */
	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */

	vm_init(vm, true);

	*retvm = vm;
	return (0);
}

void
vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
    uint16_t *threads, uint16_t *maxcpus)
{
	*sockets = vm->sockets;
	*cores = vm->cores;
	*threads = vm->threads;
	*maxcpus = vm->maxcpus;
}

uint16_t
vm_get_maxcpus(struct vm *vm)
{
	return (vm->maxcpus);
}

int
vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
    uint16_t threads, uint16_t maxcpus)
{
	if (maxcpus != 0)
		return (EINVAL);	/* XXX remove when supported */
	if ((sockets * cores * threads) > vm->maxcpus)
		return (EINVAL);
	/* XXX need to check sockets * cores * threads == vCPU, how? */
	vm->sockets = sockets;
	vm->cores = cores;
	vm->threads = threads;
	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */
	return (0);
}

static void
vm_cleanup(struct vm *vm, bool destroy)
{
	struct mem_map *mm;
	int i;

	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	/*
	 * Devices which attach their own ioport hooks should be cleaned up
	 * first so they can tear down those registrations.
	 */
	vpmtmr_cleanup(vm->vpmtmr);

	vm_inout_cleanup(vm, &vm->ioports);

	if (destroy)
		vrtc_cleanup(vm->vrtc);
	else
		vrtc_reset(vm->vrtc);

	vatpit_cleanup(vm->vatpit);
	vhpet_cleanup(vm->vhpet);
	vatpic_cleanup(vm->vatpic);
	vioapic_cleanup(vm->vioapic);

	for (i = 0; i < vm->maxcpus; i++)
		vcpu_cleanup(vm, i, destroy);

	VMCLEANUP(vm->cookie);

	/*
	 * System memory is removed from the guest address space only when
	 * the VM is destroyed.  This is because the mapping remains the same
	 * across VM reset.
	 *
	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
	 * so those mappings are removed on a VM reset.
	 */
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (destroy || !sysmem_mapping(vm, mm)) {
			vm_free_memmap(vm, i);
		} else {
			/*
			 * We need to reset the IOMMU flag so this mapping can
			 * be reused when a VM is rebooted.  Since the IOMMU
			 * domain has already been destroyed we can just reset
			 * the flag here.
			 */
			mm->flags &= ~VM_MEMMAP_F_IOMMU;
		}
	}

	if (destroy) {
		for (i = 0; i < VM_MAX_MEMSEGS; i++)
			vm_free_memseg(vm, i);

		vmspace_destroy(vm->vmspace);
		vm->vmspace = NULL;
	}
}

void
vm_destroy(struct vm *vm)
{
	vm_cleanup(vm, true);
	kmem_free(vm, sizeof (*vm));
}

int
vm_reinit(struct vm *vm, uint64_t flags)
{
	/* A virtual machine can be reset only if all vcpus are suspended. */
	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) != 0) {
		if ((flags & VM_REINIT_F_FORCE_SUSPEND) == 0) {
			return (EBUSY);
		}

		/*
		 * Force the VM (and all its vCPUs) into a suspended state.
		 * This should be quick and easy, since the vm_reinit() call is
		 * made while holding the VM write lock, which requires holding
		 * all of the vCPUs in the VCPU_FROZEN state.
		 */
		(void) atomic_cmpset_int((uint_t *)&vm->suspend, 0,
		    VM_SUSPEND_RESET);
		for (uint_t i = 0; i < vm->maxcpus; i++) {
			struct vcpu *vcpu = &vm->vcpu[i];

			if (CPU_ISSET(i, &vm->suspended_cpus) ||
			    !CPU_ISSET(i, &vm->active_cpus)) {
				continue;
			}

			vcpu_lock(vcpu);
			VERIFY3U(vcpu->state, ==, VCPU_FROZEN);
			CPU_SET_ATOMIC(i, &vm->suspended_cpus);
			vcpu_unlock(vcpu);
		}

		VERIFY0(CPU_CMP(&vm->suspended_cpus, &vm->active_cpus));
	}

	vm_cleanup(vm, false);
	vm_init(vm, false);
	return (0);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	vm_object_t *obj;

	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
		return (ENOMEM);
	else
		return (0);
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	return (vmspace_unmap(vm->vmspace, gpa, gpa + len));
}

/*
 * Return 'true' if 'gpa' is allocated in the guest address space.
 *
 * This function is called in the context of a running vcpu which acts as
 * an implicit lock on 'vm->mem_maps[]'.
 */
bool
vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
{
	struct mem_map *mm;
	int i;

#ifdef INVARIANTS
	int hostcpu, state;
	state = vcpu_get_state(vm, vcpuid, &hostcpu);
	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
#endif

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
			return (true);	/* 'gpa' is sysmem or devmem */
	}

	if (ppt_is_mmio(vm, gpa))
		return (true);		/* 'gpa' is pci passthru mmio */

	return (false);
}

int
vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
{
	struct mem_seg *seg;
	vm_object_t *obj;

	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
		return (EINVAL);

	if (len == 0 || (len & PAGE_MASK))
		return (EINVAL);

	seg = &vm->mem_segs[ident];
	if (seg->object != NULL) {
		if (seg->len == len && seg->sysmem == sysmem)
			return (EEXIST);
		else
			return (EINVAL);
	}

	obj = vm_object_mem_allocate(len, vm->mem_transient);
	if (obj == NULL)
		return (ENOMEM);

	seg->len = len;
	seg->object = obj;
	seg->sysmem = sysmem;
	return (0);
}

int
vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
    vm_object_t **objptr)
{
	struct mem_seg *seg;

	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
		return (EINVAL);

	seg = &vm->mem_segs[ident];
	if (len)
		*len = seg->len;
	if (sysmem)
		*sysmem = seg->sysmem;
	if (objptr)
		*objptr = seg->object;
	return (0);
}

void
vm_free_memseg(struct vm *vm, int ident)
{
	struct mem_seg *seg;

	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
	    ("%s: invalid memseg ident %d", __func__, ident));

	seg = &vm->mem_segs[ident];
	if (seg->object != NULL) {
		vm_object_release(seg->object);
		bzero(seg, sizeof (struct mem_seg));
	}
}

int
vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
    size_t len, int prot, int flags)
{
	struct mem_seg *seg;
	struct mem_map *m, *map;
	vm_ooffset_t last;
	int i, error;

	if (prot == 0 || (prot & ~(PROT_ALL)) != 0)
		return (EINVAL);

	if (flags & ~VM_MEMMAP_F_WIRED)
		return (EINVAL);

	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
		return (EINVAL);

	seg = &vm->mem_segs[segid];
	if (seg->object == NULL)
		return (EINVAL);

	last = first + len;
	if (first < 0 || first >= last || last > seg->len)
		return (EINVAL);

	if ((gpa | first | last) & PAGE_MASK)
		return (EINVAL);

	map = NULL;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		m = &vm->mem_maps[i];
		if (m->len == 0) {
			map = m;
			break;
		}
	}

	if (map == NULL)
		return (ENOSPC);

	error = vmspace_map(vm->vmspace, seg->object, first, gpa, len, prot);
	if (error != 0)
		return (EFAULT);

	vm_object_reference(seg->object);

	if ((flags & VM_MEMMAP_F_WIRED) != 0) {
		error = vmspace_populate(vm->vmspace, gpa, gpa + len);
		if (error != 0) {
			VERIFY0(vmspace_unmap(vm->vmspace, gpa, gpa + len));
			return (EFAULT);
		}
	}

	map->gpa = gpa;
	map->len = len;
	map->segoff = first;
	map->segid = segid;
	map->prot = prot;
	map->flags = flags;
	return (0);
}

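/*
 * Remove the memory map matching the given gpa/len from the guest address
 * space.  Mappings which have been wired into the IOMMU for passthrough
 * (VM_MEMMAP_F_IOMMU) are skipped and cannot be unmapped this way.
 */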
int
vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	struct mem_map *m;
	int i;

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		m = &vm->mem_maps[i];
		if (m->gpa == gpa && m->len == len &&
		    (m->flags & VM_MEMMAP_F_IOMMU) == 0) {
			vm_free_memmap(vm, i);
			return (0);
		}
	}

	return (EINVAL);
}

int
vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
	struct mem_map *mm, *mmnext;
	int i;

	mmnext = NULL;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (mm->len == 0 || mm->gpa < *gpa)
			continue;
		if (mmnext == NULL || mm->gpa < mmnext->gpa)
			mmnext = mm;
	}

	if (mmnext != NULL) {
		*gpa = mmnext->gpa;
		if (segid)
			*segid = mmnext->segid;
		if (segoff)
			*segoff = mmnext->segoff;
		if (len)
			*len = mmnext->len;
		if (prot)
			*prot = mmnext->prot;
		if (flags)
			*flags = mmnext->flags;
		return (0);
	} else {
		return (ENOENT);
	}
}

static void
vm_free_memmap(struct vm *vm, int ident)
{
	struct mem_map *mm;
	int error;

	mm = &vm->mem_maps[ident];
	if (mm->len) {
		error = vmspace_unmap(vm->vmspace, mm->gpa,
		    mm->gpa + mm->len);
		KASSERT(error == 0, ("%s: vmspace_unmap error %d",
		    __func__, error));
		bzero(mm, sizeof (struct mem_map));
	}
}

static __inline bool
sysmem_mapping(struct vm *vm, struct mem_map *mm)
{

	if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
		return (true);
	else
		return (false);
}

vm_paddr_t
vmm_sysmem_maxaddr(struct vm *vm)
{
	struct mem_map *mm;
	vm_paddr_t maxaddr;
	int i;

	maxaddr = 0;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (sysmem_mapping(vm, mm)) {
			if (maxaddr < mm->gpa + mm->len)
				maxaddr = mm->gpa + mm->len;
		}
	}
	return (maxaddr);
}

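/*
 * Add or remove the guest's wired system-memory mappings in the IOMMU
 * domain used for PCI passthrough.  This is invoked when the first
 * passthrough device is assigned to the VM and again when the last one is
 * unassigned.
 */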
static void
vm_iommu_modify(struct vm *vm, bool map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_map *mm;
	vm_client_t *vmc;

	sz = PAGE_SIZE;
	vmc = vmspace_client_alloc(vm->vmspace);

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (!sysmem_mapping(vm, mm))
			continue;

		if (map) {
			KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
			    ("iommu map found invalid memmap %lx/%lx/%x",
			    mm->gpa, mm->len, mm->flags));
			if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
				continue;
			mm->flags |= VM_MEMMAP_F_IOMMU;
		} else {
			if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
				continue;
			mm->flags &= ~VM_MEMMAP_F_IOMMU;
			KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
			    ("iommu unmap found invalid memmap %lx/%lx/%x",
			    mm->gpa, mm->len, mm->flags));
		}

		gpa = mm->gpa;
		while (gpa < mm->gpa + mm->len) {
			vm_page_t *vmp;

			vmp = vmc_hold(vmc, gpa, PROT_WRITE);
			ASSERT(vmp != NULL);
			hpa = ((uintptr_t)vmp_get_pfn(vmp) << PAGESHIFT);
			(void) vmp_release(vmp);

			/*
			 * When originally ported from FreeBSD, the logic for
			 * adding memory to the guest domain would
			 * simultaneously remove it from the host domain.  The
			 * justification for that is not clear, and FreeBSD has
			 * subsequently changed the behavior to not remove the
			 * memory from the host domain.
			 *
			 * Leaving the guest memory in the host domain for the
			 * life of the VM is necessary to make it available for
			 * DMA, such as through viona in the TX path.
			 */
			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
			}

			gpa += PAGE_SIZE;
		}
	}
	vmc_destroy(vmc);

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
	iommu_invalidate_tlb(vm->iommu);
}

int
vm_unassign_pptdev(struct vm *vm, int pptfd)
{
	int error;

	error = ppt_unassign_device(vm, pptfd);
	if (error)
		return (error);

	if (ppt_assigned_devices(vm) == 0)
		vm_iommu_modify(vm, false);

	return (0);
}

int
vm_assign_pptdev(struct vm *vm, int pptfd)
{
	int error;
	vm_paddr_t maxaddr;

	/* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
	if (ppt_assigned_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vmm_sysmem_maxaddr(vm);
		vm->iommu = iommu_create_domain(maxaddr);
		if (vm->iommu == NULL)
			return (ENXIO);
		vm_iommu_modify(vm, true);
	}

	error = ppt_assign_device(vm, pptfd);
	return (error);
}

int
vm_get_register(struct vm *vm, int vcpuid, int reg, uint64_t *retval)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	switch (reg) {
	case VM_REG_GUEST_XCR0:
		*retval = vcpu->guest_xcr0;
		return (0);
	default:
		return (VMGETREG(vm->cookie, vcpuid, reg, retval));
	}
}

int
vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	int error;
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	switch (reg) {
	case VM_REG_GUEST_RIP:
		error = VMSETREG(vm->cookie, vcpuid, reg, val);
		if (error == 0) {
			vcpu->nextrip = val;
		}
		return (error);
	case VM_REG_GUEST_XCR0:
		if (!validate_guest_xcr0(val, vmm_get_host_xcr0())) {
			return (EINVAL);
		}
		vcpu->guest_xcr0 = val;
		return (0);
	default:
		return (VMSETREG(vm->cookie, vcpuid, reg, val));
	}
}

static bool
is_descriptor_table(int reg)
{
	switch (reg) {
	case VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:
		return (true);
	default:
		return (false);
	}
}

static bool
is_segment_register(int reg)
{
	switch (reg) {
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:
		return (true);
	default:
		return (false);
	}
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
{

	if (vcpu < 0 || vcpu >= vm->maxcpus)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= vm->maxcpus)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}

static int
translate_hma_xsave_result(hma_fpu_xsave_result_t res)
{
	switch (res) {
	case HFXR_OK:
		return (0);
	case HFXR_NO_SPACE:
		return (ENOSPC);
	case HFXR_BAD_ALIGN:
	case HFXR_UNSUP_FMT:
	case HFXR_UNSUP_FEAT:
	case HFXR_INVALID_DATA:
		return (EINVAL);
	default:
		panic("unexpected xsave result");
	}
}

int
vm_get_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	hma_fpu_xsave_result_t res;

	res = hma_fpu_get_xsave_state(vcpu->guestfpu, buf, len);
	return (translate_hma_xsave_result(res));
}

int
vm_set_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	hma_fpu_xsave_result_t res;

	res = hma_fpu_set_xsave_state(vcpu->guestfpu, buf, len);
	return (translate_hma_xsave_result(res));
}

int
vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
		return (EINVAL);
	}

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	*state = vcpu->run_state;
	*sipi_vec = vcpu->sipi_vector;
	vcpu_unlock(vcpu);

	return (0);
}

int
vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
		return (EINVAL);
	}
	if (!VRS_IS_VALID(state)) {
		return (EINVAL);
	}

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	vcpu->run_state = state;
	vcpu->sipi_vector = sipi_vec;
	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
	vcpu_unlock(vcpu);

	return (0);
}

void
vm_track_dirty_pages(struct vm *vm, uint64_t gpa, size_t len, uint8_t *bitmap)
{
	vmspace_t *vms = vm_get_vmspace(vm);
	vmspace_track_dirty(vms, gpa, len, bitmap);
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{
	/* Save host FPU and restore guest FPU */
	fpu_stop_emulating();
	hma_fpu_start_guest(vcpu->guestfpu);

	/* restore guest XCR0 if XSAVE is enabled in the host */
	if (rcr4() & CR4_XSAVE)
		load_xcr(0, vcpu->guest_xcr0);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{

	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest XCR0 and restore host XCR0 */
	if (rcr4() & CR4_XSAVE) {
		vcpu->guest_xcr0 = rxcr(0);
		load_xcr(0, vmm_get_host_xcr0());
	}

	/* save guest FPU and restore host FPU */
	fpu_stop_emulating();
	hma_fpu_stop_guest(vcpu->guestfpu);
	/*
	 * When the host state has been restored, we should not re-enable
	 * CR0.TS on illumos for eager FPU.
	 */
}

static int
vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
    bool from_idle)
{
	struct vcpu *vcpu;
	int error;

	vcpu = &vm->vcpu[vcpuid];
	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state.  This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
	 */
	if (from_idle) {
		while (vcpu->state != VCPU_IDLE) {
			vcpu->reqidle = 1;
			vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
			cv_wait(&vcpu->state_cv, &vcpu->lock);
		}
	} else {
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
		    "vcpu idle state"));
	}

	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
	} else {
		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
		    "vcpu that is not running", vcpu->hostcpu));
	}

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error)
		return (EBUSY);

	vcpu->state = newstate;
	if (newstate == VCPU_RUNNING)
		vcpu->hostcpu = curcpu;
	else
		vcpu->hostcpu = NOCPU;

	if (newstate == VCPU_IDLE) {
		cv_broadcast(&vcpu->state_cv);
	}

	return (0);
}

static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
{
	struct vcpu *vcpu;
	int vcpu_halted, vm_halted;
	bool userspace_exit = false;

	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));

	vcpu = &vm->vcpu[vcpuid];
	vcpu_halted = 0;
	vm_halted = 0;

	vcpu_lock(vcpu);
	while (1) {
		/*
		 * Do a final check for pending interrupts (including NMI and
		 * INIT) before putting this thread to sleep.
		 */
		if (vm_nmi_pending(vm, vcpuid))
			break;
		if (vcpu_run_state_pending(vm, vcpuid))
			break;
		if (!intr_disabled) {
			if (vm_extint_pending(vm, vcpuid) ||
			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
				break;
			}
		}

		/*
		 * Also check for software events which would cause a wake-up.
		 * This will set the appropriate exitcode directly, rather than
		 * requiring a trip through VM_RUN().
		 */
		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
			userspace_exit = true;
			break;
		}

		/*
		 * Some Linux guests implement "halt" by having all vcpus
		 * execute HLT with interrupts disabled.  'halted_cpus' keeps
		 * track of the vcpus that have entered this state.  When all
		 * vcpus enter the halted state the virtual machine is halted.
		 */
		if (intr_disabled) {
			if (!vcpu_halted && halt_detection_enabled) {
				vcpu_halted = 1;
				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
			}
			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
				vm_halted = 1;
				break;
			}
		}

		vcpu_ustate_change(vm, vcpuid, VU_IDLE);
		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
		vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
	}

	if (vcpu_halted)
		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);

	vcpu_unlock(vcpu);

	if (vm_halted) {
		(void) vm_suspend(vm, VM_SUSPEND_HALT);
	}

	return (userspace_exit ? -1 : 0);
}

static int
vm_handle_paging(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	vm_client_t *vmc = vcpu->vmclient;
	struct vm_exit *vme = &vcpu->exitinfo;
	int rv, ftype;

	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
	    __func__, vme->inst_length));

	ftype = vme->u.paging.fault_type;
	KASSERT(ftype == PROT_READ ||
	    ftype == PROT_WRITE || ftype == PROT_EXEC,
	    ("vm_handle_paging: invalid fault_type %d", ftype));

	rv = vmc_fault(vmc, vme->u.paging.gpa, ftype);

	if (rv != 0)
		return (EFAULT);
	return (0);
}

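/*
 * Service an MMIO read by dispatching on guest-physical address to the
 * in-kernel device models: the local APIC page, the I/O APIC window, or
 * the HPET registers.  Accesses outside those ranges return ESRCH so the
 * caller can forward them out to userspace.
 */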
int
vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
    int rsize)
{
	int err = ESRCH;

	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		struct vlapic *vlapic = vm_lapic(vm, cpuid);

		err = vlapic_mmio_read(vlapic, gpa, rval, rsize);
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize);
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize);
	}

	return (err);
}

int
vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
    int wsize)
{
	int err = ESRCH;

	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		struct vlapic *vlapic = vm_lapic(vm, cpuid);

		err = vlapic_mmio_write(vlapic, gpa, wval, wsize);
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize);
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize);
	}

	return (err);
}

static int
vm_handle_mmio_emul(struct vm *vm, int vcpuid)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	uint64_t inst_addr;
	int error, fault, cs_d;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	vie = vcpu->vie_ctx;

	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
	    __func__, vme->inst_length));

	inst_addr = vme->rip + vme->u.mmio_emul.cs_base;
	cs_d = vme->u.mmio_emul.cs_d;

	/* Fetch the faulting instruction */
	if (vie_needs_fetch(vie)) {
		error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr,
		    &fault);
		if (error != 0) {
			return (error);
		} else if (fault) {
			/*
			 * If a fault during instruction fetch was encountered,
			 * it will have asserted that the appropriate exception
			 * be injected at next entry.
			 * No further work is required.
			 */
			return (0);
		}
	}

	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
		/* Dump (unrecognized) instruction bytes in userspace */
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}
	if (vme->u.mmio_emul.gla != VIE_INVALID_GLA &&
	    vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) {
		/* Decoded GLA does not match GLA from VM exit state */
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}

repeat:
	error = vie_emulate_mmio(vie, vm, vcpuid);
	if (error < 0) {
		/*
		 * MMIO not handled by any of the in-kernel-emulated devices,
		 * so make a trip out to userspace for it.
		 */
		vie_exitinfo(vie, vme);
	} else if (error == EAGAIN) {
		/*
		 * Continue emulating the rep-prefixed instruction, which has
		 * not completed its iterations.
		 *
		 * In case this can be emulated in-kernel and has a high
		 * repetition count (causing a tight spin), it should be
		 * deferential to yield conditions.
		 */
		if (!vcpu_should_yield(vm, vcpuid)) {
			goto repeat;
		} else {
			/*
			 * Defer to the contending load by making a trip to
			 * userspace with a no-op (BOGUS) exit reason.
			 */
			vie_reset(vie);
			vme->exitcode = VM_EXITCODE_BOGUS;
			return (-1);
		}
	} else if (error == 0) {
		/* Update %rip now that instruction has been emulated */
		vie_advance_pc(vie, &vcpu->nextrip);
	}
	return (error);
}

static int
vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme)
{
	struct vcpu *vcpu;
	struct vie *vie;
	int err;

	vcpu = &vm->vcpu[vcpuid];
	vie = vcpu->vie_ctx;

repeat:
	err = vie_emulate_inout(vie, vm, vcpuid);

	if (err < 0) {
		/*
		 * In/out not handled by any of the in-kernel-emulated devices,
		 * so make a trip out to userspace for it.
		 */
		vie_exitinfo(vie, vme);
		return (err);
	} else if (err == EAGAIN) {
		/*
		 * Continue emulating the rep-prefixed ins/outs, which has not
		 * completed its iterations.
		 *
		 * In case this can be emulated in-kernel and has a high
		 * repetition count (causing a tight spin), it should be
		 * deferential to yield conditions.
		 */
		if (!vcpu_should_yield(vm, vcpuid)) {
			goto repeat;
		} else {
			/*
			 * Defer to the contending load by making a trip to
			 * userspace with a no-op (BOGUS) exit reason.
			 */
			vie_reset(vie);
			vme->exitcode = VM_EXITCODE_BOGUS;
			return (-1);
		}
	} else if (err != 0) {
		/* Emulation failure.  Bail all the way out to userspace. */
		vme->exitcode = VM_EXITCODE_INST_EMUL;
		bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
		return (-1);
	}

	vie_advance_pc(vie, &vcpu->nextrip);
	return (0);
}

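/*
 * Emulate an instruction which does not fall into the MMIO or in/out
 * categories: fetch and decode it, then hand it to vie_emulate_other(),
 * kicking the exit out to userspace if in-kernel emulation cannot
 * complete.
 */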
static int
vm_handle_inst_emul(struct vm *vm, int vcpuid)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	uint64_t cs_base;
	int error, fault, cs_d;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	vie = vcpu->vie_ctx;

	vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d);

	/* Fetch the faulting instruction */
	ASSERT(vie_needs_fetch(vie));
	error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base,
	    &fault);
	if (error != 0) {
		return (error);
	} else if (fault) {
		/*
		 * If a fault during instruction fetch was encountered, it will
		 * have asserted that the appropriate exception be injected at
		 * next entry.  No further work is required.
		 */
		return (0);
	}

	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
		/* Dump (unrecognized) instruction bytes in userspace */
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}

	error = vie_emulate_other(vie, vm, vcpuid);
	if (error != 0) {
		/*
		 * Instruction emulation was unable to complete successfully,
		 * so kick it out to userspace for handling.
		 */
		vie_fallback_exitinfo(vie, vme);
	} else {
		/* Update %rip now that instruction has been emulated */
		vie_advance_pc(vie, &vcpu->nextrip);
	}
	return (error);
}

static int
vm_handle_suspend(struct vm *vm, int vcpuid)
{
	int i;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);

	/*
	 * Wait until all 'active_cpus' have suspended themselves.
	 */
	vcpu_lock(vcpu);
	vcpu_ustate_change(vm, vcpuid, VU_INIT);
	while (1) {
		int rc;

		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
			break;
		}

		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->lock, hz,
		    TR_CLOCK_TICK);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);

		/*
		 * If the userspace process driving the instance is killed, any
		 * vCPUs yet to be marked suspended (because they are not
		 * VM_RUN-ing in the kernel presently) will never reach that
		 * state.
		 *
		 * To avoid vm_handle_suspend() getting stuck in the kernel
		 * waiting for those vCPUs, offer a bail-out even though it
		 * means returning without all vCPUs in a suspended state.
		 */
		if (rc <= 0) {
			if ((curproc->p_flag & SEXITING) != 0) {
				break;
			}
		}
	}
	vcpu_unlock(vcpu);

	/*
	 * Wake up the other sleeping vcpus and return to userspace.
	 */
	for (i = 0; i < vm->maxcpus; i++) {
		if (CPU_ISSET(i, &vm->suspended_cpus)) {
			vcpu_notify_event(vm, i);
		}
	}

	return (-1);
}

static int
vm_handle_reqidle(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
	vcpu->reqidle = 0;
	vcpu_unlock(vcpu);
	return (-1);
}

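/*
 * Process pending run-state events for a vCPU, emulating the architectural
 * INIT/SIPI start-up sequence: a pending INIT resets the vCPU to its
 * power-on state, after which a pending SIPI directs it to begin running
 * at the supplied vector.
 */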
static int
vm_handle_run_state(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	bool handled = false;

	vcpu_lock(vcpu);
	while (1) {
		if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
			vcpu_unlock(vcpu);
			VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
			vcpu_lock(vcpu);

			vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
			vcpu->run_state |= VRS_INIT;
		}

		if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
		    (VRS_INIT | VRS_PEND_SIPI)) {
			const uint8_t vector = vcpu->sipi_vector;

			vcpu_unlock(vcpu);
			VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
			vcpu_lock(vcpu);

			vcpu->run_state &= ~VRS_PEND_SIPI;
			vcpu->run_state |= VRS_RUN;
		}

		/*
		 * If the vCPU is now in the running state, there is no need to
		 * wait for anything prior to re-entry.
		 */
		if ((vcpu->run_state & VRS_RUN) != 0) {
			handled = true;
			break;
		}

		/*
		 * Also check for software events which would cause a wake-up.
		 * This will set the appropriate exitcode directly, rather than
		 * requiring a trip through VM_RUN().
		 */
		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
			break;
		}

		vcpu_ustate_change(vm, vcpuid, VU_IDLE);
		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
		vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
	}
	vcpu_unlock(vcpu);

	return (handled ? 0 : -1);
}

static int
vm_rdmtrr(const struct vm_mtrr *mtrr, uint32_t num, uint64_t *val)
{
	switch (num) {
	case MSR_MTRRcap:
		*val = MTRR_CAP_WC | MTRR_CAP_FIXED | VMM_MTRR_VAR_MAX;
		break;
	case MSR_MTRRdefType:
		*val = mtrr->def_type;
		break;
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
		*val = mtrr->fixed4k[num - MSR_MTRR4kBase];
		break;
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
		*val = mtrr->fixed16k[num - MSR_MTRR16kBase];
		break;
	case MSR_MTRR64kBase:
		*val = mtrr->fixed64k;
		break;
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: {
		uint_t offset = num - MSR_MTRRVarBase;
		if (offset % 2 == 0) {
			*val = mtrr->var[offset / 2].base;
		} else {
			*val = mtrr->var[offset / 2].mask;
		}
		break;
	}
	default:
		return (-1);
	}

	return (0);
}

static int
vm_wrmtrr(struct vm_mtrr *mtrr, uint32_t num, uint64_t val)
{
	switch (num) {
	case MSR_MTRRcap:
		/* MTRRCAP is read only */
		return (-1);
	case MSR_MTRRdefType:
		if (val & ~VMM_MTRR_DEF_MASK) {
			/* generate #GP on writes to reserved fields */
			return (-1);
		}
		mtrr->def_type = val;
		break;
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
		mtrr->fixed4k[num - MSR_MTRR4kBase] = val;
		break;
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
		mtrr->fixed16k[num - MSR_MTRR16kBase] = val;
		break;
	case MSR_MTRR64kBase:
		mtrr->fixed64k = val;
		break;
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: {
		uint_t offset = num - MSR_MTRRVarBase;
		if (offset % 2 == 0) {
			if (val & ~VMM_MTRR_PHYSBASE_MASK) {
				/* generate #GP on writes to reserved fields */
				return (-1);
			}
			mtrr->var[offset / 2].base = val;
		} else {
			if (val & ~VMM_MTRR_PHYSMASK_MASK) {
				/* generate #GP on writes to reserved fields */
				return (-1);
			}
			mtrr->var[offset / 2].mask = val;
		}
		break;
	}
	default:
		return (-1);
	}

	return (0);
}

static bool
is_mtrr_msr(uint32_t msr)
{
	switch (msr) {
	case MSR_MTRRcap:
	case MSR_MTRRdefType:
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
	case MSR_MTRR64kBase:
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
		return (true);
	default:
		return (false);
	}
}

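/*
 * Handle a RDMSR exit for the subset of MSRs emulated in-kernel: machine
 * check stubs, the MTRRs, and the TSC.  Anything else returns -1 so the
 * exit is forwarded out to userspace for attempted processing there.
 */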
static int
vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	const uint32_t code = vme->u.msr.code;
	uint64_t val = 0;

	switch (code) {
	case MSR_MCG_CAP:
	case MSR_MCG_STATUS:
		val = 0;
		break;

	case MSR_MTRRcap:
	case MSR_MTRRdefType:
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
	case MSR_MTRR64kBase:
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
		if (vm_rdmtrr(&vcpu->mtrr, code, &val) != 0)
			vm_inject_gp(vm, vcpuid);
		break;

	case MSR_TSC:
		/*
		 * In all likelihood, this should always be handled in guest
		 * context by VMX/SVM rather than taking an exit.  (Both VMX
		 * and SVM pass through read-only access to MSR_TSC to the
		 * guest.)
		 *
		 * No physical offset is requested of vcpu_tsc_offset() since
		 * rdtsc_offset() takes care of that instead.
		 */
		val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset();
		break;

	default:
		/*
		 * Anything not handled at this point will be kicked out to
		 * userspace for attempted processing there.
		 */
		return (-1);
	}

	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
	    val & 0xffffffff));
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX,
	    val >> 32));
	return (0);
}

static int
vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	const uint32_t code = vme->u.msr.code;
	const uint64_t val = vme->u.msr.wval;

	switch (code) {
	case MSR_MCG_CAP:
	case MSR_MCG_STATUS:
		/* Ignore writes */
		break;

	case MSR_MTRRcap:
	case MSR_MTRRdefType:
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
	case MSR_MTRR64kBase:
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
		if (vm_wrmtrr(&vcpu->mtrr, code, val) != 0)
			vm_inject_gp(vm, vcpuid);
		break;

	case MSR_TSC:
		/*
		 * The effect of writing the TSC MSR is that a subsequent read
		 * of the TSC would report that value written (plus any time
		 * elapsed between the write and the read).  The guest TSC
		 * value is calculated from a global offset for the guest
		 * (which effectively makes its TSC read 0 at guest boot) and
		 * a per-vCPU offset to handle these writes to the MSR.
		 *
		 * To calculate that per-vCPU offset, we can work backwards
		 * from the guest value at the time of write:
		 *
		 *	value = host TSC + VM boot offset + vCPU offset
		 *
		 * so therefore:
		 *
		 *	value - host TSC - VM boot offset = vCPU offset
		 */
		vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset();
		break;

	default:
		/*
		 * Anything not handled at this point will be kicked out to
		 * userspace for attempted processing there.
		 */
		return (-1);
	}

	return (0);
}

int
vm_suspend(struct vm *vm, enum vm_suspend_how how)
{
	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
		return (EINVAL);

	if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) {
		return (EALREADY);
	}

	/*
	 * Notify all active vcpus that they are now suspended.
	 */
	for (uint_t i = 0; i < vm->maxcpus; i++) {
		struct vcpu *vcpu = &vm->vcpu[i];

		vcpu_lock(vcpu);
		if (vcpu->state == VCPU_IDLE || vcpu->state == VCPU_FROZEN) {
			/*
			 * Any vCPUs not actively running or in HLT can be
			 * marked as suspended immediately.
			 */
			if (CPU_ISSET(i, &vm->active_cpus)) {
				CPU_SET_ATOMIC(i, &vm->suspended_cpus);
			}
		} else {
			/*
			 * Those which are running or in HLT will pick up the
			 * suspended state after notification.
			 */
			vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
		}
		vcpu_unlock(vcpu);
	}
	return (0);
}

void
vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
{
	struct vm_exit *vmexit;

	vmexit = vm_exitinfo(vm, vcpuid);
	vmexit->rip = rip;
	vmexit->inst_length = 0;
	vmexit->exitcode = VM_EXITCODE_RUN_STATE;
	vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
}

/*
 * Some vmm resources, such as the lapic, may have CPU-specific resources
 * allocated to them which would benefit from migration onto the host CPU
 * which is processing the vcpu state.
 */
static void
vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
{
	/*
	 * Localizing cyclic resources requires acquisition of cpu_lock, and
	 * doing so with kpreempt disabled is a recipe for deadlock disaster.
	 */
	VERIFY(curthread->t_preempt == 0);

	/*
	 * Do not bother with localization if this vCPU is about to return to
	 * the host CPU it was last localized to.
	 */
	if (vcpu->lastloccpu == curcpu)
		return;

	/*
	 * Localize system-wide resources to the primary boot vCPU.  While any
	 * of the other vCPUs may access them, it keeps the potential interrupt
	 * footprint constrained to CPUs involved with this instance.
	 */
	if (vcpu == &vm->vcpu[0]) {
		vhpet_localize_resources(vm->vhpet);
		vrtc_localize_resources(vm->vrtc);
		vatpit_localize_resources(vm->vatpit);
	}

	vlapic_localize_resources(vcpu->vlapic);

	vcpu->lastloccpu = curcpu;
}

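/*
 * Context-switch hooks installed via ctxop on the vCPU thread:
 * vmm_savectx() runs as the thread goes off-cpu and vmm_restorectx() as it
 * comes back on, keeping guest FPU state and microstate accounting
 * consistent across context switches.
 */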
2158 */ 2159 if (vcpu == &vm->vcpu[0]) { 2160 vhpet_localize_resources(vm->vhpet); 2161 vrtc_localize_resources(vm->vrtc); 2162 vatpit_localize_resources(vm->vatpit); 2163 } 2164 2165 vlapic_localize_resources(vcpu->vlapic); 2166 2167 vcpu->lastloccpu = curcpu; 2168 } 2169 2170 static void 2171 vmm_savectx(void *arg) 2172 { 2173 vm_thread_ctx_t *vtc = arg; 2174 struct vm *vm = vtc->vtc_vm; 2175 const int vcpuid = vtc->vtc_vcpuid; 2176 2177 if (ops->vmsavectx != NULL) { 2178 ops->vmsavectx(vm->cookie, vcpuid); 2179 } 2180 2181 /* 2182 * Account for going off-cpu, unless the vCPU is idled, where being 2183 * off-cpu is the explicit point. 2184 */ 2185 if (vm->vcpu[vcpuid].ustate != VU_IDLE) { 2186 vtc->vtc_ustate = vm->vcpu[vcpuid].ustate; 2187 vcpu_ustate_change(vm, vcpuid, VU_SCHED); 2188 } 2189 2190 /* 2191 * If the CPU holds the restored guest FPU state, save it and restore 2192 * the host FPU state before this thread goes off-cpu. 2193 */ 2194 if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) { 2195 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2196 2197 save_guest_fpustate(vcpu); 2198 vtc->vtc_status &= ~VTCS_FPU_RESTORED; 2199 } 2200 } 2201 2202 static void 2203 vmm_restorectx(void *arg) 2204 { 2205 vm_thread_ctx_t *vtc = arg; 2206 struct vm *vm = vtc->vtc_vm; 2207 const int vcpuid = vtc->vtc_vcpuid; 2208 2209 /* Complete microstate accounting for vCPU being off-cpu */ 2210 if (vm->vcpu[vcpuid].ustate != VU_IDLE) { 2211 vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate); 2212 } 2213 2214 /* 2215 * When coming back on-cpu, only restore the guest FPU status if the 2216 * thread is in a context marked as requiring it. This should be rare, 2217 * occurring only when a future logic error results in a voluntary 2218 * sleep during the VMRUN critical section. 2219 * 2220 * The common case will result in elision of the guest FPU state 2221 * restoration, deferring that action until it is clearly necessary 2222 * during vm_run. 
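 * (The VERIFY below encodes that expectation: VTCS_FPU_RESTORED must not still be set when the thread comes back on-cpu.)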
2223 */ 2224 VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0); 2225 if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) { 2226 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2227 2228 restore_guest_fpustate(vcpu); 2229 vtc->vtc_status |= VTCS_FPU_RESTORED; 2230 } 2231 2232 if (ops->vmrestorectx != NULL) { 2233 ops->vmrestorectx(vm->cookie, vcpuid); 2234 } 2235 2236 } 2237 2238 static int 2239 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry, 2240 struct vm_exit *vme) 2241 { 2242 struct vcpu *vcpu; 2243 struct vie *vie; 2244 int err; 2245 2246 vcpu = &vm->vcpu[vcpuid]; 2247 vie = vcpu->vie_ctx; 2248 err = 0; 2249 2250 switch (entry->cmd) { 2251 case VEC_DEFAULT: 2252 return (0); 2253 case VEC_DISCARD_INSTR: 2254 vie_reset(vie); 2255 return (0); 2256 case VEC_FULFILL_MMIO: 2257 err = vie_fulfill_mmio(vie, &entry->u.mmio); 2258 if (err == 0) { 2259 err = vie_emulate_mmio(vie, vm, vcpuid); 2260 if (err == 0) { 2261 vie_advance_pc(vie, &vcpu->nextrip); 2262 } else if (err < 0) { 2263 vie_exitinfo(vie, vme); 2264 } else if (err == EAGAIN) { 2265 /* 2266 * Clear the instruction emulation state in 2267 * order to re-enter VM context and continue 2268 * this 'rep <instruction>' 2269 */ 2270 vie_reset(vie); 2271 err = 0; 2272 } 2273 } 2274 break; 2275 case VEC_FULFILL_INOUT: 2276 err = vie_fulfill_inout(vie, &entry->u.inout); 2277 if (err == 0) { 2278 err = vie_emulate_inout(vie, vm, vcpuid); 2279 if (err == 0) { 2280 vie_advance_pc(vie, &vcpu->nextrip); 2281 } else if (err < 0) { 2282 vie_exitinfo(vie, vme); 2283 } else if (err == EAGAIN) { 2284 /* 2285 * Clear the instruction emulation state in 2286 * order to re-enter VM context and continue 2287 * this 'rep ins/outs' 2288 */ 2289 vie_reset(vie); 2290 err = 0; 2291 } 2292 } 2293 break; 2294 default: 2295 return (EINVAL); 2296 } 2297 return (err); 2298 } 2299 2300 static int 2301 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme) 2302 { 2303 struct vie *vie; 2304 2305 vie = vm->vcpu[vcpuid].vie_ctx; 2306 2307 if (vie_pending(vie)) { 2308 /* 2309 * Userspace has not fulfilled the pending needs of the 2310 * instruction emulation, so bail back out. 2311 */ 2312 vie_exitinfo(vie, vme); 2313 return (-1); 2314 } 2315 2316 return (0); 2317 } 2318 2319 int 2320 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry) 2321 { 2322 int error; 2323 struct vcpu *vcpu; 2324 struct vm_exit *vme; 2325 bool intr_disabled; 2326 int affinity_type = CPU_CURRENT; 2327 2328 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2329 return (EINVAL); 2330 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 2331 return (EINVAL); 2332 2333 vcpu = &vm->vcpu[vcpuid]; 2334 vme = &vcpu->exitinfo; 2335 2336 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); 2337 2338 vcpu->vtc.vtc_status = 0; 2339 ctxop_attach(curthread, vcpu->ctxop); 2340 2341 error = vm_entry_actions(vm, vcpuid, entry, vme); 2342 if (error != 0) { 2343 goto exit; 2344 } 2345 2346 restart: 2347 error = vm_loop_checks(vm, vcpuid, vme); 2348 if (error != 0) { 2349 goto exit; 2350 } 2351 2352 thread_affinity_set(curthread, affinity_type); 2353 /* 2354 * Resource localization should happen after the CPU affinity for the 2355 * thread has been set to ensure that access from restricted contexts, 2356 * such as VMX-accelerated APIC operations, can occur without inducing 2357 * cyclic cross-calls. 2358 * 2359 * This must be done prior to disabling kpreempt via critical_enter(). 
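 * (vm_localize_resources() VERIFYs this ordering by requiring curthread->t_preempt == 0.)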
2360 */ 2361 vm_localize_resources(vm, vcpu); 2362 affinity_type = CPU_CURRENT; 2363 critical_enter(); 2364 2365 /* Force a trip through update_sregs to reload %fs/%gs and friends */ 2366 PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb); 2367 2368 if ((vcpu->vtc.vtc_status & VTCS_FPU_RESTORED) == 0) { 2369 restore_guest_fpustate(vcpu); 2370 vcpu->vtc.vtc_status |= VTCS_FPU_RESTORED; 2371 } 2372 vcpu->vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL; 2373 2374 vcpu_require_state(vm, vcpuid, VCPU_RUNNING); 2375 error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip); 2376 vcpu_require_state(vm, vcpuid, VCPU_FROZEN); 2377 2378 /* 2379 * Once clear of the delicate contexts comprising the VM_RUN handler, 2380 * thread CPU affinity can be loosened while other processing occurs. 2381 */ 2382 vcpu->vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL; 2383 thread_affinity_clear(curthread); 2384 critical_exit(); 2385 2386 if (error != 0) { 2387 /* Communicate out any error from VMRUN() above */ 2388 goto exit; 2389 } 2390 2391 vcpu->nextrip = vme->rip + vme->inst_length; 2392 switch (vme->exitcode) { 2393 case VM_EXITCODE_REQIDLE: 2394 error = vm_handle_reqidle(vm, vcpuid); 2395 break; 2396 case VM_EXITCODE_RUN_STATE: 2397 error = vm_handle_run_state(vm, vcpuid); 2398 break; 2399 case VM_EXITCODE_SUSPENDED: 2400 error = vm_handle_suspend(vm, vcpuid); 2401 break; 2402 case VM_EXITCODE_IOAPIC_EOI: 2403 vioapic_process_eoi(vm, vcpuid, 2404 vme->u.ioapic_eoi.vector); 2405 break; 2406 case VM_EXITCODE_HLT: 2407 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); 2408 error = vm_handle_hlt(vm, vcpuid, intr_disabled); 2409 break; 2410 case VM_EXITCODE_PAGING: 2411 error = vm_handle_paging(vm, vcpuid); 2412 break; 2413 case VM_EXITCODE_MMIO_EMUL: 2414 error = vm_handle_mmio_emul(vm, vcpuid); 2415 break; 2416 case VM_EXITCODE_INOUT: 2417 error = vm_handle_inout(vm, vcpuid, vme); 2418 break; 2419 case VM_EXITCODE_INST_EMUL: 2420 error = vm_handle_inst_emul(vm, vcpuid); 2421 break; 2422 case VM_EXITCODE_MONITOR: 2423 case VM_EXITCODE_MWAIT: 2424 case VM_EXITCODE_VMINSN: 2425 vm_inject_ud(vm, vcpuid); 2426 break; 2427 case VM_EXITCODE_RDMSR: 2428 error = vm_handle_rdmsr(vm, vcpuid, vme); 2429 break; 2430 case VM_EXITCODE_WRMSR: 2431 error = vm_handle_wrmsr(vm, vcpuid, vme); 2432 break; 2433 case VM_EXITCODE_HT: 2434 affinity_type = CPU_BEST; 2435 break; 2436 case VM_EXITCODE_MTRAP: 2437 VERIFY0(vm_suspend_cpu(vm, vcpuid)); 2438 error = -1; 2439 break; 2440 default: 2441 /* handled in userland */ 2442 error = -1; 2443 break; 2444 } 2445 2446 if (error == 0) { 2447 /* VM exit conditions handled in-kernel, continue running */ 2448 goto restart; 2449 } 2450 2451 exit: 2452 kpreempt_disable(); 2453 ctxop_detach(curthread, vcpu->ctxop); 2454 /* Make sure all of the needed vCPU context state is saved */ 2455 vmm_savectx(&vcpu->vtc); 2456 kpreempt_enable(); 2457 2458 vcpu_ustate_change(vm, vcpuid, VU_EMU_USER); 2459 return (error); 2460 } 2461 2462 int 2463 vm_restart_instruction(void *arg, int vcpuid) 2464 { 2465 struct vm *vm; 2466 struct vcpu *vcpu; 2467 enum vcpu_state state; 2468 uint64_t rip; 2469 int error; 2470 2471 vm = arg; 2472 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2473 return (EINVAL); 2474 2475 vcpu = &vm->vcpu[vcpuid]; 2476 state = vcpu_get_state(vm, vcpuid, NULL); 2477 if (state == VCPU_RUNNING) { 2478 /* 2479 * When a vcpu is "running" the next instruction is determined 2480 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. 
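 * (This mirrors the 'nextrip = rip + inst_length' computation that vm_run() performs after VMRUN().)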
2481 * Thus setting 'inst_length' to zero will cause the current 2482 * instruction to be restarted. 2483 */ 2484 vcpu->exitinfo.inst_length = 0; 2485 } else if (state == VCPU_FROZEN) { 2486 /* 2487 * When a vcpu is "frozen" it is outside the critical section 2488 * around VMRUN() and 'nextrip' points to the next instruction. 2489 * Thus instruction restart is achieved by setting 'nextrip' 2490 * to the vcpu's %rip. 2491 */ 2492 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); 2493 KASSERT(!error, ("%s: error %d getting rip", __func__, error)); 2494 vcpu->nextrip = rip; 2495 } else { 2496 panic("%s: invalid state %d", __func__, state); 2497 } 2498 return (0); 2499 } 2500 2501 int 2502 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) 2503 { 2504 struct vcpu *vcpu; 2505 2506 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2507 return (EINVAL); 2508 2509 vcpu = &vm->vcpu[vcpuid]; 2510 2511 if (VM_INTINFO_PENDING(info)) { 2512 const uint32_t type = VM_INTINFO_TYPE(info); 2513 const uint8_t vector = VM_INTINFO_VECTOR(info); 2514 2515 if (type == VM_INTINFO_NMI && vector != IDT_NMI) 2516 return (EINVAL); 2517 if (type == VM_INTINFO_HWEXCP && vector >= 32) 2518 return (EINVAL); 2519 if (info & VM_INTINFO_MASK_RSVD) 2520 return (EINVAL); 2521 } else { 2522 info = 0; 2523 } 2524 vcpu->exit_intinfo = info; 2525 return (0); 2526 } 2527 2528 enum exc_class { 2529 EXC_BENIGN, 2530 EXC_CONTRIBUTORY, 2531 EXC_PAGEFAULT 2532 }; 2533 2534 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ 2535 2536 static enum exc_class 2537 exception_class(uint64_t info) 2538 { 2539 ASSERT(VM_INTINFO_PENDING(info)); 2540 2541 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ 2542 switch (VM_INTINFO_TYPE(info)) { 2543 case VM_INTINFO_HWINTR: 2544 case VM_INTINFO_SWINTR: 2545 case VM_INTINFO_NMI: 2546 return (EXC_BENIGN); 2547 default: 2548 /* 2549 * Hardware exception. 2550 * 2551 * SVM and VT-x use identical type values to represent NMI, 2552 * hardware interrupt and software interrupt. 2553 * 2554 * SVM uses type '3' for all exceptions. VT-x uses type '3' 2555 * for exceptions except #BP and #OF. #BP and #OF use a type 2556 * value of '5' or '6'. Therefore we don't check for explicit 2557 * values of 'type' to classify 'intinfo' into a hardware 2558 * exception. 2559 */ 2560 break; 2561 } 2562 2563 switch (VM_INTINFO_VECTOR(info)) { 2564 case IDT_PF: 2565 case IDT_VE: 2566 return (EXC_PAGEFAULT); 2567 case IDT_DE: 2568 case IDT_TS: 2569 case IDT_NP: 2570 case IDT_SS: 2571 case IDT_GP: 2572 return (EXC_CONTRIBUTORY); 2573 default: 2574 return (EXC_BENIGN); 2575 } 2576 } 2577 2578 /* 2579 * Fetch event pending injection into the guest, if one exists. 2580 * 2581 * Returns true if an event is to be injected (which is placed in `retinfo`). 2582 */ 2583 bool 2584 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) 2585 { 2586 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2587 const uint64_t info1 = vcpu->exit_intinfo; 2588 vcpu->exit_intinfo = 0; 2589 const uint64_t info2 = vcpu->exc_pending; 2590 vcpu->exc_pending = 0; 2591 2592 if (VM_INTINFO_PENDING(info1) && VM_INTINFO_PENDING(info2)) { 2593 /* 2594 * If an exception occurs while attempting to call the 2595 * double-fault handler the processor enters shutdown mode 2596 * (aka triple fault). 
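 * For example, a #GP taken while vectoring an already-pending #DF has no recoverable outcome; VM_SUSPEND_TRIPLEFAULT below models that shutdown.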
2597 */ 2598 if (VM_INTINFO_TYPE(info1) == VM_INTINFO_HWEXCP && 2599 VM_INTINFO_VECTOR(info1) == IDT_DF) { 2600 (void) vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); 2601 *retinfo = 0; 2602 return (false); 2603 } 2604 /* 2605 * "Conditions for Generating a Double Fault" 2606 * Intel SDM, Vol3, Table 6-5 2607 */ 2608 const enum exc_class exc1 = exception_class(info1); 2609 const enum exc_class exc2 = exception_class(info2); 2610 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || 2611 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { 2612 /* Convert nested fault into a double fault. */ 2613 *retinfo = 2614 VM_INTINFO_VALID | 2615 VM_INTINFO_DEL_ERRCODE | 2616 VM_INTINFO_HWEXCP | 2617 IDT_DF; 2618 } else { 2619 /* Handle exceptions serially */ 2620 vcpu->exit_intinfo = info1; 2621 *retinfo = info2; 2622 } 2623 return (true); 2624 } else if (VM_INTINFO_PENDING(info1)) { 2625 *retinfo = info1; 2626 return (true); 2627 } else if (VM_INTINFO_PENDING(info2)) { 2628 *retinfo = info2; 2629 return (true); 2630 } 2631 2632 return (false); 2633 } 2634 2635 int 2636 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) 2637 { 2638 struct vcpu *vcpu; 2639 2640 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2641 return (EINVAL); 2642 2643 vcpu = &vm->vcpu[vcpuid]; 2644 *info1 = vcpu->exit_intinfo; 2645 *info2 = vcpu->exc_pending; 2646 return (0); 2647 } 2648 2649 int 2650 vm_inject_exception(struct vm *vm, int vcpuid, uint8_t vector, 2651 bool errcode_valid, uint32_t errcode, bool restart_instruction) 2652 { 2653 struct vcpu *vcpu; 2654 uint64_t regval; 2655 int error; 2656 2657 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2658 return (EINVAL); 2659 2660 if (vector >= 32) 2661 return (EINVAL); 2662 2663 /* 2664 * NMIs are to be injected via their own specialized path using 2665 * vm_inject_nmi(). 2666 */ 2667 if (vector == IDT_NMI) { 2668 return (EINVAL); 2669 } 2670 2671 /* 2672 * A double fault exception should never be injected directly into 2673 * the guest. It is a derived exception that results from specific 2674 * combinations of nested faults. 2675 */ 2676 if (vector == IDT_DF) { 2677 return (EINVAL); 2678 } 2679 2680 vcpu = &vm->vcpu[vcpuid]; 2681 2682 if (VM_INTINFO_PENDING(vcpu->exc_pending)) { 2683 /* Unable to inject exception due to one already pending */ 2684 return (EBUSY); 2685 } 2686 2687 if (errcode_valid) { 2688 /* 2689 * Exceptions don't deliver an error code in real mode. 2690 */ 2691 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval); 2692 VERIFY0(error); 2693 if ((regval & CR0_PE) == 0) { 2694 errcode_valid = false; 2695 } 2696 } 2697 2698 /* 2699 * From section 26.6.1 "Interruptibility State" in Intel SDM: 2700 * 2701 * Event blocking by "STI" or "MOV SS" is cleared after guest executes 2702 * one instruction or incurs an exception.
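 * (Hence the interrupt shadow is cleared below before the exception is queued in 'exc_pending'.)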
2703 */ 2704 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); 2705 VERIFY0(error); 2706 2707 if (restart_instruction) { 2708 VERIFY0(vm_restart_instruction(vm, vcpuid)); 2709 } 2710 2711 uint64_t val = VM_INTINFO_VALID | VM_INTINFO_HWEXCP | vector; 2712 if (errcode_valid) { 2713 val |= VM_INTINFO_DEL_ERRCODE; 2714 val |= (uint64_t)errcode << VM_INTINFO_SHIFT_ERRCODE; 2715 } 2716 vcpu->exc_pending = val; 2717 return (0); 2718 } 2719 2720 void 2721 vm_inject_ud(struct vm *vm, int vcpuid) 2722 { 2723 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_UD, false, 0, true)); 2724 } 2725 2726 void 2727 vm_inject_gp(struct vm *vm, int vcpuid) 2728 { 2729 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_GP, true, 0, true)); 2730 } 2731 2732 void 2733 vm_inject_ac(struct vm *vm, int vcpuid, uint32_t errcode) 2734 { 2735 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_AC, true, errcode, true)); 2736 } 2737 2738 void 2739 vm_inject_ss(struct vm *vm, int vcpuid, uint32_t errcode) 2740 { 2741 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_SS, true, errcode, true)); 2742 } 2743 2744 void 2745 vm_inject_pf(struct vm *vm, int vcpuid, uint32_t errcode, uint64_t cr2) 2746 { 2747 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2)); 2748 VERIFY0(vm_inject_exception(vm, vcpuid, IDT_PF, true, errcode, true)); 2749 } 2750 2751 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); 2752 2753 int 2754 vm_inject_nmi(struct vm *vm, int vcpuid) 2755 { 2756 struct vcpu *vcpu; 2757 2758 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2759 return (EINVAL); 2760 2761 vcpu = &vm->vcpu[vcpuid]; 2762 2763 vcpu->nmi_pending = true; 2764 vcpu_notify_event(vm, vcpuid); 2765 return (0); 2766 } 2767 2768 bool 2769 vm_nmi_pending(struct vm *vm, int vcpuid) 2770 { 2771 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2772 2773 return (vcpu->nmi_pending); 2774 } 2775 2776 void 2777 vm_nmi_clear(struct vm *vm, int vcpuid) 2778 { 2779 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2780 2781 ASSERT(vcpu->nmi_pending); 2782 2783 vcpu->nmi_pending = false; 2784 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); 2785 } 2786 2787 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); 2788 2789 int 2790 vm_inject_extint(struct vm *vm, int vcpuid) 2791 { 2792 struct vcpu *vcpu; 2793 2794 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2795 return (EINVAL); 2796 2797 vcpu = &vm->vcpu[vcpuid]; 2798 2799 vcpu->extint_pending = true; 2800 vcpu_notify_event(vm, vcpuid); 2801 return (0); 2802 } 2803 2804 bool 2805 vm_extint_pending(struct vm *vm, int vcpuid) 2806 { 2807 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2808 2809 return (vcpu->extint_pending); 2810 } 2811 2812 void 2813 vm_extint_clear(struct vm *vm, int vcpuid) 2814 { 2815 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2816 2817 ASSERT(vcpu->extint_pending); 2818 2819 vcpu->extint_pending = false; 2820 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); 2821 } 2822 2823 int 2824 vm_inject_init(struct vm *vm, int vcpuid) 2825 { 2826 struct vcpu *vcpu; 2827 2828 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2829 return (EINVAL); 2830 2831 vcpu = &vm->vcpu[vcpuid]; 2832 vcpu_lock(vcpu); 2833 vcpu->run_state |= VRS_PEND_INIT; 2834 /* 2835 * As part of queuing the INIT request, clear any pending SIPI. It 2836 * would not otherwise survive across the reset of the vCPU when it 2837 * undergoes the requested INIT. We would not want it to linger when it 2838 * could be mistaken as a subsequent (after the INIT) SIPI request. 
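 * (In the conventional INIT-SIPI-SIPI start-up sequence, only a SIPI arriving after the INIT has been serviced should take effect.)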
2839 */ 2840 vcpu->run_state &= ~VRS_PEND_SIPI; 2841 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2842 2843 vcpu_unlock(vcpu); 2844 return (0); 2845 } 2846 2847 int 2848 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector) 2849 { 2850 struct vcpu *vcpu; 2851 2852 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2853 return (EINVAL); 2854 2855 vcpu = &vm->vcpu[vcpuid]; 2856 vcpu_lock(vcpu); 2857 vcpu->run_state |= VRS_PEND_SIPI; 2858 vcpu->sipi_vector = vector; 2859 /* SIPI is only actionable if the CPU is waiting in INIT state */ 2860 if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) { 2861 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2862 } 2863 vcpu_unlock(vcpu); 2864 return (0); 2865 } 2866 2867 bool 2868 vcpu_run_state_pending(struct vm *vm, int vcpuid) 2869 { 2870 struct vcpu *vcpu; 2871 2872 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 2873 vcpu = &vm->vcpu[vcpuid]; 2874 2875 /* Of interest: vCPU not in running state or with pending INIT */ 2876 return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN); 2877 } 2878 2879 int 2880 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only) 2881 { 2882 struct seg_desc desc; 2883 const enum vm_reg_name clear_regs[] = { 2884 VM_REG_GUEST_CR2, 2885 VM_REG_GUEST_CR3, 2886 VM_REG_GUEST_CR4, 2887 VM_REG_GUEST_RAX, 2888 VM_REG_GUEST_RBX, 2889 VM_REG_GUEST_RCX, 2890 VM_REG_GUEST_RSI, 2891 VM_REG_GUEST_RDI, 2892 VM_REG_GUEST_RBP, 2893 VM_REG_GUEST_RSP, 2894 VM_REG_GUEST_R8, 2895 VM_REG_GUEST_R9, 2896 VM_REG_GUEST_R10, 2897 VM_REG_GUEST_R11, 2898 VM_REG_GUEST_R12, 2899 VM_REG_GUEST_R13, 2900 VM_REG_GUEST_R14, 2901 VM_REG_GUEST_R15, 2902 VM_REG_GUEST_DR0, 2903 VM_REG_GUEST_DR1, 2904 VM_REG_GUEST_DR2, 2905 VM_REG_GUEST_DR3, 2906 VM_REG_GUEST_EFER, 2907 }; 2908 const enum vm_reg_name data_segs[] = { 2909 VM_REG_GUEST_SS, 2910 VM_REG_GUEST_DS, 2911 VM_REG_GUEST_ES, 2912 VM_REG_GUEST_FS, 2913 VM_REG_GUEST_GS, 2914 }; 2915 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2916 2917 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2918 return (EINVAL); 2919 2920 for (uint_t i = 0; i < nitems(clear_regs); i++) { 2921 VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0)); 2922 } 2923 2924 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2)); 2925 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0)); 2926 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010)); 2927 2928 /* 2929 * The prescribed contents of %rdx differ slightly between the Intel and 2930 * AMD architectural definitions. The former expects the Extended Model 2931 * in bits 16-19 where the latter expects all the Family, Model, and 2932 * Stepping to be there. Common boot ROMs appear to disregard this 2933 * anyway, so we stick with a compromise value similar to what is 2934 * spelled out in the Intel SDM.
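 * (0x600 decodes as Family 6, Model 0, Stepping 0.)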
2935 */ 2936 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600)); 2937 2938 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0)); 2939 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400)); 2940 2941 /* CS: Present, R/W, Accessed */ 2942 desc.access = 0x0093; 2943 desc.base = 0xffff0000; 2944 desc.limit = 0xffff; 2945 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); 2946 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000)); 2947 2948 /* SS, DS, ES, FS, GS: Present, R/W, Accessed */ 2949 desc.access = 0x0093; 2950 desc.base = 0; 2951 desc.limit = 0xffff; 2952 for (uint_t i = 0; i < nitems(data_segs); i++) { 2953 VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc)); 2954 VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0)); 2955 } 2956 2957 /* GDTR, IDTR */ 2958 desc.base = 0; 2959 desc.limit = 0xffff; 2960 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc)); 2961 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc)); 2962 2963 /* LDTR: Present, LDT */ 2964 desc.access = 0x0082; 2965 desc.base = 0; 2966 desc.limit = 0xffff; 2967 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc)); 2968 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0)); 2969 2970 /* TR: Present, 32-bit TSS */ 2971 desc.access = 0x008b; 2972 desc.base = 0; 2973 desc.limit = 0xffff; 2974 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc)); 2975 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0)); 2976 2977 vlapic_reset(vm_lapic(vm, vcpuid)); 2978 2979 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0)); 2980 2981 vcpu->exit_intinfo = 0; 2982 vcpu->exc_pending = 0; 2983 vcpu->nmi_pending = false; 2984 vcpu->extint_pending = false; 2985 2986 /* 2987 * A CPU reset caused by power-on or system reset clears more state than 2988 * one which is triggered from an INIT IPI.
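 * For example, the guest %xcr0 and FPU contents survive an INIT but are reinitialized below on a full reset, per the !init_only check.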
2989 */ 2990 if (!init_only) { 2991 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; 2992 (void) hma_fpu_init(vcpu->guestfpu); 2993 2994 /* XXX: clear MSRs and other pieces */ 2995 bzero(&vcpu->mtrr, sizeof (vcpu->mtrr)); 2996 } 2997 2998 return (0); 2999 } 3000 3001 static int 3002 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector) 3003 { 3004 struct seg_desc desc; 3005 3006 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3007 return (EINVAL); 3008 3009 /* CS: Present, R/W, Accessed */ 3010 desc.access = 0x0093; 3011 desc.base = (uint64_t)vector << 12; 3012 desc.limit = 0xffff; 3013 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); 3014 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 3015 (uint64_t)vector << 8)); 3016 3017 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0)); 3018 3019 return (0); 3020 } 3021 3022 int 3023 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) 3024 { 3025 if (vcpu < 0 || vcpu >= vm->maxcpus) 3026 return (EINVAL); 3027 3028 if (type < 0 || type >= VM_CAP_MAX) 3029 return (EINVAL); 3030 3031 return (VMGETCAP(vm->cookie, vcpu, type, retval)); 3032 } 3033 3034 int 3035 vm_set_capability(struct vm *vm, int vcpu, int type, int val) 3036 { 3037 if (vcpu < 0 || vcpu >= vm->maxcpus) 3038 return (EINVAL); 3039 3040 if (type < 0 || type >= VM_CAP_MAX) 3041 return (EINVAL); 3042 3043 return (VMSETCAP(vm->cookie, vcpu, type, val)); 3044 } 3045 3046 struct vlapic * 3047 vm_lapic(struct vm *vm, int cpu) 3048 { 3049 ASSERT3S(cpu, >=, 0); 3050 ASSERT3S(cpu, <, VM_MAXCPU); 3051 3052 return (vm->vcpu[cpu].vlapic); 3053 } 3054 3055 struct vioapic * 3056 vm_ioapic(struct vm *vm) 3057 { 3058 3059 return (vm->vioapic); 3060 } 3061 3062 struct vhpet * 3063 vm_hpet(struct vm *vm) 3064 { 3065 3066 return (vm->vhpet); 3067 } 3068 3069 void * 3070 vm_iommu_domain(struct vm *vm) 3071 { 3072 3073 return (vm->iommu); 3074 } 3075 3076 int 3077 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, 3078 bool from_idle) 3079 { 3080 int error; 3081 struct vcpu *vcpu; 3082 3083 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3084 panic("vcpu_set_state: invalid vcpuid %d", vcpuid); 3085 3086 vcpu = &vm->vcpu[vcpuid]; 3087 3088 vcpu_lock(vcpu); 3089 error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle); 3090 vcpu_unlock(vcpu); 3091 3092 return (error); 3093 } 3094 3095 enum vcpu_state 3096 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) 3097 { 3098 struct vcpu *vcpu; 3099 enum vcpu_state state; 3100 3101 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3102 panic("vcpu_get_state: invalid vcpuid %d", vcpuid); 3103 3104 vcpu = &vm->vcpu[vcpuid]; 3105 3106 vcpu_lock(vcpu); 3107 state = vcpu->state; 3108 if (hostcpu != NULL) 3109 *hostcpu = vcpu->hostcpu; 3110 vcpu_unlock(vcpu); 3111 3112 return (state); 3113 } 3114 3115 uint64_t 3116 vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj) 3117 { 3118 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 3119 3120 uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset; 3121 3122 if (phys_adj) { 3123 /* Include any offset for the current physical CPU too */ 3124 extern hrtime_t tsc_gethrtime_tick_delta(void); 3125 vcpu_off += (uint64_t)tsc_gethrtime_tick_delta(); 3126 } 3127 3128 return (vcpu_off); 3129 } 3130 3131 /* Normalize hrtime against the boot time for a VM */ 3132 hrtime_t 3133 vm_normalize_hrtime(struct vm *vm, hrtime_t hrt) 3134 { 3135 /* To avoid underflow/overflow UB, perform math as unsigned */ 3136 return ((hrtime_t)((uint64_t)hrt - (uint64_t)vm->boot_hrtime)); 3137 } 3138 3139 /* 
Denormalize hrtime against the boot time for a VM */ 3140 hrtime_t 3141 vm_denormalize_hrtime(struct vm *vm, hrtime_t hrt) 3142 { 3143 /* To avoid underflow/overflow UB, perform math as unsigned */ 3144 return ((hrtime_t)((uint64_t)hrt + (uint64_t)vm->boot_hrtime)); 3145 } 3146 3147 int 3148 vm_activate_cpu(struct vm *vm, int vcpuid) 3149 { 3150 3151 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3152 return (EINVAL); 3153 3154 if (CPU_ISSET(vcpuid, &vm->active_cpus)) 3155 return (EBUSY); 3156 3157 if (vm->suspend != 0) { 3158 return (EBUSY); 3159 } 3160 3161 CPU_SET_ATOMIC(vcpuid, &vm->active_cpus); 3162 3163 /* 3164 * It is possible that this vCPU was undergoing activation at the same 3165 * time that the VM was being suspended. If that happens to be the 3166 * case, it should reflect the suspended state immediately. 3167 */ 3168 if (atomic_load_acq_int((uint_t *)&vm->suspend) != 0) { 3169 CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); 3170 } 3171 3172 return (0); 3173 } 3174 3175 int 3176 vm_suspend_cpu(struct vm *vm, int vcpuid) 3177 { 3178 int i; 3179 3180 if (vcpuid < -1 || vcpuid >= vm->maxcpus) 3181 return (EINVAL); 3182 3183 if (vcpuid == -1) { 3184 vm->debug_cpus = vm->active_cpus; 3185 for (i = 0; i < vm->maxcpus; i++) { 3186 if (CPU_ISSET(i, &vm->active_cpus)) 3187 vcpu_notify_event(vm, i); 3188 } 3189 } else { 3190 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 3191 return (EINVAL); 3192 3193 CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus); 3194 vcpu_notify_event(vm, vcpuid); 3195 } 3196 return (0); 3197 } 3198 3199 int 3200 vm_resume_cpu(struct vm *vm, int vcpuid) 3201 { 3202 3203 if (vcpuid < -1 || vcpuid >= vm->maxcpus) 3204 return (EINVAL); 3205 3206 if (vcpuid == -1) { 3207 CPU_ZERO(&vm->debug_cpus); 3208 } else { 3209 if (!CPU_ISSET(vcpuid, &vm->debug_cpus)) 3210 return (EINVAL); 3211 3212 CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus); 3213 } 3214 return (0); 3215 } 3216 3217 static bool 3218 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry, 3219 uint64_t entry_rip) 3220 { 3221 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3222 struct vm_exit *vme = &vcpu->exitinfo; 3223 bool bail = false; 3224 3225 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 3226 3227 if (vm->suspend) { 3228 if (on_entry) { 3229 VERIFY(vm->suspend > VM_SUSPEND_NONE && 3230 vm->suspend < VM_SUSPEND_LAST); 3231 3232 vme->exitcode = VM_EXITCODE_SUSPENDED; 3233 vme->u.suspended.how = vm->suspend; 3234 } else { 3235 /* 3236 * Handling VM suspend is complicated, so if that 3237 * condition is detected outside of VM-entry itself, 3238 * just emit a BOGUS exitcode so we take a lap to pick 3239 * up the event during an entry and are directed into 3240 * the vm_handle_suspend() logic. 3241 */ 3242 vme->exitcode = VM_EXITCODE_BOGUS; 3243 } 3244 bail = true; 3245 } 3246 if (vcpu->reqidle) { 3247 vme->exitcode = VM_EXITCODE_REQIDLE; 3248 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1); 3249 3250 if (!on_entry) { 3251 /* 3252 * A reqidle request detected outside of VM-entry can be 3253 * handled directly by clearing the request (and taking 3254 * a lap to userspace). 
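 * (The exitcode recorded above remains VM_EXITCODE_REQIDLE, so the consumed request is still reported outward.)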
3255 */ 3256 vcpu_assert_locked(vcpu); 3257 vcpu->reqidle = 0; 3258 } 3259 bail = true; 3260 } 3261 if (vcpu_should_yield(vm, vcpuid)) { 3262 vme->exitcode = VM_EXITCODE_BOGUS; 3263 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); 3264 bail = true; 3265 } 3266 if (CPU_ISSET(vcpuid, &vm->debug_cpus)) { 3267 vme->exitcode = VM_EXITCODE_DEBUG; 3268 bail = true; 3269 } 3270 3271 if (bail) { 3272 if (on_entry) { 3273 /* 3274 * If bailing out during VM-entry, the current %rip must 3275 * be recorded in the exitinfo. 3276 */ 3277 vme->rip = entry_rip; 3278 } 3279 vme->inst_length = 0; 3280 } 3281 return (bail); 3282 } 3283 3284 static bool 3285 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid) 3286 { 3287 /* 3288 * Bail-out checks done prior to sleeping (in vCPU contexts like HLT or 3289 * wait-for-SIPI) expect that %rip is already populated in the vm_exit 3290 * structure, and we would only modify the exitcode. 3291 */ 3292 return (vcpu_bailout_checks(vm, vcpuid, false, 0)); 3293 } 3294 3295 bool 3296 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip) 3297 { 3298 /* 3299 * Bail-out checks done as part of VM entry require an updated %rip to 3300 * populate the vm_exit struct if any of the conditions of interest are 3301 * matched in the check. 3302 */ 3303 return (vcpu_bailout_checks(vm, vcpuid, true, rip)); 3304 } 3305 3306 cpuset_t 3307 vm_active_cpus(struct vm *vm) 3308 { 3309 3310 return (vm->active_cpus); 3311 } 3312 3313 cpuset_t 3314 vm_debug_cpus(struct vm *vm) 3315 { 3316 3317 return (vm->debug_cpus); 3318 } 3319 3320 cpuset_t 3321 vm_suspended_cpus(struct vm *vm) 3322 { 3323 3324 return (vm->suspended_cpus); 3325 } 3326 3327 void * 3328 vcpu_stats(struct vm *vm, int vcpuid) 3329 { 3330 3331 return (vm->vcpu[vcpuid].stats); 3332 } 3333 3334 int 3335 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) 3336 { 3337 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3338 return (EINVAL); 3339 3340 *state = vm->vcpu[vcpuid].x2apic_state; 3341 3342 return (0); 3343 } 3344 3345 int 3346 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) 3347 { 3348 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3349 return (EINVAL); 3350 3351 if (state >= X2APIC_STATE_LAST) 3352 return (EINVAL); 3353 3354 vm->vcpu[vcpuid].x2apic_state = state; 3355 3356 vlapic_set_x2apic_state(vm, vcpuid, state); 3357 3358 return (0); 3359 } 3360 3361 /* 3362 * This function is called to ensure that a vcpu "sees" a pending event 3363 * as soon as possible: 3364 * - If the vcpu thread is sleeping then it is woken up. 3365 * - If the vcpu is running on a different host_cpu then an IPI will be directed 3366 * to the host_cpu to cause the vcpu to trap into the hypervisor. 3367 */ 3368 static void 3369 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype) 3370 { 3371 int hostcpu; 3372 3373 ASSERT(ntype == VCPU_NOTIFY_APIC || ntype == VCPU_NOTIFY_EXIT); 3374 3375 hostcpu = vcpu->hostcpu; 3376 if (vcpu->state == VCPU_RUNNING) { 3377 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); 3378 if (hostcpu != curcpu) { 3379 if (ntype == VCPU_NOTIFY_APIC) { 3380 vlapic_post_intr(vcpu->vlapic, hostcpu); 3381 } else { 3382 poke_cpu(hostcpu); 3383 } 3384 } else { 3385 /* 3386 * If the 'vcpu' is running on 'curcpu' then it must 3387 * be sending a notification to itself (e.g. SELF_IPI). 3388 * The pending event will be picked up when the vcpu 3389 * transitions back to guest context.
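 * (Poking curcpu would merely interrupt this same thread, so no IPI is sent.)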
3390 */ 3391 } 3392 } else { 3393 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " 3394 "with hostcpu %d", vcpu->state, hostcpu)); 3395 if (vcpu->state == VCPU_SLEEPING) { 3396 cv_signal(&vcpu->vcpu_cv); 3397 } 3398 } 3399 } 3400 3401 void 3402 vcpu_notify_event(struct vm *vm, int vcpuid) 3403 { 3404 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3405 3406 vcpu_lock(vcpu); 3407 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 3408 vcpu_unlock(vcpu); 3409 } 3410 3411 void 3412 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype) 3413 { 3414 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3415 3416 if (ntype == VCPU_NOTIFY_NONE) { 3417 return; 3418 } 3419 3420 vcpu_lock(vcpu); 3421 vcpu_notify_event_locked(vcpu, ntype); 3422 vcpu_unlock(vcpu); 3423 } 3424 3425 void 3426 vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate) 3427 { 3428 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3429 hrtime_t now = gethrtime(); 3430 3431 ASSERT3U(ustate, !=, vcpu->ustate); 3432 ASSERT3S(ustate, <, VU_MAX); 3433 ASSERT3S(ustate, >=, VU_INIT); 3434 3435 hrtime_t delta = now - vcpu->ustate_when; 3436 vcpu->ustate_total[vcpu->ustate] += delta; 3437 3438 membar_producer(); 3439 3440 vcpu->ustate_when = now; 3441 vcpu->ustate = ustate; 3442 } 3443 3444 struct vmspace * 3445 vm_get_vmspace(struct vm *vm) 3446 { 3447 3448 return (vm->vmspace); 3449 } 3450 3451 struct vm_client * 3452 vm_get_vmclient(struct vm *vm, int vcpuid) 3453 { 3454 return (vm->vcpu[vcpuid].vmclient); 3455 } 3456 3457 int 3458 vm_apicid2vcpuid(struct vm *vm, int apicid) 3459 { 3460 /* 3461 * XXX apic id is assumed to be numerically identical to vcpu id 3462 */ 3463 return (apicid); 3464 } 3465 3466 struct vatpic * 3467 vm_atpic(struct vm *vm) 3468 { 3469 return (vm->vatpic); 3470 } 3471 3472 struct vatpit * 3473 vm_atpit(struct vm *vm) 3474 { 3475 return (vm->vatpit); 3476 } 3477 3478 struct vpmtmr * 3479 vm_pmtmr(struct vm *vm) 3480 { 3481 3482 return (vm->vpmtmr); 3483 } 3484 3485 struct vrtc * 3486 vm_rtc(struct vm *vm) 3487 { 3488 3489 return (vm->vrtc); 3490 } 3491 3492 enum vm_reg_name 3493 vm_segment_name(int seg) 3494 { 3495 static enum vm_reg_name seg_names[] = { 3496 VM_REG_GUEST_ES, 3497 VM_REG_GUEST_CS, 3498 VM_REG_GUEST_SS, 3499 VM_REG_GUEST_DS, 3500 VM_REG_GUEST_FS, 3501 VM_REG_GUEST_GS 3502 }; 3503 3504 KASSERT(seg >= 0 && seg < nitems(seg_names), 3505 ("%s: invalid segment encoding %d", __func__, seg)); 3506 return (seg_names[seg]); 3507 } 3508 3509 void 3510 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, 3511 uint_t num_copyinfo) 3512 { 3513 for (uint_t idx = 0; idx < num_copyinfo; idx++) { 3514 if (copyinfo[idx].cookie != NULL) { 3515 (void) vmp_release((vm_page_t *)copyinfo[idx].cookie); 3516 } 3517 } 3518 bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo)); 3519 } 3520 3521 int 3522 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 3523 uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, 3524 uint_t num_copyinfo, int *fault) 3525 { 3526 uint_t idx, nused; 3527 size_t n, off, remaining; 3528 vm_client_t *vmc = vm_get_vmclient(vm, vcpuid); 3529 3530 bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo); 3531 3532 nused = 0; 3533 remaining = len; 3534 while (remaining > 0) { 3535 uint64_t gpa; 3536 int error; 3537 3538 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); 3539 error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault); 3540 if (error || *fault) 3541 return (error); 3542 off = gpa & PAGEOFFSET; 3543 n 
= min(remaining, PAGESIZE - off); 3544 copyinfo[nused].gpa = gpa; 3545 copyinfo[nused].len = n; 3546 remaining -= n; 3547 gla += n; 3548 nused++; 3549 } 3550 3551 for (idx = 0; idx < nused; idx++) { 3552 vm_page_t *vmp; 3553 caddr_t hva; 3554 3555 vmp = vmc_hold(vmc, copyinfo[idx].gpa & PAGEMASK, prot); 3556 if (vmp == NULL) { 3557 break; 3558 } 3559 if ((prot & PROT_WRITE) != 0) { 3560 hva = (caddr_t)vmp_get_writable(vmp); 3561 } else { 3562 hva = (caddr_t)vmp_get_readable(vmp); 3563 } 3564 copyinfo[idx].hva = hva + (copyinfo[idx].gpa & PAGEOFFSET); 3565 copyinfo[idx].cookie = vmp; 3566 copyinfo[idx].prot = prot; 3567 } 3568 3569 if (idx != nused) { 3570 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); 3571 return (EFAULT); 3572 } else { 3573 *fault = 0; 3574 return (0); 3575 } 3576 } 3577 3578 void 3579 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, 3580 size_t len) 3581 { 3582 char *dst; 3583 int idx; 3584 3585 dst = kaddr; 3586 idx = 0; 3587 while (len > 0) { 3588 ASSERT(copyinfo[idx].prot & PROT_READ); 3589 3590 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len); 3591 len -= copyinfo[idx].len; 3592 dst += copyinfo[idx].len; 3593 idx++; 3594 } 3595 } 3596 3597 void 3598 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, 3599 struct vm_copyinfo *copyinfo, size_t len) 3600 { 3601 const char *src; 3602 int idx; 3603 3604 src = kaddr; 3605 idx = 0; 3606 while (len > 0) { 3607 ASSERT(copyinfo[idx].prot & PROT_WRITE); 3608 3609 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len); 3610 len -= copyinfo[idx].len; 3611 src += copyinfo[idx].len; 3612 idx++; 3613 } 3614 } 3615 3616 /* 3617 * Return the amount of in-use and wired memory for the VM. Since 3618 * these are global stats, only return the values for vCPU 0 3619 */ 3620 VMM_STAT_DECLARE(VMM_MEM_RESIDENT); 3621 3622 static void 3623 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) 3624 { 3625 if (vcpu == 0) { 3626 vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT, 3627 PAGE_SIZE * vmspace_resident_count(vm->vmspace)); 3628 } 3629 } 3630 3631 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); 3632 3633 int 3634 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port, 3635 uint8_t bytes, uint32_t *val) 3636 { 3637 return (vm_inout_access(&vm->ioports, in, port, bytes, val)); 3638 } 3639 3640 /* 3641 * bhyve-internal interfaces to attach or detach IO port handlers. 3642 * Must be called with VM write lock held for safety. 3643 */ 3644 int 3645 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg, 3646 void **cookie) 3647 { 3648 int err; 3649 err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg); 3650 if (err == 0) { 3651 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port); 3652 } 3653 return (err); 3654 } 3655 int 3656 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func, 3657 void **old_arg) 3658 { 3659 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie); 3660 int err; 3661 3662 err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg); 3663 if (err == 0) { 3664 *cookie = NULL; 3665 } 3666 return (err); 3667 } 3668 3669 /* 3670 * External driver interfaces to attach or detach IO port handlers. 3671 * Must be called with VM write lock held for safety.
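 * Drivers pair vm_ioport_hook() with vm_ioport_unhook(), identifying a registration by the opaque cookie generated at attach time.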
3672 */ 3673 int 3674 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func, 3675 void *arg, void **cookie) 3676 { 3677 int err; 3678 3679 if (port == 0) { 3680 return (EINVAL); 3681 } 3682 3683 err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg); 3684 if (err == 0) { 3685 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port); 3686 } 3687 return (err); 3688 } 3689 void 3690 vm_ioport_unhook(struct vm *vm, void **cookie) 3691 { 3692 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie); 3693 ioport_handler_t old_func; 3694 void *old_arg; 3695 int err; 3696 3697 err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg); 3698 3699 /* ioport-hook-using drivers are expected to be well-behaved */ 3700 VERIFY0(err); 3701 VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie); 3702 3703 *cookie = NULL; 3704 } 3705 3706 int 3707 vmm_kstat_update_vcpu(struct kstat *ksp, int rw) 3708 { 3709 struct vm *vm = ksp->ks_private; 3710 vmm_vcpu_kstats_t *vvk = ksp->ks_data; 3711 const int vcpuid = vvk->vvk_vcpu.value.ui32; 3712 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3713 3714 ASSERT3U(vcpuid, <, VM_MAXCPU); 3715 3716 vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT]; 3717 vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN]; 3718 vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE]; 3719 vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN]; 3720 vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER]; 3721 vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED]; 3722 3723 return (0); 3724 } 3725 3726 SET_DECLARE(vmm_data_version_entries, const vmm_data_version_entry_t); 3727 3728 static inline bool 3729 vmm_data_is_cpu_specific(uint16_t data_class) 3730 { 3731 switch (data_class) { 3732 case VDC_REGISTER: 3733 case VDC_MSR: 3734 case VDC_FPU: 3735 case VDC_LAPIC: 3736 return (true); 3737 default: 3738 return (false); 3739 } 3740 } 3741 3742 static int 3743 vmm_data_find(const vmm_data_req_t *req, const vmm_data_version_entry_t **resp) 3744 { 3745 const vmm_data_version_entry_t **vdpp, *vdp; 3746 3747 ASSERT(resp != NULL); 3748 ASSERT(req->vdr_result_len != NULL); 3749 3750 SET_FOREACH(vdpp, vmm_data_version_entries) { 3751 vdp = *vdpp; 3752 if (vdp->vdve_class == req->vdr_class && 3753 vdp->vdve_version == req->vdr_version) { 3754 /* 3755 * Enforce any data length expectation expressed by the 3756 * provider for this data. 3757 */ 3758 if (vdp->vdve_len_expect != 0 && 3759 vdp->vdve_len_expect > req->vdr_len) { 3760 *req->vdr_result_len = vdp->vdve_len_expect; 3761 return (ENOSPC); 3762 } 3763 *resp = vdp; 3764 return (0); 3765 } 3766 } 3767 return (EINVAL); 3768 } 3769 3770 static void * 3771 vmm_data_from_class(const vmm_data_req_t *req, struct vm *vm, int vcpuid) 3772 { 3773 switch (req->vdr_class) { 3774 /* per-cpu data/devices */ 3775 case VDC_LAPIC: 3776 return (vm_lapic(vm, vcpuid)); 3777 case VDC_VMM_ARCH: 3778 return (vm); 3779 3780 case VDC_FPU: 3781 case VDC_REGISTER: 3782 case VDC_MSR: 3783 /* 3784 * These have per-CPU handling which is dispatched outside 3785 * vmm_data_version_entries listing. 
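 * (vmm_data_read() and vmm_data_write() below dispatch VDC_MSR, and eventually VDC_FPU, by hand when vmm_data_from_class() yields no payload.)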
3786 */ 3787 return (NULL); 3788 3789 /* system-wide data/devices */ 3790 case VDC_IOAPIC: 3791 return (vm->vioapic); 3792 case VDC_ATPIT: 3793 return (vm->vatpit); 3794 case VDC_ATPIC: 3795 return (vm->vatpic); 3796 case VDC_HPET: 3797 return (vm->vhpet); 3798 case VDC_PM_TIMER: 3799 return (vm->vpmtmr); 3800 case VDC_RTC: 3801 return (vm->vrtc); 3802 3803 default: 3804 /* The data class will have been validated by now */ 3805 panic("Unexpected class %u", req->vdr_class); 3806 } 3807 } 3808 3809 const uint32_t arch_msr_iter[] = { 3810 MSR_EFER, 3811 3812 /* 3813 * While gsbase and fsbase are accessible via the MSR accessors, they 3814 * are not included in MSR iteration since they are covered by the 3815 * segment descriptor interface too. 3816 */ 3817 MSR_KGSBASE, 3818 3819 MSR_STAR, 3820 MSR_LSTAR, 3821 MSR_CSTAR, 3822 MSR_SF_MASK, 3823 3824 MSR_SYSENTER_CS_MSR, 3825 MSR_SYSENTER_ESP_MSR, 3826 MSR_SYSENTER_EIP_MSR, 3827 MSR_PAT, 3828 }; 3829 const uint32_t generic_msr_iter[] = { 3830 MSR_TSC, 3831 MSR_MTRRcap, 3832 MSR_MTRRdefType, 3833 3834 MSR_MTRR4kBase, MSR_MTRR4kBase + 1, MSR_MTRR4kBase + 2, 3835 MSR_MTRR4kBase + 3, MSR_MTRR4kBase + 4, MSR_MTRR4kBase + 5, 3836 MSR_MTRR4kBase + 6, MSR_MTRR4kBase + 7, 3837 3838 MSR_MTRR16kBase, MSR_MTRR16kBase + 1, 3839 3840 MSR_MTRR64kBase, 3841 }; 3842 3843 static int 3844 vmm_data_read_msrs(struct vm *vm, int vcpuid, const vmm_data_req_t *req) 3845 { 3846 VERIFY3U(req->vdr_class, ==, VDC_MSR); 3847 VERIFY3U(req->vdr_version, ==, 1); 3848 3849 const uint_t num_msrs = nitems(arch_msr_iter) + nitems(generic_msr_iter) 3850 + (VMM_MTRR_VAR_MAX * 2); 3851 const uint32_t output_len = 3852 num_msrs * sizeof (struct vdi_field_entry_v1); 3853 *req->vdr_result_len = output_len; 3854 3855 if (req->vdr_len < output_len) { 3856 return (ENOSPC); 3857 } 3858 3859 struct vdi_field_entry_v1 *entryp = req->vdr_data; 3860 for (uint_t i = 0; i < nitems(arch_msr_iter); i++, entryp++) { 3861 const uint32_t msr = arch_msr_iter[i]; 3862 uint64_t val = 0; 3863 3864 int err = ops->vmgetmsr(vm->cookie, vcpuid, msr, &val); 3865 /* All of these MSRs are expected to work */ 3866 VERIFY0(err); 3867 entryp->vfe_ident = msr; 3868 entryp->vfe_value = val; 3869 } 3870 3871 struct vm_mtrr *mtrr = &vm->vcpu[vcpuid].mtrr; 3872 for (uint_t i = 0; i < nitems(generic_msr_iter); i++, entryp++) { 3873 const uint32_t msr = generic_msr_iter[i]; 3874 3875 entryp->vfe_ident = msr; 3876 switch (msr) { 3877 case MSR_TSC: 3878 /* 3879 * Communicate this as the difference from the VM-wide 3880 * offset of the boot time. 3881 */ 3882 entryp->vfe_value = vm->vcpu[vcpuid].tsc_offset; 3883 break; 3884 case MSR_MTRRcap: 3885 case MSR_MTRRdefType: 3886 case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7: 3887 case MSR_MTRR16kBase ... 
MSR_MTRR16kBase + 1: 3888 case MSR_MTRR64kBase: { 3889 int err = vm_rdmtrr(mtrr, msr, &entryp->vfe_value); 3890 VERIFY0(err); 3891 break; 3892 } 3893 default: 3894 panic("unexpected msr export %x", msr); 3895 } 3896 } 3897 /* Copy the variable MTRRs */ 3898 for (uint_t i = 0; i < (VMM_MTRR_VAR_MAX * 2); i++, entryp++) { 3899 const uint32_t msr = MSR_MTRRVarBase + i; 3900 3901 entryp->vfe_ident = msr; 3902 int err = vm_rdmtrr(mtrr, msr, &entryp->vfe_value); 3903 VERIFY0(err); 3904 } 3905 return (0); 3906 } 3907 3908 static int 3909 vmm_data_write_msrs(struct vm *vm, int vcpuid, const vmm_data_req_t *req) 3910 { 3911 VERIFY3U(req->vdr_class, ==, VDC_MSR); 3912 VERIFY3U(req->vdr_version, ==, 1); 3913 3914 const struct vdi_field_entry_v1 *entryp = req->vdr_data; 3915 const uint_t entry_count = 3916 req->vdr_len / sizeof (struct vdi_field_entry_v1); 3917 struct vm_mtrr *mtrr = &vm->vcpu[vcpuid].mtrr; 3918 3919 /* 3920 * First make sure that all of the MSRs can be manipulated. 3921 * For now, this check is done by going through the getmsr handler 3922 */ 3923 for (uint_t i = 0; i < entry_count; i++, entryp++) { 3924 const uint32_t msr = entryp->vfe_ident; 3925 uint64_t val; 3926 int err = 0; 3927 3928 switch (msr) { 3929 case MSR_TSC: 3930 break; 3931 default: 3932 if (is_mtrr_msr(msr)) { 3933 err = vm_rdmtrr(mtrr, msr, &val); 3934 } else { 3935 err = ops->vmgetmsr(vm->cookie, vcpuid, msr, 3936 &val); 3937 } 3938 break; 3939 } 3940 if (err != 0) { 3941 return (err); 3942 } 3943 } 3944 3945 /* 3946 * Fairly confident that all of the 'set' operations are at least 3947 * targeting valid MSRs, continue on. 3948 */ 3949 entryp = req->vdr_data; 3950 for (uint_t i = 0; i < entry_count; i++, entryp++) { 3951 const uint32_t msr = entryp->vfe_ident; 3952 const uint64_t val = entryp->vfe_value; 3953 int err = 0; 3954 3955 switch (msr) { 3956 case MSR_TSC: 3957 vm->vcpu[vcpuid].tsc_offset = entryp->vfe_value; 3958 break; 3959 default: 3960 if (is_mtrr_msr(msr)) { 3961 if (msr == MSR_MTRRcap) { 3962 /* 3963 * MTRRcap is read-only. If the current 3964 * value matches the incoming one, 3965 * consider it a success 3966 */ 3967 uint64_t comp; 3968 err = vm_rdmtrr(mtrr, msr, &comp); 3969 if (err != 0 || comp != val) { 3970 err = EINVAL; 3971 } 3972 } else { 3973 err = vm_wrmtrr(mtrr, msr, val); 3974 } 3975 } else { 3976 err = ops->vmsetmsr(vm->cookie, vcpuid, msr, 3977 val); 3978 } 3979 break; 3980 } 3981 if (err != 0) { 3982 return (err); 3983 } 3984 } 3985 *req->vdr_result_len = entry_count * sizeof (struct vdi_field_entry_v1); 3986 3987 return (0); 3988 } 3989 3990 static const vmm_data_version_entry_t msr_v1 = { 3991 .vdve_class = VDC_MSR, 3992 .vdve_version = 1, 3993 .vdve_len_per_item = sizeof (struct vdi_field_entry_v1), 3994 /* Requires backend-specific dispatch */ 3995 .vdve_readf = NULL, 3996 .vdve_writef = NULL, 3997 }; 3998 VMM_DATA_VERSION(msr_v1); 3999 4000 static const uint32_t vmm_arch_v1_fields[] = { 4001 VAI_TSC_BOOT_OFFSET, 4002 VAI_BOOT_HRTIME, 4003 VAI_TSC_FREQ, 4004 }; 4005 4006 static bool 4007 vmm_read_arch_field(struct vm *vm, uint32_t ident, uint64_t *valp) 4008 { 4009 ASSERT(valp != NULL); 4010 4011 switch (ident) { 4012 case VAI_TSC_BOOT_OFFSET: 4013 *valp = vm->boot_tsc_offset; 4014 return (true); 4015 case VAI_BOOT_HRTIME: 4016 *valp = vm->boot_hrtime; 4017 return (true); 4018 case VAI_TSC_FREQ: 4019 /* 4020 * Since the system TSC calibration is not public, just derive 4021 * it from the scaling functions available.
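 * (unscalehrtime(NANOSEC) converts one second's worth of hrtime back into TSC ticks, which is effectively the frequency in Hz.)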
4022 */ 4023 *valp = unscalehrtime(NANOSEC); 4024 return (true); 4025 default: 4026 break; 4027 } 4028 return (false); 4029 } 4030 4031 static int 4032 vmm_data_read_vmm_arch(void *arg, const vmm_data_req_t *req) 4033 { 4034 struct vm *vm = arg; 4035 4036 VERIFY3U(req->vdr_class, ==, VDC_VMM_ARCH); 4037 VERIFY3U(req->vdr_version, ==, 1); 4038 4039 struct vdi_field_entry_v1 *entryp = req->vdr_data; 4040 4041 /* Specific fields requested */ 4042 if ((req->vdr_flags & VDX_FLAG_READ_COPYIN) != 0) { 4043 const uint_t count = 4044 req->vdr_len / sizeof (struct vdi_field_entry_v1); 4045 4046 for (uint_t i = 0; i < count; i++, entryp++) { 4047 if (!vmm_read_arch_field(vm, entryp->vfe_ident, 4048 &entryp->vfe_value)) { 4049 return (EINVAL); 4050 } 4051 } 4052 *req->vdr_result_len = 4053 count * sizeof (struct vdi_field_entry_v1); 4054 return (0); 4055 } 4056 4057 /* Emit all of the possible values */ 4058 const uint32_t total_size = nitems(vmm_arch_v1_fields) * 4059 sizeof (struct vdi_field_entry_v1); 4060 *req->vdr_result_len = total_size; 4061 if (req->vdr_len < total_size) { 4062 return (ENOSPC); 4063 } 4064 for (uint_t i = 0; i < nitems(vmm_arch_v1_fields); i++, entryp++) { 4065 entryp->vfe_ident = vmm_arch_v1_fields[i]; 4066 VERIFY(vmm_read_arch_field(vm, entryp->vfe_ident, 4067 &entryp->vfe_value)); 4068 } 4069 return (0); 4070 } 4071 4072 static int 4073 vmm_data_write_vmm_arch(void *arg, const vmm_data_req_t *req) 4074 { 4075 struct vm *vm = arg; 4076 4077 VERIFY3U(req->vdr_class, ==, VDC_VMM_ARCH); 4078 VERIFY3U(req->vdr_version, ==, 1); 4079 4080 const struct vdi_field_entry_v1 *entryp = req->vdr_data; 4081 const uint_t entry_count = 4082 req->vdr_len / sizeof (struct vdi_field_entry_v1); 4083 4084 for (uint_t i = 0; i < entry_count; i++, entryp++) { 4085 const uint64_t val = entryp->vfe_value; 4086 4087 switch (entryp->vfe_ident) { 4088 case VAI_TSC_BOOT_OFFSET: 4089 vm->boot_tsc_offset = val; 4090 break; 4091 case VAI_BOOT_HRTIME: 4092 vm->boot_hrtime = val; 4093 break; 4094 case VAI_TSC_FREQ: 4095 /* Guest TSC frequency not (currently) adjustable */ 4096 return (EPERM); 4097 default: 4098 return (EINVAL); 4099 } 4100 } 4101 *req->vdr_result_len = entry_count * sizeof (struct vdi_field_entry_v1); 4102 return (0); 4103 } 4104 4105 static const vmm_data_version_entry_t vmm_arch_v1 = { 4106 .vdve_class = VDC_VMM_ARCH, 4107 .vdve_version = 1, 4108 .vdve_len_per_item = sizeof (struct vdi_field_entry_v1), 4109 .vdve_readf = vmm_data_read_vmm_arch, 4110 .vdve_writef = vmm_data_write_vmm_arch, 4111 }; 4112 VMM_DATA_VERSION(vmm_arch_v1); 4113 4114 static int 4115 vmm_data_read_versions(void *arg, const vmm_data_req_t *req) 4116 { 4117 VERIFY3U(req->vdr_class, ==, VDC_VERSION); 4118 VERIFY3U(req->vdr_version, ==, 1); 4119 4120 const uint32_t total_size = SET_COUNT(vmm_data_version_entries) * 4121 sizeof (struct vdi_version_entry_v1); 4122 4123 /* Make sure there is room for all of the entries */ 4124 *req->vdr_result_len = total_size; 4125 if (req->vdr_len < *req->vdr_result_len) { 4126 return (ENOSPC); 4127 } 4128 4129 struct vdi_version_entry_v1 *entryp = req->vdr_data; 4130 const vmm_data_version_entry_t **vdpp; 4131 SET_FOREACH(vdpp, vmm_data_version_entries) { 4132 const vmm_data_version_entry_t *vdp = *vdpp; 4133 4134 entryp->vve_class = vdp->vdve_class; 4135 entryp->vve_version = vdp->vdve_version; 4136 entryp->vve_len_expect = vdp->vdve_len_expect; 4137 entryp->vve_len_per_item = vdp->vdve_len_per_item; 4138 entryp++; 4139 } 4140 return (0); 4141 } 4142 4143 static int 4144 
vmm_data_write_versions(void *arg, const vmm_data_req_t *req) 4145 { 4146 /* Writing to the version information makes no sense */ 4147 return (EPERM); 4148 } 4149 4150 static const vmm_data_version_entry_t versions_v1 = { 4151 .vdve_class = VDC_VERSION, 4152 .vdve_version = 1, 4153 .vdve_len_per_item = sizeof (struct vdi_version_entry_v1), 4154 .vdve_readf = vmm_data_read_versions, 4155 .vdve_writef = vmm_data_write_versions, 4156 }; 4157 VMM_DATA_VERSION(versions_v1); 4158 4159 int 4160 vmm_data_read(struct vm *vm, int vcpuid, const vmm_data_req_t *req) 4161 { 4162 int err = 0; 4163 4164 if (vmm_data_is_cpu_specific(req->vdr_class)) { 4165 if (vcpuid >= VM_MAXCPU) { 4166 return (EINVAL); 4167 } 4168 } 4169 4170 const vmm_data_version_entry_t *entry = NULL; 4171 err = vmm_data_find(req, &entry); 4172 if (err != 0) { 4173 return (err); 4174 } 4175 ASSERT(entry != NULL); 4176 4177 void *datap = vmm_data_from_class(req, vm, vcpuid); 4178 if (datap != NULL) { 4179 err = entry->vdve_readf(datap, req); 4180 4181 /* 4182 * Successful reads of fixed-length data should populate the 4183 * length of that result. 4184 */ 4185 if (err == 0 && entry->vdve_len_expect != 0) { 4186 *req->vdr_result_len = entry->vdve_len_expect; 4187 } 4188 } else { 4189 switch (req->vdr_class) { 4190 case VDC_MSR: 4191 err = vmm_data_read_msrs(vm, vcpuid, req); 4192 break; 4193 case VDC_FPU: 4194 /* TODO: wire up to xsave export via hma_fpu iface */ 4195 err = EINVAL; 4196 break; 4197 case VDC_REGISTER: 4198 default: 4199 err = EINVAL; 4200 break; 4201 } 4202 } 4203 4204 return (err); 4205 } 4206 4207 int 4208 vmm_data_write(struct vm *vm, int vcpuid, const vmm_data_req_t *req) 4209 { 4210 int err = 0; 4211 4212 if (vmm_data_is_cpu_specific(req->vdr_class)) { 4213 if (vcpuid >= VM_MAXCPU) { 4214 return (EINVAL); 4215 } 4216 } 4217 4218 const vmm_data_version_entry_t *entry = NULL; 4219 err = vmm_data_find(req, &entry); 4220 if (err != 0) { 4221 return (err); 4222 } 4223 ASSERT(entry != NULL); 4224 4225 void *datap = vmm_data_from_class(req, vm, vcpuid); 4226 if (datap != NULL) { 4227 err = entry->vdve_writef(datap, req); 4228 /* 4229 * Successful writes of fixed-length data should populate the 4230 * length of that result. 4231 */ 4232 if (err == 0 && entry->vdve_len_expect != 0) { 4233 *req->vdr_result_len = entry->vdve_len_expect; 4234 } 4235 } else { 4236 switch (req->vdr_class) { 4237 case VDC_MSR: 4238 err = vmm_data_write_msrs(vm, vcpuid, req); 4239 break; 4240 case VDC_FPU: 4241 /* TODO: wire up to xsave import via hma_fpu iface */ 4242 err = EINVAL; 4243 break; 4244 case VDC_REGISTER: 4245 default: 4246 err = EINVAL; 4247 break; 4248 } 4249 } 4250 4251 return (err); 4252 } 4253
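/*
 * Illustrative sketch (not part of this file): one way a caller could
 * drive the vmm_data interface above, sizing its buffer via the ENOSPC
 * convention. The names 'buf', 'buf_len', and 'need' are hypothetical.
 *
 *	uint32_t need = 0;
 *	vmm_data_req_t req = {
 *		.vdr_class = VDC_VERSION,
 *		.vdr_version = 1,
 *		.vdr_len = buf_len,
 *		.vdr_data = buf,
 *		.vdr_result_len = &need,
 *	};
 *	if (vmm_data_read(vm, 0, &req) == ENOSPC) {
 *		(grow 'buf' to at least 'need' bytes and retry)
 *	}
 */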