/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2018 Joyent, Inc.
 * Copyright 2021 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/systm.h>
#include <sys/sunddi.h>
#include <sys/hma.h>

#include <machine/md_var.h>
#include <x86/psl.h>
#include <x86/apicreg.h>

#include <machine/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmparam.h>
#include <sys/vmm_instruction_emul.h>
#include <sys/vmm_vm.h>
#include <sys/vmm_gpt.h>

#include "vmm_ioport.h"
#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_util.h"
#include "vatpic.h"
#include "vatpit.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vpmtmr.h"
#include "vrtc.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

/* Flags for vtc_status */
#define	VTCS_FPU_RESTORED	1 /* guest FPU restored, host FPU saved */
#define	VTCS_FPU_CTX_CRITICAL	2 /* in ctx where FPU restore cannot be lazy */

typedef struct vm_thread_ctx {
	struct vm	*vtc_vm;
	int		vtc_vcpuid;
	uint_t		vtc_status;
	enum vcpu_ustate vtc_ustate;
} vm_thread_ctx_t;

#define	VMM_MTRR_VAR_MAX 10
#define	VMM_MTRR_DEF_MASK \
	(MTRR_DEF_ENABLE | MTRR_DEF_FIXED_ENABLE | MTRR_DEF_TYPE)
#define	VMM_MTRR_PHYSBASE_MASK (MTRR_PHYSBASE_PHYSBASE | MTRR_PHYSBASE_TYPE)
#define	VMM_MTRR_PHYSMASK_MASK (MTRR_PHYSMASK_PHYSMASK | MTRR_PHYSMASK_VALID)
struct vm_mtrr {
	uint64_t def_type;
	uint64_t fixed4k[8];
	uint64_t fixed16k[2];
	uint64_t fixed64k;
	struct {
		uint64_t base;
		uint64_t mask;
	} var[VMM_MTRR_VAR_MAX];
};

/*
 * Initialization:
 * (a) allocated when vcpu is created
 * (i) initialized when vcpu is created and when it is reinitialized
 * (o) initialized the first time the vcpu is created
 * (x) initialized before use
 */
struct vcpu {
	/* (o) protects state, run_state, hostcpu, sipi_vector */
	kmutex_t	lock;

	enum vcpu_state	state;		/* (o) vcpu state */
	enum vcpu_run_state run_state;	/* (i) vcpu init/sipi/run state */
	kcondvar_t	vcpu_cv;	/* (o) cpu waiter cv */
	kcondvar_t	state_cv;	/* (o) IDLE-transition cv */
	int		hostcpu;	/* (o) vcpu's current host cpu */
	int		lastloccpu;	/* (o) last host cpu localized to */
	int		reqidle;	/* (i) request vcpu to idle */
	struct vlapic	*vlapic;	/* (i) APIC device model */
	enum x2apic_state x2apic_state;	/* (i) APIC mode */
	uint64_t	exitintinfo;	/* (i) events pending at VM exit */
	int		nmi_pending;	/* (i) NMI pending */
	int		extint_pending;	/* (i) INTR pending */
	int		exception_pending; /* (i) exception pending */
	int		exc_vector;	/* (x) exception collateral */
	int		exc_errcode_valid;
	uint32_t	exc_errcode;
	uint8_t		sipi_vector;	/* (i) SIPI vector */
	hma_fpu_t	*guestfpu;	/* (a,i) guest fpu state */
	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
	void		*stats;		/* (a,i) statistics */
	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
	uint64_t	nextrip;	/* (x) next instruction to execute */
	struct vie	*vie_ctx;	/* (x) instruction emulation context */
	vm_client_t	*vmclient;	/* (a) VM-system client */
	uint64_t	tsc_offset;	/* (x) offset from host TSC */
	struct vm_mtrr	mtrr;		/* (i) vcpu's MTRR */

	enum vcpu_ustate ustate;	/* (i) microstate for the vcpu */
	hrtime_t	ustate_when;	/* (i) time of last ustate change */
	uint64_t ustate_total[VU_MAX];	/* (o) total time spent in ustates */
	vm_thread_ctx_t	vtc;		/* (o) thread state for ctxops */
	struct ctxop	*ctxop;		/* (o) ctxop storage for vcpu */
};

#define	vcpu_lock(v)		mutex_enter(&((v)->lock))
#define	vcpu_unlock(v)		mutex_exit(&((v)->lock))
#define	vcpu_assert_locked(v)	ASSERT(MUTEX_HELD(&((v)->lock)))

struct mem_seg {
	size_t	len;
	bool	sysmem;
	vm_object_t *object;
};
#define	VM_MAX_MEMSEGS	5

struct mem_map {
	vm_paddr_t	gpa;
	size_t		len;
	vm_ooffset_t	segoff;
	int		segid;
	int		prot;
	int		flags;
};
#define	VM_MAX_MEMMAPS	8

/*
 * Initialization:
 * (o) initialized the first time the VM is created
 * (i) initialized when VM is created and when it is reinitialized
 * (x) initialized before use
 */
struct vm {
	void		*cookie;		/* (i) cpu-specific data */
	void		*iommu;			/* (x) iommu-specific data */
	struct vhpet	*vhpet;			/* (i) virtual HPET */
	struct vioapic	*vioapic;		/* (i) virtual ioapic */
	struct vatpic	*vatpic;		/* (i) virtual atpic */
	struct vatpit	*vatpit;		/* (i) virtual atpit */
	struct vpmtmr	*vpmtmr;		/* (i) virtual ACPI PM timer */
	struct vrtc	*vrtc;			/* (o) virtual RTC */
	volatile cpuset_t active_cpus;		/* (i) active vcpus */
	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for dbg */
	int		suspend;		/* (i) stop VM execution */
	volatile cpuset_t suspended_cpus;	/* (i) suspended vcpus */
	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
	struct vmspace	*vmspace;		/* (o) guest's address space */
	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
	/* The following describe the vm cpu topology */
	uint16_t	sockets;		/* (o) num of sockets */
	uint16_t	cores;			/* (o) num of cores/socket */
	uint16_t	threads;		/* (o) num of threads/core */
	uint16_t	maxcpus;		/* (o) max pluggable cpus */
	uint64_t	boot_tsc_offset;	/* (i) TSC offset at VM boot */

	struct ioport_config ioports;		/* (o) ioport handling */

	bool		mem_transient;		/* (o) alloc transient memory */
};

static int vmm_initialized;


static void
nullop_panic(void)
{
	panic("null vmm operation call");
}

/* Do not allow use of an un-set `ops` to do anything but panic */
static struct vmm_ops vmm_ops_null = {
	.init = (vmm_init_func_t)nullop_panic,
	.cleanup = (vmm_cleanup_func_t)nullop_panic,
	.resume = (vmm_resume_func_t)nullop_panic,
	.vminit = (vmi_init_func_t)nullop_panic,
	.vmrun = (vmi_run_func_t)nullop_panic,
	.vmcleanup = (vmi_cleanup_func_t)nullop_panic,
	.vmgetreg = (vmi_get_register_t)nullop_panic,
	.vmsetreg = (vmi_set_register_t)nullop_panic,
	.vmgetdesc = (vmi_get_desc_t)nullop_panic,
	.vmsetdesc = (vmi_set_desc_t)nullop_panic,
	.vmgetcap = (vmi_get_cap_t)nullop_panic,
	.vmsetcap = (vmi_set_cap_t)nullop_panic,
	.vlapic_init = (vmi_vlapic_init)nullop_panic,
	.vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic,
	.vmsavectx = (vmi_savectx)nullop_panic,
	.vmrestorectx = (vmi_restorectx)nullop_panic,
};

static struct vmm_ops *ops = &vmm_ops_null;
static vmm_pte_ops_t *pte_ops = NULL;

#define	VMM_INIT()			((*ops->init)())
#define	VMM_CLEANUP()			((*ops->cleanup)())
#define	VMM_RESUME()			((*ops->resume)())

#define	VMINIT(vm)			((*ops->vminit)(vm))
#define	VMRUN(vmi, vcpu, rip)		((*ops->vmrun)(vmi, vcpu, rip))
#define	VMCLEANUP(vmi)			((*ops->vmcleanup)(vmi))

#define	VMGETREG(vmi, vcpu, num, rv)	((*ops->vmgetreg)(vmi, vcpu, num, rv))
#define	VMSETREG(vmi, vcpu, num, val)	((*ops->vmsetreg)(vmi, vcpu, num, val))
#define	VMGETDESC(vmi, vcpu, num, dsc)	((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
#define	VMSETDESC(vmi, vcpu, num, dsc)	((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
#define	VMGETCAP(vmi, vcpu, num, rv)	((*ops->vmgetcap)(vmi, vcpu, num, rv))
#define	VMSETCAP(vmi, vcpu, num, val)	((*ops->vmsetcap)(vmi, vcpu, num, val))
#define	VLAPIC_INIT(vmi, vcpu)		((*ops->vlapic_init)(vmi, vcpu))
#define	VLAPIC_CLEANUP(vmi, vlapic)	((*ops->vlapic_cleanup)(vmi, vlapic))

#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

SDT_PROVIDER_DEFINE(vmm);

static MALLOC_DEFINE(M_VM, "vm", "vm");

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    NULL);

/*
 * Halt the guest if all vcpus are executing a HLT instruction with
 * interrupts disabled.
 */
static int halt_detection_enabled = 1;

/* Trap into hypervisor on all guest exceptions and reflect them back */
static int trace_guest_exceptions;

static void vm_free_memmap(struct vm *vm, int ident);
static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);

static void vmm_savectx(void *);
static void vmm_restorectx(void *);
static const struct ctxop_template vmm_ctxop_tpl = {
	.ct_rev = CTXOP_TPL_REV,
	.ct_save = vmm_savectx,
	.ct_restore = vmm_restorectx,
};

#ifdef KTR
static const char *
vcpu_state2str(enum vcpu_state state)
{

	switch (state) {
	case VCPU_IDLE:
		return ("idle");
	case VCPU_FROZEN:
		return ("frozen");
	case VCPU_RUNNING:
		return ("running");
	case VCPU_SLEEPING:
		return ("sleeping");
	default:
		return ("unknown");
	}
}
#endif

static void
vcpu_cleanup(struct vm *vm, int i, bool destroy)
{
	struct vcpu *vcpu = &vm->vcpu[i];

	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
	if (destroy) {
		vmm_stat_free(vcpu->stats);

		hma_fpu_free(vcpu->guestfpu);
		vcpu->guestfpu = NULL;

		vie_free(vcpu->vie_ctx);
		vcpu->vie_ctx = NULL;

		vmc_destroy(vcpu->vmclient);
		vcpu->vmclient = NULL;

		ctxop_free(vcpu->ctxop);
		mutex_destroy(&vcpu->lock);
	}
}

static void
vcpu_init(struct vm *vm, int vcpu_id, bool create)
{
	struct vcpu *vcpu;

	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
	    ("vcpu_init: invalid vcpu %d", vcpu_id));

	vcpu = &vm->vcpu[vcpu_id];

	if (create) {
		mutex_init(&vcpu->lock, NULL, MUTEX_ADAPTIVE, NULL);

		vcpu->state = VCPU_IDLE;
		vcpu->hostcpu = NOCPU;
		vcpu->lastloccpu = NOCPU;
		vcpu->guestfpu = hma_fpu_alloc(KM_SLEEP);
		vcpu->stats = vmm_stat_alloc();
		vcpu->vie_ctx = vie_alloc();

		vcpu->ustate = VU_INIT;
		vcpu->ustate_when = gethrtime();

		vcpu->vtc.vtc_vm = vm;
		vcpu->vtc.vtc_vcpuid = vcpu_id;
		vcpu->ctxop = ctxop_allocate(&vmm_ctxop_tpl, &vcpu->vtc);
	} else {
		vie_reset(vcpu->vie_ctx);
		bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
		if (vcpu->ustate != VU_INIT) {
			vcpu_ustate_change(vm, vcpu_id, VU_INIT);
		}
		bzero(&vcpu->mtrr, sizeof (vcpu->mtrr));
	}

	vcpu->run_state = VRS_HALT;
	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
	(void) vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
	vcpu->reqidle = 0;
	vcpu->exitintinfo = 0;
	vcpu->nmi_pending = 0;
	vcpu->extint_pending = 0;
	vcpu->exception_pending = 0;
	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
	(void) hma_fpu_init(vcpu->guestfpu);
	vmm_stat_init(vcpu->stats);
	vcpu->tsc_offset = 0;
}

int
vcpu_trace_exceptions(struct vm *vm, int vcpuid)
{

	return (trace_guest_exceptions);
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= vm->maxcpus)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

struct vie *
vm_vie_ctx(struct vm *vm, int cpuid)
{
	if (cpuid < 0 || cpuid >= vm->maxcpus)
		panic("vm_vie_ctx: invalid cpuid %d", cpuid);

	return (vm->vcpu[cpuid].vie_ctx);
}

static int
vmm_init(void)
{
	vmm_host_state_init();

	if (vmm_is_intel()) {
		ops = &vmm_ops_intel;
		pte_ops = &ept_pte_ops;
	} else if (vmm_is_svm()) {
		ops = &vmm_ops_amd;
		pte_ops = &rvi_pte_ops;
	} else {
		return (ENXIO);
	}

	return (VMM_INIT());
}

int
vmm_mod_load()
{
	int error;

	VERIFY(vmm_initialized == 0);

	error = vmm_init();
	if (error == 0)
		vmm_initialized = 1;

	return (error);
}

int
vmm_mod_unload()
{
	int error;

	VERIFY(vmm_initialized == 1);

	iommu_cleanup();
	error = VMM_CLEANUP();
	if (error)
		return (error);
	vmm_initialized = 0;

	return (0);
}

static void
vm_init(struct vm *vm, bool create)
{
	int i;

	vm->cookie = VMINIT(vm);
	vm->iommu = NULL;
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);
	vm->vatpic = vatpic_init(vm);
	vm->vatpit = vatpit_init(vm);
	vm->vpmtmr = vpmtmr_init(vm);
	if (create)
		vm->vrtc = vrtc_init(vm);

	vm_inout_init(vm, &vm->ioports);

	CPU_ZERO(&vm->active_cpus);
	CPU_ZERO(&vm->debug_cpus);

	vm->suspend = 0;
	CPU_ZERO(&vm->suspended_cpus);

	for (i = 0; i < vm->maxcpus; i++)
		vcpu_init(vm, i, create);

	/*
	 * Configure the VM-wide TSC offset so that the call to vm_init()
	 * represents the boot time (when the TSC(s) read 0).  Each vCPU will
	 * have its own offset from this, which is altered if/when the guest
	 * writes to MSR_TSC.
	 *
	 * The TSC offsetting math is all unsigned, using overflow for negative
	 * offsets.  A reading of the TSC is negated to form the boot offset.
	 */
	vm->boot_tsc_offset = (uint64_t)(-(int64_t)rdtsc_offset());
}
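
/*
 * Worked example of the offset math above (editorial addition, not from the
 * original source): with 64-bit wrapping arithmetic, negating a host TSC
 * reading of 1000 yields 0xfffffffffffffc18, i.e. "-1000".  Adding that boot
 * offset to any later host TSC reading effectively subtracts 1000, so a guest
 * that reads the TSC immediately after vm_init() observes a value near zero.
 */
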
/*
 * The default CPU topology is a single thread per package.
 */
uint_t cores_per_package = 1;
uint_t threads_per_core = 1;

/*
 * Debugging tunable to enable dirty-page-tracking.
 * (Remains off by default for now)
 */
bool gpt_track_dirty = false;

int
vm_create(const char *name, uint64_t flags, struct vm **retvm)
{
	struct vm *vm;
	struct vmspace *vmspace;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	/* Name validation has already occurred */
	VERIFY3U(strnlen(name, VM_MAX_NAMELEN), <, VM_MAX_NAMELEN);

	vmspace = vmspace_alloc(VM_MAXUSER_ADDRESS, pte_ops, gpt_track_dirty);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = malloc(sizeof (struct vm), M_VM, M_WAITOK | M_ZERO);
	(void) strlcpy(vm->name, name, sizeof (vm->name));

	vm->vmspace = vmspace;
	vm->mem_transient = (flags & VCF_RESERVOIR_MEM) == 0;
	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		vm->vcpu[i].vmclient = vmspace_client_alloc(vmspace);
	}

	vm->sockets = 1;
	vm->cores = cores_per_package;	/* XXX backwards compatibility */
	vm->threads = threads_per_core;	/* XXX backwards compatibility */
	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */

	vm_init(vm, true);

	*retvm = vm;
	return (0);
}

void
vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
    uint16_t *threads, uint16_t *maxcpus)
{
	*sockets = vm->sockets;
	*cores = vm->cores;
	*threads = vm->threads;
	*maxcpus = vm->maxcpus;
}

uint16_t
vm_get_maxcpus(struct vm *vm)
{
	return (vm->maxcpus);
}

int
vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
    uint16_t threads, uint16_t maxcpus)
{
	if (maxcpus != 0)
		return (EINVAL);	/* XXX remove when supported */
	if ((sockets * cores * threads) > vm->maxcpus)
		return (EINVAL);
	/* XXX need to check sockets * cores * threads == vCPU, how? */
	vm->sockets = sockets;
	vm->cores = cores;
	vm->threads = threads;
	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */
	return (0);
}

static void
vm_cleanup(struct vm *vm, bool destroy)
{
	struct mem_map *mm;
	int i;

	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	/*
	 * Devices which attach their own ioport hooks should be cleaned up
	 * first so they can tear down those registrations.
	 */
	vpmtmr_cleanup(vm->vpmtmr);

	vm_inout_cleanup(vm, &vm->ioports);

	if (destroy)
		vrtc_cleanup(vm->vrtc);
	else
		vrtc_reset(vm->vrtc);

	vatpit_cleanup(vm->vatpit);
	vhpet_cleanup(vm->vhpet);
	vatpic_cleanup(vm->vatpic);
	vioapic_cleanup(vm->vioapic);

	for (i = 0; i < vm->maxcpus; i++)
		vcpu_cleanup(vm, i, destroy);

	VMCLEANUP(vm->cookie);

	/*
	 * System memory is removed from the guest address space only when
	 * the VM is destroyed. This is because the mapping remains the same
	 * across VM reset.
	 *
	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
	 * so those mappings are removed on a VM reset.
	 */
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (destroy || !sysmem_mapping(vm, mm)) {
			vm_free_memmap(vm, i);
		} else {
			/*
			 * We need to reset the IOMMU flag so this mapping can
			 * be reused when a VM is rebooted. Since the IOMMU
			 * domain has already been destroyed we can just reset
			 * the flag here.
			 */
			mm->flags &= ~VM_MEMMAP_F_IOMMU;
		}
	}

	if (destroy) {
		for (i = 0; i < VM_MAX_MEMSEGS; i++)
			vm_free_memseg(vm, i);

		vmspace_destroy(vm->vmspace);
		vm->vmspace = NULL;
	}
}

void
vm_destroy(struct vm *vm)
{
	vm_cleanup(vm, true);
	free(vm, M_VM);
}

int
vm_reinit(struct vm *vm, uint64_t flags)
{
	/* A virtual machine can be reset only if all vcpus are suspended. */
	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) != 0) {
		if ((flags & VM_REINIT_F_FORCE_SUSPEND) == 0) {
			return (EBUSY);
		}

		/*
		 * Force the VM (and all its vCPUs) into a suspended state.
		 * This should be quick and easy, since the vm_reinit() call is
		 * made while holding the VM write lock, which requires holding
		 * all of the vCPUs in the VCPU_FROZEN state.
		 */
		(void) atomic_cmpset_int((uint_t *)&vm->suspend, 0,
		    VM_SUSPEND_RESET);
		for (uint_t i = 0; i < vm->maxcpus; i++) {
			struct vcpu *vcpu = &vm->vcpu[i];

			if (CPU_ISSET(i, &vm->suspended_cpus) ||
			    !CPU_ISSET(i, &vm->active_cpus)) {
				continue;
			}

			vcpu_lock(vcpu);
			VERIFY3U(vcpu->state, ==, VCPU_FROZEN);
			CPU_SET_ATOMIC(i, &vm->suspended_cpus);
			vcpu_unlock(vcpu);
		}

		VERIFY0(CPU_CMP(&vm->suspended_cpus, &vm->active_cpus));
	}

	vm_cleanup(vm, false);
	vm_init(vm, false);
	return (0);
}

const char *
vm_name(struct vm *vm)
{
	return (vm->name);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	vm_object_t *obj;

	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
		return (ENOMEM);
	else
		return (0);
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	return (vmspace_unmap(vm->vmspace, gpa, gpa + len));
}

/*
 * Return 'true' if 'gpa' is allocated in the guest address space.
 *
 * This function is called in the context of a running vcpu which acts as
 * an implicit lock on 'vm->mem_maps[]'.
 */
bool
vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
{
	struct mem_map *mm;
	int i;

#ifdef INVARIANTS
	int hostcpu, state;
	state = vcpu_get_state(vm, vcpuid, &hostcpu);
	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
#endif

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
			return (true);		/* 'gpa' is sysmem or devmem */
	}

	if (ppt_is_mmio(vm, gpa))
		return (true);			/* 'gpa' is pci passthru mmio */

	return (false);
}

int
vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
{
	struct mem_seg *seg;
	vm_object_t *obj;

	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
		return (EINVAL);

	if (len == 0 || (len & PAGE_MASK))
		return (EINVAL);

	seg = &vm->mem_segs[ident];
	if (seg->object != NULL) {
		if (seg->len == len && seg->sysmem == sysmem)
			return (EEXIST);
		else
			return (EINVAL);
	}

	obj = vm_object_mem_allocate(len, vm->mem_transient);
	if (obj == NULL)
		return (ENOMEM);

	seg->len = len;
	seg->object = obj;
	seg->sysmem = sysmem;
	return (0);
}

int
vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
    vm_object_t **objptr)
{
	struct mem_seg *seg;

	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
		return (EINVAL);

	seg = &vm->mem_segs[ident];
	if (len)
		*len = seg->len;
	if (sysmem)
		*sysmem = seg->sysmem;
	if (objptr)
		*objptr = seg->object;
	return (0);
}

void
vm_free_memseg(struct vm *vm, int ident)
{
	struct mem_seg *seg;

	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
	    ("%s: invalid memseg ident %d", __func__, ident));

	seg = &vm->mem_segs[ident];
	if (seg->object != NULL) {
		vm_object_release(seg->object);
		bzero(seg, sizeof (struct mem_seg));
	}
}

int
vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
    size_t len, int prot, int flags)
{
	struct mem_seg *seg;
	struct mem_map *m, *map;
	vm_ooffset_t last;
	int i, error;

	if (prot == 0 || (prot & ~(PROT_ALL)) != 0)
		return (EINVAL);

	if (flags & ~VM_MEMMAP_F_WIRED)
		return (EINVAL);

	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
		return (EINVAL);

	seg = &vm->mem_segs[segid];
	if (seg->object == NULL)
		return (EINVAL);

	last = first + len;
	if (first < 0 || first >= last || last > seg->len)
		return (EINVAL);

	if ((gpa | first | last) & PAGE_MASK)
		return (EINVAL);

	map = NULL;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		m = &vm->mem_maps[i];
		if (m->len == 0) {
			map = m;
			break;
		}
	}

	if (map == NULL)
		return (ENOSPC);

	error = vmspace_map(vm->vmspace, seg->object, first, gpa, len, prot);
	if (error != 0)
		return (EFAULT);

	vm_object_reference(seg->object);

	if ((flags & VM_MEMMAP_F_WIRED) != 0) {
		error = vmspace_populate(vm->vmspace, gpa, gpa + len);
		if (error != 0) {
			VERIFY0(vmspace_unmap(vm->vmspace, gpa, gpa + len));
			return (EFAULT);
		}
	}

	map->gpa = gpa;
	map->len = len;
	map->segoff = first;
	map->segid = segid;
	map->prot = prot;
	map->flags = flags;
	return (0);
}
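
/*
 * Rough usage sketch (editorial addition, not from the original source): a
 * caller typically allocates a sysmem segment and then maps a window of it
 * into the guest physical address space, e.g.
 *
 *	error = vm_alloc_memseg(vm, 0, 1024 * 1024 * 1024, true);
 *	if (error == 0)
 *		error = vm_mmap_memseg(vm, 0, 0, 0, 1024 * 1024 * 1024,
 *		    PROT_ALL, 0);
 *
 * The segment identifier, offsets, and lengths here are invented for the
 * example; both the gpa and the segment offset must be page-aligned.
 */
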
int
vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	struct mem_map *m;
	int i;

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		m = &vm->mem_maps[i];
		if (m->gpa == gpa && m->len == len &&
		    (m->flags & VM_MEMMAP_F_IOMMU) == 0) {
			vm_free_memmap(vm, i);
			return (0);
		}
	}

	return (EINVAL);
}

int
vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
	struct mem_map *mm, *mmnext;
	int i;

	mmnext = NULL;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (mm->len == 0 || mm->gpa < *gpa)
			continue;
		if (mmnext == NULL || mm->gpa < mmnext->gpa)
			mmnext = mm;
	}

	if (mmnext != NULL) {
		*gpa = mmnext->gpa;
		if (segid)
			*segid = mmnext->segid;
		if (segoff)
			*segoff = mmnext->segoff;
		if (len)
			*len = mmnext->len;
		if (prot)
			*prot = mmnext->prot;
		if (flags)
			*flags = mmnext->flags;
		return (0);
	} else {
		return (ENOENT);
	}
}

static void
vm_free_memmap(struct vm *vm, int ident)
{
	struct mem_map *mm;
	int error;

	mm = &vm->mem_maps[ident];
	if (mm->len) {
		error = vmspace_unmap(vm->vmspace, mm->gpa,
		    mm->gpa + mm->len);
		KASSERT(error == 0, ("%s: vmspace_unmap error %d",
		    __func__, error));
		bzero(mm, sizeof (struct mem_map));
	}
}

static __inline bool
sysmem_mapping(struct vm *vm, struct mem_map *mm)
{

	if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
		return (true);
	else
		return (false);
}

vm_paddr_t
vmm_sysmem_maxaddr(struct vm *vm)
{
	struct mem_map *mm;
	vm_paddr_t maxaddr;
	int i;

	maxaddr = 0;
	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (sysmem_mapping(vm, mm)) {
			if (maxaddr < mm->gpa + mm->len)
				maxaddr = mm->gpa + mm->len;
		}
	}
	return (maxaddr);
}

static void
vm_iommu_modify(struct vm *vm, bool map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_map *mm;
#ifdef __FreeBSD__
	void *vp, *cookie, *host_domain;
#endif
	vm_client_t *vmc;

	sz = PAGE_SIZE;
#ifdef __FreeBSD__
	host_domain = iommu_host_domain();
#endif
	vmc = vmspace_client_alloc(vm->vmspace);

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (!sysmem_mapping(vm, mm))
			continue;

		if (map) {
			KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
			    ("iommu map found invalid memmap %lx/%lx/%x",
			    mm->gpa, mm->len, mm->flags));
			if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
				continue;
			mm->flags |= VM_MEMMAP_F_IOMMU;
		} else {
			if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
				continue;
			mm->flags &= ~VM_MEMMAP_F_IOMMU;
			KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
			    ("iommu unmap found invalid memmap %lx/%lx/%x",
			    mm->gpa, mm->len, mm->flags));
		}

		gpa = mm->gpa;
		while (gpa < mm->gpa + mm->len) {
			vm_page_t *vmp;

			vmp = vmc_hold(vmc, gpa, PROT_WRITE);
			ASSERT(vmp != NULL);
			hpa = ((uintptr_t)vmp_get_pfn(vmp) << PAGESHIFT);
			(void) vmp_release(vmp);

			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
#ifdef __FreeBSD__
				iommu_remove_mapping(host_domain, hpa, sz);
#endif
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
#ifdef __FreeBSD__
				iommu_create_mapping(host_domain, hpa, hpa, sz);
#endif
			}

			gpa += PAGE_SIZE;
		}
	}
	vmc_destroy(vmc);

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
#ifdef __FreeBSD__
	if (map)
		iommu_invalidate_tlb(host_domain);
	else
		iommu_invalidate_tlb(vm->iommu);
#else
	iommu_invalidate_tlb(vm->iommu);
#endif
}

int
vm_unassign_pptdev(struct vm *vm, int pptfd)
{
	int error;

	error = ppt_unassign_device(vm, pptfd);
	if (error)
		return (error);

	if (ppt_assigned_devices(vm) == 0)
		vm_iommu_modify(vm, false);

	return (0);
}

int
vm_assign_pptdev(struct vm *vm, int pptfd)
{
	int error;
	vm_paddr_t maxaddr;

	/* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
	if (ppt_assigned_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vmm_sysmem_maxaddr(vm);
		vm->iommu = iommu_create_domain(maxaddr);
		if (vm->iommu == NULL)
			return (ENXIO);
		vm_iommu_modify(vm, true);
	}

	error = ppt_assign_device(vm, pptfd);
	return (error);
}

int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{

	if (vcpu < 0 || vcpu >= vm->maxcpus)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMGETREG(vm->cookie, vcpu, reg, retval));
}

int
vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
{
	struct vcpu *vcpu;
	int error;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	error = VMSETREG(vm->cookie, vcpuid, reg, val);
	if (error || reg != VM_REG_GUEST_RIP)
		return (error);

	/* Set 'nextrip' to match the value of %rip */
	VCPU_CTR1(vm, vcpuid, "Setting nextrip to %lx", val);
	vcpu = &vm->vcpu[vcpuid];
	vcpu->nextrip = val;
	return (0);
}

static bool
is_descriptor_table(int reg)
{
	switch (reg) {
	case VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:
		return (true);
	default:
		return (false);
	}
}

static bool
is_segment_register(int reg)
{
	switch (reg) {
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:
		return (true);
	default:
		return (false);
	}
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
{

	if (vcpu < 0 || vcpu >= vm->maxcpus)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= vm->maxcpus)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}

static int
translate_hma_xsave_result(hma_fpu_xsave_result_t res)
{
	switch (res) {
	case HFXR_OK:
		return (0);
	case HFXR_NO_SPACE:
		return (ENOSPC);
	case HFXR_BAD_ALIGN:
	case HFXR_UNSUP_FMT:
	case HFXR_UNSUP_FEAT:
	case HFXR_INVALID_DATA:
		return (EINVAL);
	default:
		panic("unexpected xsave result");
	}
}

int
vm_get_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	hma_fpu_xsave_result_t res;

	res = hma_fpu_get_xsave_state(vcpu->guestfpu, buf, len);
	return (translate_hma_xsave_result(res));
}

int
vm_set_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	hma_fpu_xsave_result_t res;

	res = hma_fpu_set_xsave_state(vcpu->guestfpu, buf, len);
	return (translate_hma_xsave_result(res));
}

int
vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
		return (EINVAL);
	}

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	*state = vcpu->run_state;
	*sipi_vec = vcpu->sipi_vector;
	vcpu_unlock(vcpu);

	return (0);
}

int
vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
		return (EINVAL);
	}
	if (!VRS_IS_VALID(state)) {
		return (EINVAL);
	}

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	vcpu->run_state = state;
	vcpu->sipi_vector = sipi_vec;
	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
	vcpu_unlock(vcpu);

	return (0);
}

void
vm_track_dirty_pages(struct vm *vm, uint64_t gpa, size_t len, uint8_t *bitmap)
{
	vmspace_t *vms = vm_get_vmspace(vm);
	vmspace_track_dirty(vms, gpa, len, bitmap);
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{
	/* Save host FPU and restore guest FPU */
	fpu_stop_emulating();
	hma_fpu_start_guest(vcpu->guestfpu);

	/* restore guest XCR0 if XSAVE is enabled in the host */
	if (rcr4() & CR4_XSAVE)
		load_xcr(0, vcpu->guest_xcr0);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{

	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest XCR0 and restore host XCR0 */
	if (rcr4() & CR4_XSAVE) {
		vcpu->guest_xcr0 = rxcr(0);
		load_xcr(0, vmm_get_host_xcr0());
	}

	/* save guest FPU and restore host FPU */
	fpu_stop_emulating();
	hma_fpu_stop_guest(vcpu->guestfpu);
	/*
	 * When the host state has been restored, we should not re-enable
	 * CR0.TS on illumos for eager FPU.
	 */
}

static int
vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
    bool from_idle)
{
	struct vcpu *vcpu;
	int error;

	vcpu = &vm->vcpu[vcpuid];
	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state. This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
	 */
	if (from_idle) {
		while (vcpu->state != VCPU_IDLE) {
			vcpu->reqidle = 1;
			vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
			VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
			    "idle requested", vcpu_state2str(vcpu->state));
			cv_wait(&vcpu->state_cv, &vcpu->lock);
		}
	} else {
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
		    "vcpu idle state"));
	}

	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
	} else {
		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
		    "vcpu that is not running", vcpu->hostcpu));
	}

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error)
		return (EBUSY);

	VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
	    vcpu_state2str(vcpu->state), vcpu_state2str(newstate));

	vcpu->state = newstate;
	if (newstate == VCPU_RUNNING)
		vcpu->hostcpu = curcpu;
	else
		vcpu->hostcpu = NOCPU;

	if (newstate == VCPU_IDLE) {
		cv_broadcast(&vcpu->state_cv);
	}

	return (0);
}

static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
{
	struct vcpu *vcpu;
	int vcpu_halted, vm_halted;
	bool userspace_exit = false;

	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));

	vcpu = &vm->vcpu[vcpuid];
	vcpu_halted = 0;
	vm_halted = 0;

	vcpu_lock(vcpu);
	while (1) {
		/*
		 * Do a final check for pending interrupts (including NMI and
		 * INIT) before putting this thread to sleep.
		 */
		if (vm_nmi_pending(vm, vcpuid))
			break;
		if (vcpu_run_state_pending(vm, vcpuid))
			break;
		if (!intr_disabled) {
			if (vm_extint_pending(vm, vcpuid) ||
			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
				break;
			}
		}

		/*
		 * Also check for software events which would cause a wake-up.
		 * This will set the appropriate exitcode directly, rather than
		 * requiring a trip through VM_RUN().
		 */
		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
			userspace_exit = true;
			break;
		}

		/*
		 * Some Linux guests implement "halt" by having all vcpus
		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
		 * track of the vcpus that have entered this state. When all
		 * vcpus enter the halted state the virtual machine is halted.
		 */
		if (intr_disabled) {
			if (!vcpu_halted && halt_detection_enabled) {
				vcpu_halted = 1;
				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
			}
			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
				vm_halted = 1;
				break;
			}
		}

		vcpu_ustate_change(vm, vcpuid, VU_IDLE);
		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
		vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
	}

	if (vcpu_halted)
		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);

	vcpu_unlock(vcpu);

	if (vm_halted) {
		(void) vm_suspend(vm, VM_SUSPEND_HALT);
	}

	return (userspace_exit ? -1 : 0);
}

static int
vm_handle_paging(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	vm_client_t *vmc = vcpu->vmclient;
	struct vm_exit *vme = &vcpu->exitinfo;
	int rv, ftype;

	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
	    __func__, vme->inst_length));

	ftype = vme->u.paging.fault_type;
	KASSERT(ftype == PROT_READ ||
	    ftype == PROT_WRITE || ftype == PROT_EXEC,
	    ("vm_handle_paging: invalid fault_type %d", ftype));

	rv = vmc_fault(vmc, vme->u.paging.gpa, ftype);

	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %lx, "
	    "ftype = %d", rv, vme->u.paging.gpa, ftype);

	if (rv != 0)
		return (EFAULT);
	return (0);
}

int
vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
    int rsize)
{
	int err = ESRCH;

	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		struct vlapic *vlapic = vm_lapic(vm, cpuid);

		err = vlapic_mmio_read(vlapic, gpa, rval, rsize);
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize);
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize);
	}

	return (err);
}

int
vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
    int wsize)
{
	int err = ESRCH;

	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		struct vlapic *vlapic = vm_lapic(vm, cpuid);

		err = vlapic_mmio_write(vlapic, gpa, wval, wsize);
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize);
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize);
	}

	return (err);
}

static int
vm_handle_mmio_emul(struct vm *vm, int vcpuid)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	uint64_t inst_addr;
	int error, fault, cs_d;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	vie = vcpu->vie_ctx;

	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
	    __func__, vme->inst_length));

	inst_addr = vme->rip + vme->u.mmio_emul.cs_base;
	cs_d = vme->u.mmio_emul.cs_d;

	VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %lx",
	    vme->u.mmio_emul.gpa);

	/* Fetch the faulting instruction */
	if (vie_needs_fetch(vie)) {
		error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr,
		    &fault);
		if (error != 0) {
			return (error);
		} else if (fault) {
			/*
			 * If a fault during instruction fetch was encountered,
			 * it will have asserted that the appropriate exception
			 * be injected at next entry.
			 * No further work is required.
			 */
			return (0);
		}
	}

	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
		VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %lx",
		    inst_addr);
		/* Dump (unrecognized) instruction bytes in userspace */
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}
	if (vme->u.mmio_emul.gla != VIE_INVALID_GLA &&
	    vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) {
		/* Decoded GLA does not match GLA from VM exit state */
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}

repeat:
	error = vie_emulate_mmio(vie, vm, vcpuid);
	if (error < 0) {
		/*
		 * MMIO not handled by any of the in-kernel-emulated devices, so
		 * make a trip out to userspace for it.
		 */
		vie_exitinfo(vie, vme);
	} else if (error == EAGAIN) {
		/*
		 * Continue emulating the rep-prefixed instruction, which has
		 * not completed its iterations.
		 *
		 * In case this can be emulated in-kernel and has a high
		 * repetition count (causing a tight spin), it should be
		 * deferential to yield conditions.
		 */
		if (!vcpu_should_yield(vm, vcpuid)) {
			goto repeat;
		} else {
			/*
			 * Defer to the contending load by making a trip to
			 * userspace with a no-op (BOGUS) exit reason.
			 */
			vie_reset(vie);
			vme->exitcode = VM_EXITCODE_BOGUS;
			return (-1);
		}
	} else if (error == 0) {
		/* Update %rip now that instruction has been emulated */
		vie_advance_pc(vie, &vcpu->nextrip);
	}
	return (error);
}

static int
vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme)
{
	struct vcpu *vcpu;
	struct vie *vie;
	int err;

	vcpu = &vm->vcpu[vcpuid];
	vie = vcpu->vie_ctx;

repeat:
	err = vie_emulate_inout(vie, vm, vcpuid);

	if (err < 0) {
		/*
		 * In/out not handled by any of the in-kernel-emulated devices,
		 * so make a trip out to userspace for it.
		 */
		vie_exitinfo(vie, vme);
		return (err);
	} else if (err == EAGAIN) {
		/*
		 * Continue emulating the rep-prefixed ins/outs, which has not
		 * completed its iterations.
		 *
		 * In case this can be emulated in-kernel and has a high
		 * repetition count (causing a tight spin), it should be
		 * deferential to yield conditions.
		 */
		if (!vcpu_should_yield(vm, vcpuid)) {
			goto repeat;
		} else {
			/*
			 * Defer to the contending load by making a trip to
			 * userspace with a no-op (BOGUS) exit reason.
			 */
			vie_reset(vie);
			vme->exitcode = VM_EXITCODE_BOGUS;
			return (-1);
		}
	} else if (err != 0) {
		/* Emulation failure. Bail all the way out to userspace. */
		vme->exitcode = VM_EXITCODE_INST_EMUL;
		bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
		return (-1);
	}

	vie_advance_pc(vie, &vcpu->nextrip);
	return (0);
}

static int
vm_handle_inst_emul(struct vm *vm, int vcpuid)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	uint64_t cs_base;
	int error, fault, cs_d;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	vie = vcpu->vie_ctx;

	vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d);

	/* Fetch the faulting instruction */
	ASSERT(vie_needs_fetch(vie));
	error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base,
	    &fault);
	if (error != 0) {
		return (error);
	} else if (fault) {
		/*
		 * If a fault during instruction fetch was encountered, it will
		 * have asserted that the appropriate exception be injected at
		 * next entry. No further work is required.
		 */
		return (0);
	}

	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
		/* Dump (unrecognized) instruction bytes in userspace */
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}

	error = vie_emulate_other(vie, vm, vcpuid);
	if (error != 0) {
		/*
		 * Instruction emulation was unable to complete successfully, so
		 * kick it out to userspace for handling.
		 */
		vie_fallback_exitinfo(vie, vme);
	} else {
		/* Update %rip now that instruction has been emulated */
		vie_advance_pc(vie, &vcpu->nextrip);
	}
	return (error);
}

static int
vm_handle_suspend(struct vm *vm, int vcpuid)
{
	int i;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);

	/*
	 * Wait until all 'active_cpus' have suspended themselves.
	 */
	vcpu_lock(vcpu);
	vcpu_ustate_change(vm, vcpuid, VU_INIT);
	while (1) {
		int rc;

		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
			VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
			break;
		}

		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->lock, hz,
		    TR_CLOCK_TICK);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);

		/*
		 * If the userspace process driving the instance is killed, any
		 * vCPUs yet to be marked suspended (because they are not
		 * VM_RUN-ing in the kernel presently) will never reach that
		 * state.
		 *
		 * To avoid vm_handle_suspend() getting stuck in the kernel
		 * waiting for those vCPUs, offer a bail-out even though it
		 * means returning without all vCPUs in a suspended state.
		 */
		if (rc <= 0) {
			if ((curproc->p_flag & SEXITING) != 0) {
				break;
			}
		}
	}
	vcpu_unlock(vcpu);

	/*
	 * Wakeup the other sleeping vcpus and return to userspace.
	 */
	for (i = 0; i < vm->maxcpus; i++) {
		if (CPU_ISSET(i, &vm->suspended_cpus)) {
			vcpu_notify_event(vm, i);
		}
	}

	return (-1);
}

static int
vm_handle_reqidle(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
	vcpu->reqidle = 0;
	vcpu_unlock(vcpu);
	return (-1);
}

static int
vm_handle_run_state(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	bool handled = false;

	vcpu_lock(vcpu);
	while (1) {
		if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
			vcpu_unlock(vcpu);
			VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
			vcpu_lock(vcpu);

			vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
			vcpu->run_state |= VRS_INIT;
		}

		if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
		    (VRS_INIT | VRS_PEND_SIPI)) {
			const uint8_t vector = vcpu->sipi_vector;

			vcpu_unlock(vcpu);
			VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
			vcpu_lock(vcpu);

			vcpu->run_state &= ~VRS_PEND_SIPI;
			vcpu->run_state |= VRS_RUN;
		}

		/*
		 * If the vCPU is now in the running state, there is no need to
		 * wait for anything prior to re-entry.
		 */
		if ((vcpu->run_state & VRS_RUN) != 0) {
			handled = true;
			break;
		}

		/*
		 * Also check for software events which would cause a wake-up.
		 * This will set the appropriate exitcode directly, rather than
		 * requiring a trip through VM_RUN().
		 */
		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
			break;
		}

		vcpu_ustate_change(vm, vcpuid, VU_IDLE);
		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
		vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
	}
	vcpu_unlock(vcpu);

	return (handled ? 0 : -1);
}

static int
vm_rdmtrr(struct vm_mtrr *mtrr, uint32_t num, uint64_t *val)
{
	switch (num) {
	case MSR_MTRRcap:
		*val = MTRR_CAP_WC | MTRR_CAP_FIXED | VMM_MTRR_VAR_MAX;
		break;
	case MSR_MTRRdefType:
		*val = mtrr->def_type;
		break;
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
		*val = mtrr->fixed4k[num - MSR_MTRR4kBase];
		break;
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
		*val = mtrr->fixed16k[num - MSR_MTRR16kBase];
		break;
	case MSR_MTRR64kBase:
		*val = mtrr->fixed64k;
		break;
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: {
		uint_t offset = num - MSR_MTRRVarBase;
		if (offset % 2 == 0) {
			*val = mtrr->var[offset / 2].base;
		} else {
			*val = mtrr->var[offset / 2].mask;
		}
		break;
	}
	default:
		return (-1);
	}

	return (0);
}
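
/*
 * Illustrative note (editorial addition): the variable-range MTRR MSRs come in
 * base/mask pairs starting at MSR_MTRRVarBase, so an even offset from that
 * base selects a PHYSBASE register and an odd offset the matching PHYSMASK.
 * For example, MSR_MTRRVarBase + 3 has offset 3, which maps to
 * mtrr->var[1].mask in vm_rdmtrr() above and vm_wrmtrr() below.
 */
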
static int
vm_wrmtrr(struct vm_mtrr *mtrr, uint32_t num, uint64_t val)
{
	switch (num) {
	case MSR_MTRRcap:
		/* MTRRCAP is read only */
		return (-1);
	case MSR_MTRRdefType:
		if (val & ~VMM_MTRR_DEF_MASK) {
			/* generate #GP on writes to reserved fields */
			return (-1);
		}
		mtrr->def_type = val;
		break;
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
		mtrr->fixed4k[num - MSR_MTRR4kBase] = val;
		break;
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
		mtrr->fixed16k[num - MSR_MTRR16kBase] = val;
		break;
	case MSR_MTRR64kBase:
		mtrr->fixed64k = val;
		break;
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: {
		uint_t offset = num - MSR_MTRRVarBase;
		if (offset % 2 == 0) {
			if (val & ~VMM_MTRR_PHYSBASE_MASK) {
				/* generate #GP on writes to reserved fields */
				return (-1);
			}
			mtrr->var[offset / 2].base = val;
		} else {
			if (val & ~VMM_MTRR_PHYSMASK_MASK) {
				/* generate #GP on writes to reserved fields */
				return (-1);
			}
			mtrr->var[offset / 2].mask = val;
		}
		break;
	}
	default:
		return (-1);
	}

	return (0);
}

static int
vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	const uint32_t code = vme->u.msr.code;
	uint64_t val = 0;

	switch (code) {
	case MSR_MCG_CAP:
	case MSR_MCG_STATUS:
		val = 0;
		break;

	case MSR_MTRRcap:
	case MSR_MTRRdefType:
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
	case MSR_MTRR64kBase:
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
		if (vm_rdmtrr(&vcpu->mtrr, code, &val) != 0)
			vm_inject_gp(vm, vcpuid);
		break;

	case MSR_TSC:
		/*
		 * In all likelihood, this should always be handled in guest
		 * context by VMX/SVM rather than taking an exit. (Both VMX and
		 * SVM pass through read-only access to MSR_TSC to the guest.)
		 *
		 * No physical offset is requested of vcpu_tsc_offset() since
		 * rdtsc_offset() takes care of that instead.
		 */
		val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset();
		break;

	default:
		/*
		 * Anything not handled at this point will be kicked out to
		 * userspace for attempted processing there.
		 */
		return (-1);
	}

	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
	    val & 0xffffffff));
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX,
	    val >> 32));
	return (0);
}

static int
vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	const uint32_t code = vme->u.msr.code;
	const uint64_t val = vme->u.msr.wval;

	switch (code) {
	case MSR_MCG_CAP:
	case MSR_MCG_STATUS:
		/* Ignore writes */
		break;

	case MSR_MTRRcap:
	case MSR_MTRRdefType:
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
	case MSR_MTRR64kBase:
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
		if (vm_wrmtrr(&vcpu->mtrr, code, val) != 0)
			vm_inject_gp(vm, vcpuid);
		break;

	case MSR_TSC:
		/*
		 * The effect of writing the TSC MSR is that a subsequent read
		 * of the TSC would report that value written (plus any time
		 * elapsed between the write and the read). The guest TSC value
		 * is calculated from a global offset for the guest (which
		 * effectively makes its TSC read 0 at guest boot) and a
		 * per-vCPU offset to handle these writes to the MSR.
		 *
		 * To calculate that per-vCPU offset, we can work backwards from
		 * the guest value at the time of write:
		 *
		 *	value = host TSC + VM boot offset + vCPU offset
		 *
		 * so therefore:
		 *
		 *	value - host TSC - VM boot offset = vCPU offset
		 */
		vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset();
		break;

	default:
		/*
		 * Anything not handled at this point will be kicked out to
		 * userspace for attempted processing there.
		 */
		return (-1);
	}

	return (0);
}
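
/*
 * Worked example of the MSR_TSC write handling above (editorial addition,
 * using only the relationship stated in that comment): suppose the host TSC
 * read 1000 when vm_init() ran, so the VM boot offset is the wrapped value of
 * -1000.  If the guest writes 10000 to MSR_TSC while the host TSC reads 5000,
 * the per-vCPU offset becomes 10000 - (-1000) - 5000 = 6000.  A later read at
 * host TSC 5002 then yields 5002 + (-1000) + 6000 = 10002, as expected.
 */
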
int
vm_suspend(struct vm *vm, enum vm_suspend_how how)
{
	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
		return (EINVAL);

	if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) {
		return (EALREADY);
	}

	/*
	 * Notify all active vcpus that they are now suspended.
	 */
	for (uint_t i = 0; i < vm->maxcpus; i++) {
		struct vcpu *vcpu = &vm->vcpu[i];

		vcpu_lock(vcpu);
		if (vcpu->state == VCPU_IDLE || vcpu->state == VCPU_FROZEN) {
			/*
			 * Any vCPUs not actively running or in HLT can be
			 * marked as suspended immediately.
			 */
			if (CPU_ISSET(i, &vm->active_cpus)) {
				CPU_SET_ATOMIC(i, &vm->suspended_cpus);
			}
		} else {
			/*
			 * Those which are running or in HLT will pick up the
			 * suspended state after notification.
			 */
			vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
		}
		vcpu_unlock(vcpu);
	}
	return (0);
}

void
vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
{
	struct vm_exit *vmexit;

	vmexit = vm_exitinfo(vm, vcpuid);
	vmexit->rip = rip;
	vmexit->inst_length = 0;
	vmexit->exitcode = VM_EXITCODE_RUN_STATE;
	vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
}

/*
 * Some vmm resources, such as the lapic, may have CPU-specific resources
 * allocated to them which would benefit from migration onto the host CPU which
 * is processing the vcpu state.
 */
static void
vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
{
	/*
	 * Localizing cyclic resources requires acquisition of cpu_lock, and
	 * doing so with kpreempt disabled is a recipe for deadlock disaster.
	 */
	VERIFY(curthread->t_preempt == 0);

	/*
	 * Do not bother with localization if this vCPU is about to return to
	 * the host CPU it was last localized to.
	 */
	if (vcpu->lastloccpu == curcpu)
		return;

	/*
	 * Localize system-wide resources to the primary boot vCPU. While any
	 * of the other vCPUs may access them, it keeps the potential interrupt
	 * footprint constrained to CPUs involved with this instance.
	 */
	if (vcpu == &vm->vcpu[0]) {
		vhpet_localize_resources(vm->vhpet);
		vrtc_localize_resources(vm->vrtc);
		vatpit_localize_resources(vm->vatpit);
	}

	vlapic_localize_resources(vcpu->vlapic);

	vcpu->lastloccpu = curcpu;
}

static void
vmm_savectx(void *arg)
{
	vm_thread_ctx_t *vtc = arg;
	struct vm *vm = vtc->vtc_vm;
	const int vcpuid = vtc->vtc_vcpuid;

	if (ops->vmsavectx != NULL) {
		ops->vmsavectx(vm->cookie, vcpuid);
	}

	/*
	 * Account for going off-cpu, unless the vCPU is idled, where being
	 * off-cpu is the explicit point.
	 */
	if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
		vtc->vtc_ustate = vm->vcpu[vcpuid].ustate;
		vcpu_ustate_change(vm, vcpuid, VU_SCHED);
	}

	/*
	 * If the CPU holds the restored guest FPU state, save it and restore
	 * the host FPU state before this thread goes off-cpu.
	 */
	if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) {
		struct vcpu *vcpu = &vm->vcpu[vcpuid];

		save_guest_fpustate(vcpu);
		vtc->vtc_status &= ~VTCS_FPU_RESTORED;
	}
}

static void
vmm_restorectx(void *arg)
{
	vm_thread_ctx_t *vtc = arg;
	struct vm *vm = vtc->vtc_vm;
	const int vcpuid = vtc->vtc_vcpuid;

	/* Complete microstate accounting for vCPU being off-cpu */
	if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
		vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate);
	}

	/*
	 * When coming back on-cpu, only restore the guest FPU status if the
	 * thread is in a context marked as requiring it. This should be rare,
	 * occurring only when a future logic error results in a voluntary
	 * sleep during the VMRUN critical section.
	 *
	 * The common case will result in elision of the guest FPU state
	 * restoration, deferring that action until it is clearly necessary
	 * during vm_run.
	 */
	VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0);
	if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) {
		struct vcpu *vcpu = &vm->vcpu[vcpuid];

		restore_guest_fpustate(vcpu);
		vtc->vtc_status |= VTCS_FPU_RESTORED;
	}

	if (ops->vmrestorectx != NULL) {
		ops->vmrestorectx(vm->cookie, vcpuid);
	}

}
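
/*
 * Editorial note (added): vmm_savectx() and vmm_restorectx() are not called
 * directly from this file.  They are installed via vmm_ctxop_tpl, and the
 * ctxop that vm_run() attaches to the current thread causes them to run when
 * that thread is switched off of, or back onto, a host CPU.  That is what
 * drives the FPU and microstate handling above.
 */
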
2166 */ 2167 if (vm->vcpu[vcpuid].ustate != VU_IDLE) { 2168 vtc->vtc_ustate = vm->vcpu[vcpuid].ustate; 2169 vcpu_ustate_change(vm, vcpuid, VU_SCHED); 2170 } 2171 2172 /* 2173 * If the CPU holds the restored guest FPU state, save it and restore 2174 * the host FPU state before this thread goes off-cpu. 2175 */ 2176 if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) { 2177 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2178 2179 save_guest_fpustate(vcpu); 2180 vtc->vtc_status &= ~VTCS_FPU_RESTORED; 2181 } 2182 } 2183 2184 static void 2185 vmm_restorectx(void *arg) 2186 { 2187 vm_thread_ctx_t *vtc = arg; 2188 struct vm *vm = vtc->vtc_vm; 2189 const int vcpuid = vtc->vtc_vcpuid; 2190 2191 /* Complete microstate accounting for vCPU being off-cpu */ 2192 if (vm->vcpu[vcpuid].ustate != VU_IDLE) { 2193 vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate); 2194 } 2195 2196 /* 2197 * When coming back on-cpu, only restore the guest FPU status if the 2198 * thread is in a context marked as requiring it. This should be rare, 2199 * occurring only when a future logic error results in a voluntary 2200 * sleep during the VMRUN critical section. 2201 * 2202 * The common case will result in elision of the guest FPU state 2203 * restoration, deferring that action until it is clearly necessary 2204 * during vm_run. 2205 */ 2206 VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0); 2207 if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) { 2208 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 2209 2210 restore_guest_fpustate(vcpu); 2211 vtc->vtc_status |= VTCS_FPU_RESTORED; 2212 } 2213 2214 if (ops->vmrestorectx != NULL) { 2215 ops->vmrestorectx(vm->cookie, vcpuid); 2216 } 2217 2218 } 2219 2220 static int 2221 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry, 2222 struct vm_exit *vme) 2223 { 2224 struct vcpu *vcpu; 2225 struct vie *vie; 2226 int err; 2227 2228 vcpu = &vm->vcpu[vcpuid]; 2229 vie = vcpu->vie_ctx; 2230 err = 0; 2231 2232 switch (entry->cmd) { 2233 case VEC_DEFAULT: 2234 return (0); 2235 case VEC_DISCARD_INSTR: 2236 vie_reset(vie); 2237 return (0); 2238 case VEC_FULFILL_MMIO: 2239 err = vie_fulfill_mmio(vie, &entry->u.mmio); 2240 if (err == 0) { 2241 err = vie_emulate_mmio(vie, vm, vcpuid); 2242 if (err == 0) { 2243 vie_advance_pc(vie, &vcpu->nextrip); 2244 } else if (err < 0) { 2245 vie_exitinfo(vie, vme); 2246 } else if (err == EAGAIN) { 2247 /* 2248 * Clear the instruction emulation state in 2249 * order to re-enter VM context and continue 2250 * this 'rep <instruction>' 2251 */ 2252 vie_reset(vie); 2253 err = 0; 2254 } 2255 } 2256 break; 2257 case VEC_FULFILL_INOUT: 2258 err = vie_fulfill_inout(vie, &entry->u.inout); 2259 if (err == 0) { 2260 err = vie_emulate_inout(vie, vm, vcpuid); 2261 if (err == 0) { 2262 vie_advance_pc(vie, &vcpu->nextrip); 2263 } else if (err < 0) { 2264 vie_exitinfo(vie, vme); 2265 } else if (err == EAGAIN) { 2266 /* 2267 * Clear the instruction emulation state in 2268 * order to re-enter VM context and continue 2269 * this 'rep ins/outs' 2270 */ 2271 vie_reset(vie); 2272 err = 0; 2273 } 2274 } 2275 break; 2276 default: 2277 return (EINVAL); 2278 } 2279 return (err); 2280 } 2281 2282 static int 2283 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme) 2284 { 2285 struct vie *vie; 2286 2287 vie = vm->vcpu[vcpuid].vie_ctx; 2288 2289 if (vie_pending(vie)) { 2290 /* 2291 * Userspace has not fulfilled the pending needs of the 2292 * instruction emulation, so bail back out. 
2293 */ 2294 vie_exitinfo(vie, vme); 2295 return (-1); 2296 } 2297 2298 return (0); 2299 } 2300 2301 int 2302 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry) 2303 { 2304 int error; 2305 struct vcpu *vcpu; 2306 struct vm_exit *vme; 2307 bool intr_disabled; 2308 int affinity_type = CPU_CURRENT; 2309 2310 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2311 return (EINVAL); 2312 if (!CPU_ISSET(vcpuid, &vm->active_cpus)) 2313 return (EINVAL); 2314 2315 vcpu = &vm->vcpu[vcpuid]; 2316 vme = &vcpu->exitinfo; 2317 2318 vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN); 2319 2320 vcpu->vtc.vtc_status = 0; 2321 ctxop_attach(curthread, vcpu->ctxop); 2322 2323 error = vm_entry_actions(vm, vcpuid, entry, vme); 2324 if (error != 0) { 2325 goto exit; 2326 } 2327 2328 restart: 2329 error = vm_loop_checks(vm, vcpuid, vme); 2330 if (error != 0) { 2331 goto exit; 2332 } 2333 2334 thread_affinity_set(curthread, affinity_type); 2335 /* 2336 * Resource localization should happen after the CPU affinity for the 2337 * thread has been set to ensure that access from restricted contexts, 2338 * such as VMX-accelerated APIC operations, can occur without inducing 2339 * cyclic cross-calls. 2340 * 2341 * This must be done prior to disabling kpreempt via critical_enter(). 2342 */ 2343 vm_localize_resources(vm, vcpu); 2344 affinity_type = CPU_CURRENT; 2345 critical_enter(); 2346 2347 /* Force a trip through update_sregs to reload %fs/%gs and friends */ 2348 PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb); 2349 2350 if ((vcpu->vtc.vtc_status & VTCS_FPU_RESTORED) == 0) { 2351 restore_guest_fpustate(vcpu); 2352 vcpu->vtc.vtc_status |= VTCS_FPU_RESTORED; 2353 } 2354 vcpu->vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL; 2355 2356 vcpu_require_state(vm, vcpuid, VCPU_RUNNING); 2357 error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip); 2358 vcpu_require_state(vm, vcpuid, VCPU_FROZEN); 2359 2360 /* 2361 * Once clear of the delicate contexts comprising the VM_RUN handler, 2362 * thread CPU affinity can be loosened while other processing occurs. 
2363 */ 2364 vcpu->vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL; 2365 thread_affinity_clear(curthread); 2366 critical_exit(); 2367 2368 if (error != 0) { 2369 /* Communicate out any error from VMRUN() above */ 2370 goto exit; 2371 } 2372 2373 vcpu->nextrip = vme->rip + vme->inst_length; 2374 switch (vme->exitcode) { 2375 case VM_EXITCODE_REQIDLE: 2376 error = vm_handle_reqidle(vm, vcpuid); 2377 break; 2378 case VM_EXITCODE_RUN_STATE: 2379 error = vm_handle_run_state(vm, vcpuid); 2380 break; 2381 case VM_EXITCODE_SUSPENDED: 2382 error = vm_handle_suspend(vm, vcpuid); 2383 break; 2384 case VM_EXITCODE_IOAPIC_EOI: 2385 vioapic_process_eoi(vm, vcpuid, 2386 vme->u.ioapic_eoi.vector); 2387 break; 2388 case VM_EXITCODE_HLT: 2389 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); 2390 error = vm_handle_hlt(vm, vcpuid, intr_disabled); 2391 break; 2392 case VM_EXITCODE_PAGING: 2393 error = vm_handle_paging(vm, vcpuid); 2394 break; 2395 case VM_EXITCODE_MMIO_EMUL: 2396 error = vm_handle_mmio_emul(vm, vcpuid); 2397 break; 2398 case VM_EXITCODE_INOUT: 2399 error = vm_handle_inout(vm, vcpuid, vme); 2400 break; 2401 case VM_EXITCODE_INST_EMUL: 2402 error = vm_handle_inst_emul(vm, vcpuid); 2403 break; 2404 case VM_EXITCODE_MONITOR: 2405 case VM_EXITCODE_MWAIT: 2406 case VM_EXITCODE_VMINSN: 2407 vm_inject_ud(vm, vcpuid); 2408 break; 2409 case VM_EXITCODE_RDMSR: 2410 error = vm_handle_rdmsr(vm, vcpuid, vme); 2411 break; 2412 case VM_EXITCODE_WRMSR: 2413 error = vm_handle_wrmsr(vm, vcpuid, vme); 2414 break; 2415 case VM_EXITCODE_HT: 2416 affinity_type = CPU_BEST; 2417 break; 2418 case VM_EXITCODE_MTRAP: 2419 VERIFY0(vm_suspend_cpu(vm, vcpuid)); 2420 error = -1; 2421 break; 2422 default: 2423 /* handled in userland */ 2424 error = -1; 2425 break; 2426 } 2427 2428 if (error == 0) { 2429 /* VM exit conditions handled in-kernel, continue running */ 2430 goto restart; 2431 } 2432 2433 exit: 2434 kpreempt_disable(); 2435 ctxop_detach(curthread, vcpu->ctxop); 2436 /* Make sure all of the needed vCPU context state is saved */ 2437 vmm_savectx(&vcpu->vtc); 2438 kpreempt_enable(); 2439 2440 VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode); 2441 2442 vcpu_ustate_change(vm, vcpuid, VU_EMU_USER); 2443 return (error); 2444 } 2445 2446 int 2447 vm_restart_instruction(void *arg, int vcpuid) 2448 { 2449 struct vm *vm; 2450 struct vcpu *vcpu; 2451 enum vcpu_state state; 2452 uint64_t rip; 2453 int error; 2454 2455 vm = arg; 2456 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2457 return (EINVAL); 2458 2459 vcpu = &vm->vcpu[vcpuid]; 2460 state = vcpu_get_state(vm, vcpuid, NULL); 2461 if (state == VCPU_RUNNING) { 2462 /* 2463 * When a vcpu is "running" the next instruction is determined 2464 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'. 2465 * Thus setting 'inst_length' to zero will cause the current 2466 * instruction to be restarted. 2467 */ 2468 vcpu->exitinfo.inst_length = 0; 2469 VCPU_CTR1(vm, vcpuid, "restarting instruction at %lx by " 2470 "setting inst_length to zero", vcpu->exitinfo.rip); 2471 } else if (state == VCPU_FROZEN) { 2472 /* 2473 * When a vcpu is "frozen" it is outside the critical section 2474 * around VMRUN() and 'nextrip' points to the next instruction. 2475 * Thus instruction restart is achieved by setting 'nextrip' 2476 * to the vcpu's %rip. 
2477 */ 2478 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip); 2479 KASSERT(!error, ("%s: error %d getting rip", __func__, error)); 2480 VCPU_CTR2(vm, vcpuid, "restarting instruction by updating " 2481 "nextrip from %lx to %lx", vcpu->nextrip, rip); 2482 vcpu->nextrip = rip; 2483 } else { 2484 panic("%s: invalid state %d", __func__, state); 2485 } 2486 return (0); 2487 } 2488 2489 int 2490 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) 2491 { 2492 struct vcpu *vcpu; 2493 int type, vector; 2494 2495 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2496 return (EINVAL); 2497 2498 vcpu = &vm->vcpu[vcpuid]; 2499 2500 if (info & VM_INTINFO_VALID) { 2501 type = info & VM_INTINFO_TYPE; 2502 vector = info & 0xff; 2503 if (type == VM_INTINFO_NMI && vector != IDT_NMI) 2504 return (EINVAL); 2505 if (type == VM_INTINFO_HWEXCEPTION && vector >= 32) 2506 return (EINVAL); 2507 if (info & VM_INTINFO_RSVD) 2508 return (EINVAL); 2509 } else { 2510 info = 0; 2511 } 2512 VCPU_CTR2(vm, vcpuid, "%s: info1(%lx)", __func__, info); 2513 vcpu->exitintinfo = info; 2514 return (0); 2515 } 2516 2517 enum exc_class { 2518 EXC_BENIGN, 2519 EXC_CONTRIBUTORY, 2520 EXC_PAGEFAULT 2521 }; 2522 2523 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */ 2524 2525 static enum exc_class 2526 exception_class(uint64_t info) 2527 { 2528 int type, vector; 2529 2530 KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %lx", info)); 2531 type = info & VM_INTINFO_TYPE; 2532 vector = info & 0xff; 2533 2534 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */ 2535 switch (type) { 2536 case VM_INTINFO_HWINTR: 2537 case VM_INTINFO_SWINTR: 2538 case VM_INTINFO_NMI: 2539 return (EXC_BENIGN); 2540 default: 2541 /* 2542 * Hardware exception. 2543 * 2544 * SVM and VT-x use identical type values to represent NMI, 2545 * hardware interrupt and software interrupt. 2546 * 2547 * SVM uses type '3' for all exceptions. VT-x uses type '3' 2548 * for exceptions except #BP and #OF. #BP and #OF use a type 2549 * value of '5' or '6'. Therefore we don't check for explicit 2550 * values of 'type' to classify 'intinfo' into a hardware 2551 * exception. 2552 */ 2553 break; 2554 } 2555 2556 switch (vector) { 2557 case IDT_PF: 2558 case IDT_VE: 2559 return (EXC_PAGEFAULT); 2560 case IDT_DE: 2561 case IDT_TS: 2562 case IDT_NP: 2563 case IDT_SS: 2564 case IDT_GP: 2565 return (EXC_CONTRIBUTORY); 2566 default: 2567 return (EXC_BENIGN); 2568 } 2569 } 2570 2571 static int 2572 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, 2573 uint64_t *retinfo) 2574 { 2575 enum exc_class exc1, exc2; 2576 int type1, vector1; 2577 2578 KASSERT(info1 & VM_INTINFO_VALID, ("info1 %lx is not valid", info1)); 2579 KASSERT(info2 & VM_INTINFO_VALID, ("info2 %lx is not valid", info2)); 2580 2581 /* 2582 * If an exception occurs while attempting to call the double-fault 2583 * handler the processor enters shutdown mode (aka triple fault). 
2584 */ 2585 type1 = info1 & VM_INTINFO_TYPE; 2586 vector1 = info1 & 0xff; 2587 if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { 2588 VCPU_CTR2(vm, vcpuid, "triple fault: info1(%lx), info2(%lx)", 2589 info1, info2); 2590 (void) vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); 2591 *retinfo = 0; 2592 return (0); 2593 } 2594 2595 /* 2596 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3 2597 */ 2598 exc1 = exception_class(info1); 2599 exc2 = exception_class(info2); 2600 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) || 2601 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) { 2602 /* Convert nested fault into a double fault. */ 2603 *retinfo = IDT_DF; 2604 *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; 2605 *retinfo |= VM_INTINFO_DEL_ERRCODE; 2606 } else { 2607 /* Handle exceptions serially */ 2608 *retinfo = info2; 2609 } 2610 return (1); 2611 } 2612 2613 static uint64_t 2614 vcpu_exception_intinfo(struct vcpu *vcpu) 2615 { 2616 uint64_t info = 0; 2617 2618 if (vcpu->exception_pending) { 2619 info = vcpu->exc_vector & 0xff; 2620 info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; 2621 if (vcpu->exc_errcode_valid) { 2622 info |= VM_INTINFO_DEL_ERRCODE; 2623 info |= (uint64_t)vcpu->exc_errcode << 32; 2624 } 2625 } 2626 return (info); 2627 } 2628 2629 int 2630 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) 2631 { 2632 struct vcpu *vcpu; 2633 uint64_t info1, info2; 2634 int valid; 2635 2636 KASSERT(vcpuid >= 0 && 2637 vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid)); 2638 2639 vcpu = &vm->vcpu[vcpuid]; 2640 2641 info1 = vcpu->exitintinfo; 2642 vcpu->exitintinfo = 0; 2643 2644 info2 = 0; 2645 if (vcpu->exception_pending) { 2646 info2 = vcpu_exception_intinfo(vcpu); 2647 vcpu->exception_pending = 0; 2648 VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %lx", 2649 vcpu->exc_vector, info2); 2650 } 2651 2652 if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { 2653 valid = nested_fault(vm, vcpuid, info1, info2, retinfo); 2654 } else if (info1 & VM_INTINFO_VALID) { 2655 *retinfo = info1; 2656 valid = 1; 2657 } else if (info2 & VM_INTINFO_VALID) { 2658 *retinfo = info2; 2659 valid = 1; 2660 } else { 2661 valid = 0; 2662 } 2663 2664 if (valid) { 2665 VCPU_CTR4(vm, vcpuid, "%s: info1(%lx), info2(%lx), " 2666 "retinfo(%lx)", __func__, info1, info2, *retinfo); 2667 } 2668 2669 return (valid); 2670 } 2671 2672 int 2673 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) 2674 { 2675 struct vcpu *vcpu; 2676 2677 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2678 return (EINVAL); 2679 2680 vcpu = &vm->vcpu[vcpuid]; 2681 *info1 = vcpu->exitintinfo; 2682 *info2 = vcpu_exception_intinfo(vcpu); 2683 return (0); 2684 } 2685 2686 int 2687 vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid, 2688 uint32_t errcode, int restart_instruction) 2689 { 2690 struct vcpu *vcpu; 2691 uint64_t regval; 2692 int error; 2693 2694 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2695 return (EINVAL); 2696 2697 if (vector < 0 || vector >= 32) 2698 return (EINVAL); 2699 2700 /* 2701 * NMIs (which bear an exception vector of 2) are to be injected via 2702 * their own specialized path using vm_inject_nmi(). 2703 */ 2704 if (vector == 2) { 2705 return (EINVAL); 2706 } 2707 2708 /* 2709 * A double fault exception should never be injected directly into 2710 * the guest. It is a derived exception that results from specific 2711 * combinations of nested faults. 
2712 */ 2713 if (vector == IDT_DF) 2714 return (EINVAL); 2715 2716 vcpu = &vm->vcpu[vcpuid]; 2717 2718 if (vcpu->exception_pending) { 2719 VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to " 2720 "pending exception %d", vector, vcpu->exc_vector); 2721 return (EBUSY); 2722 } 2723 2724 if (errcode_valid) { 2725 /* 2726 * Exceptions don't deliver an error code in real mode. 2727 */ 2728 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, ®val); 2729 KASSERT(!error, ("%s: error %d getting CR0", __func__, error)); 2730 if (!(regval & CR0_PE)) 2731 errcode_valid = 0; 2732 } 2733 2734 /* 2735 * From section 26.6.1 "Interruptibility State" in Intel SDM: 2736 * 2737 * Event blocking by "STI" or "MOV SS" is cleared after guest executes 2738 * one instruction or incurs an exception. 2739 */ 2740 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); 2741 KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", 2742 __func__, error)); 2743 2744 if (restart_instruction) { 2745 VERIFY0(vm_restart_instruction(vm, vcpuid)); 2746 } 2747 2748 vcpu->exception_pending = 1; 2749 vcpu->exc_vector = vector; 2750 vcpu->exc_errcode = errcode; 2751 vcpu->exc_errcode_valid = errcode_valid; 2752 VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector); 2753 return (0); 2754 } 2755 2756 void 2757 vm_inject_fault(struct vm *vm, int vcpuid, int vector, int errcode_valid, 2758 int errcode) 2759 { 2760 int error; 2761 2762 error = vm_inject_exception(vm, vcpuid, vector, errcode_valid, 2763 errcode, 1); 2764 KASSERT(error == 0, ("vm_inject_exception error %d", error)); 2765 } 2766 2767 void 2768 vm_inject_ud(struct vm *vm, int vcpuid) 2769 { 2770 vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0); 2771 } 2772 2773 void 2774 vm_inject_gp(struct vm *vm, int vcpuid) 2775 { 2776 vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0); 2777 } 2778 2779 void 2780 vm_inject_ac(struct vm *vm, int vcpuid, int errcode) 2781 { 2782 vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode); 2783 } 2784 2785 void 2786 vm_inject_ss(struct vm *vm, int vcpuid, int errcode) 2787 { 2788 vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode); 2789 } 2790 2791 void 2792 vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2) 2793 { 2794 int error; 2795 2796 VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %x, cr2 %lx", 2797 error_code, cr2); 2798 2799 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2); 2800 KASSERT(error == 0, ("vm_set_register(cr2) error %d", error)); 2801 2802 vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code); 2803 } 2804 2805 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu"); 2806 2807 int 2808 vm_inject_nmi(struct vm *vm, int vcpuid) 2809 { 2810 struct vcpu *vcpu; 2811 2812 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2813 return (EINVAL); 2814 2815 vcpu = &vm->vcpu[vcpuid]; 2816 2817 vcpu->nmi_pending = 1; 2818 vcpu_notify_event(vm, vcpuid); 2819 return (0); 2820 } 2821 2822 int 2823 vm_nmi_pending(struct vm *vm, int vcpuid) 2824 { 2825 struct vcpu *vcpu; 2826 2827 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2828 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); 2829 2830 vcpu = &vm->vcpu[vcpuid]; 2831 2832 return (vcpu->nmi_pending); 2833 } 2834 2835 void 2836 vm_nmi_clear(struct vm *vm, int vcpuid) 2837 { 2838 struct vcpu *vcpu; 2839 2840 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2841 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); 2842 2843 vcpu = &vm->vcpu[vcpuid]; 2844 2845 if (vcpu->nmi_pending == 0) 2846 panic("vm_nmi_clear: inconsistent nmi_pending state"); 2847 2848 
vcpu->nmi_pending = 0; 2849 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1); 2850 } 2851 2852 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu"); 2853 2854 int 2855 vm_inject_extint(struct vm *vm, int vcpuid) 2856 { 2857 struct vcpu *vcpu; 2858 2859 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2860 return (EINVAL); 2861 2862 vcpu = &vm->vcpu[vcpuid]; 2863 2864 vcpu->extint_pending = 1; 2865 vcpu_notify_event(vm, vcpuid); 2866 return (0); 2867 } 2868 2869 int 2870 vm_extint_pending(struct vm *vm, int vcpuid) 2871 { 2872 struct vcpu *vcpu; 2873 2874 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2875 panic("vm_extint_pending: invalid vcpuid %d", vcpuid); 2876 2877 vcpu = &vm->vcpu[vcpuid]; 2878 2879 return (vcpu->extint_pending); 2880 } 2881 2882 void 2883 vm_extint_clear(struct vm *vm, int vcpuid) 2884 { 2885 struct vcpu *vcpu; 2886 2887 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2888 panic("vm_extint_pending: invalid vcpuid %d", vcpuid); 2889 2890 vcpu = &vm->vcpu[vcpuid]; 2891 2892 if (vcpu->extint_pending == 0) 2893 panic("vm_extint_clear: inconsistent extint_pending state"); 2894 2895 vcpu->extint_pending = 0; 2896 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1); 2897 } 2898 2899 int 2900 vm_inject_init(struct vm *vm, int vcpuid) 2901 { 2902 struct vcpu *vcpu; 2903 2904 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2905 return (EINVAL); 2906 2907 vcpu = &vm->vcpu[vcpuid]; 2908 vcpu_lock(vcpu); 2909 vcpu->run_state |= VRS_PEND_INIT; 2910 /* 2911 * As part of queuing the INIT request, clear any pending SIPI. It 2912 * would not otherwise survive across the reset of the vCPU when it 2913 * undergoes the requested INIT. We would not want it to linger when it 2914 * could be mistaken as a subsequent (after the INIT) SIPI request. 2915 */ 2916 vcpu->run_state &= ~VRS_PEND_SIPI; 2917 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2918 2919 vcpu_unlock(vcpu); 2920 return (0); 2921 } 2922 2923 int 2924 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector) 2925 { 2926 struct vcpu *vcpu; 2927 2928 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 2929 return (EINVAL); 2930 2931 vcpu = &vm->vcpu[vcpuid]; 2932 vcpu_lock(vcpu); 2933 vcpu->run_state |= VRS_PEND_SIPI; 2934 vcpu->sipi_vector = vector; 2935 /* SIPI is only actionable if the CPU is waiting in INIT state */ 2936 if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) { 2937 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 2938 } 2939 vcpu_unlock(vcpu); 2940 return (0); 2941 } 2942 2943 bool 2944 vcpu_run_state_pending(struct vm *vm, int vcpuid) 2945 { 2946 struct vcpu *vcpu; 2947 2948 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 2949 vcpu = &vm->vcpu[vcpuid]; 2950 2951 /* Of interest: vCPU not in running state or with pending INIT */ 2952 return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN); 2953 } 2954 2955 int 2956 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only) 2957 { 2958 struct seg_desc desc; 2959 const enum vm_reg_name clear_regs[] = { 2960 VM_REG_GUEST_CR2, 2961 VM_REG_GUEST_CR3, 2962 VM_REG_GUEST_CR4, 2963 VM_REG_GUEST_RAX, 2964 VM_REG_GUEST_RBX, 2965 VM_REG_GUEST_RCX, 2966 VM_REG_GUEST_RSI, 2967 VM_REG_GUEST_RDI, 2968 VM_REG_GUEST_RBP, 2969 VM_REG_GUEST_RSP, 2970 VM_REG_GUEST_R8, 2971 VM_REG_GUEST_R9, 2972 VM_REG_GUEST_R10, 2973 VM_REG_GUEST_R11, 2974 VM_REG_GUEST_R12, 2975 VM_REG_GUEST_R13, 2976 VM_REG_GUEST_R14, 2977 VM_REG_GUEST_R15, 2978 VM_REG_GUEST_DR0, 2979 VM_REG_GUEST_DR1, 2980 VM_REG_GUEST_DR2, 2981 VM_REG_GUEST_DR3, 2982 VM_REG_GUEST_EFER, 2983 }; 2984 const enum vm_reg_name data_segs[] = { 
	VM_REG_GUEST_SS,
	VM_REG_GUEST_DS,
	VM_REG_GUEST_ES,
	VM_REG_GUEST_FS,
	VM_REG_GUEST_GS,
	};
	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	for (uint_t i = 0; i < nitems(clear_regs); i++) {
		VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
	}

	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));

	/*
	 * The prescribed contents of %rdx differ slightly between the Intel
	 * and AMD architectural definitions.  The former expects the Extended
	 * Model in bits 16-19 where the latter expects all the Family, Model,
	 * and Stepping to be there.  Common boot ROMs appear to disregard
	 * this anyway, so we stick with a compromise value similar to what is
	 * spelled out in the Intel SDM.
	 */
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));

	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));

	/* CS: Present, R/W, Accessed */
	desc.access = 0x0093;
	desc.base = 0xffff0000;
	desc.limit = 0xffff;
	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));

	/* SS, DS, ES, FS, GS: Present, R/W, Accessed */
	desc.access = 0x0093;
	desc.base = 0;
	desc.limit = 0xffff;
	for (uint_t i = 0; i < nitems(data_segs); i++) {
		VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
		VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
	}

	/* GDTR, IDTR */
	desc.base = 0;
	desc.limit = 0xffff;
	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));

	/* LDTR: Present, LDT */
	desc.access = 0x0082;
	desc.base = 0;
	desc.limit = 0xffff;
	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));

	/* TR: Present, 32-bit TSS */
	desc.access = 0x008b;
	desc.base = 0;
	desc.limit = 0xffff;
	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));

	vlapic_reset(vm_lapic(vm, vcpuid));

	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));

	vcpu->exitintinfo = 0;
	vcpu->exception_pending = 0;
	vcpu->nmi_pending = 0;
	vcpu->extint_pending = 0;

	/*
	 * A CPU reset caused by power-on or system reset clears more state
	 * than one which is triggered by an INIT IPI.
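	 * (For instance, an INIT leaves the guest FPU/%xcr0 contents and the
	 * MTRR configuration from before the reset intact; only a full reset
	 * reinitializes them below.)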
3065 */ 3066 if (!init_only) { 3067 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; 3068 (void) hma_fpu_init(vcpu->guestfpu); 3069 3070 /* XXX: clear MSRs and other pieces */ 3071 bzero(&vcpu->mtrr, sizeof (vcpu->mtrr)); 3072 } 3073 3074 return (0); 3075 } 3076 3077 static int 3078 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector) 3079 { 3080 struct seg_desc desc; 3081 3082 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3083 return (EINVAL); 3084 3085 /* CS: Present, R/W, Accessed */ 3086 desc.access = 0x0093; 3087 desc.base = (uint64_t)vector << 12; 3088 desc.limit = 0xffff; 3089 VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); 3090 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 3091 (uint64_t)vector << 8)); 3092 3093 VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0)); 3094 3095 return (0); 3096 } 3097 3098 int 3099 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) 3100 { 3101 if (vcpu < 0 || vcpu >= vm->maxcpus) 3102 return (EINVAL); 3103 3104 if (type < 0 || type >= VM_CAP_MAX) 3105 return (EINVAL); 3106 3107 return (VMGETCAP(vm->cookie, vcpu, type, retval)); 3108 } 3109 3110 int 3111 vm_set_capability(struct vm *vm, int vcpu, int type, int val) 3112 { 3113 if (vcpu < 0 || vcpu >= vm->maxcpus) 3114 return (EINVAL); 3115 3116 if (type < 0 || type >= VM_CAP_MAX) 3117 return (EINVAL); 3118 3119 return (VMSETCAP(vm->cookie, vcpu, type, val)); 3120 } 3121 3122 struct vlapic * 3123 vm_lapic(struct vm *vm, int cpu) 3124 { 3125 return (vm->vcpu[cpu].vlapic); 3126 } 3127 3128 struct vioapic * 3129 vm_ioapic(struct vm *vm) 3130 { 3131 3132 return (vm->vioapic); 3133 } 3134 3135 struct vhpet * 3136 vm_hpet(struct vm *vm) 3137 { 3138 3139 return (vm->vhpet); 3140 } 3141 3142 void * 3143 vm_iommu_domain(struct vm *vm) 3144 { 3145 3146 return (vm->iommu); 3147 } 3148 3149 int 3150 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, 3151 bool from_idle) 3152 { 3153 int error; 3154 struct vcpu *vcpu; 3155 3156 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3157 panic("vcpu_set_state: invalid vcpuid %d", vcpuid); 3158 3159 vcpu = &vm->vcpu[vcpuid]; 3160 3161 vcpu_lock(vcpu); 3162 error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle); 3163 vcpu_unlock(vcpu); 3164 3165 return (error); 3166 } 3167 3168 enum vcpu_state 3169 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) 3170 { 3171 struct vcpu *vcpu; 3172 enum vcpu_state state; 3173 3174 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3175 panic("vcpu_get_state: invalid vcpuid %d", vcpuid); 3176 3177 vcpu = &vm->vcpu[vcpuid]; 3178 3179 vcpu_lock(vcpu); 3180 state = vcpu->state; 3181 if (hostcpu != NULL) 3182 *hostcpu = vcpu->hostcpu; 3183 vcpu_unlock(vcpu); 3184 3185 return (state); 3186 } 3187 3188 uint64_t 3189 vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj) 3190 { 3191 ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); 3192 3193 uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset; 3194 3195 if (phys_adj) { 3196 /* Include any offset for the current physical CPU too */ 3197 extern hrtime_t tsc_gethrtime_tick_delta(void); 3198 vcpu_off += (uint64_t)tsc_gethrtime_tick_delta(); 3199 } 3200 3201 return (vcpu_off); 3202 } 3203 3204 int 3205 vm_activate_cpu(struct vm *vm, int vcpuid) 3206 { 3207 3208 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3209 return (EINVAL); 3210 3211 if (CPU_ISSET(vcpuid, &vm->active_cpus)) 3212 return (EBUSY); 3213 3214 if (vm->suspend != 0) { 3215 return (EBUSY); 3216 } 3217 3218 VCPU_CTR0(vm, vcpuid, "activated"); 3219 CPU_SET_ATOMIC(vcpuid, 
	    &vm->active_cpus);

	/*
	 * It is possible that this vCPU was undergoing activation at the same
	 * time that the VM was being suspended.  If that happens to be the
	 * case, it should reflect the suspended state immediately.
	 */
	if (atomic_load_acq_int((uint_t *)&vm->suspend) != 0) {
		CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
	}

	return (0);
}

int
vm_suspend_cpu(struct vm *vm, int vcpuid)
{
	int i;

	if (vcpuid < -1 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (vcpuid == -1) {
		vm->debug_cpus = vm->active_cpus;
		for (i = 0; i < vm->maxcpus; i++) {
			if (CPU_ISSET(i, &vm->active_cpus))
				vcpu_notify_event(vm, i);
		}
	} else {
		if (!CPU_ISSET(vcpuid, &vm->active_cpus))
			return (EINVAL);

		CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
		vcpu_notify_event(vm, vcpuid);
	}
	return (0);
}

int
vm_resume_cpu(struct vm *vm, int vcpuid)
{

	if (vcpuid < -1 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (vcpuid == -1) {
		CPU_ZERO(&vm->debug_cpus);
	} else {
		if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
			return (EINVAL);

		CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
	}
	return (0);
}

static bool
vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry,
    uint64_t entry_rip)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	struct vm_exit *vme = &vcpu->exitinfo;
	bool bail = false;

	ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);

	if (vm->suspend) {
		if (on_entry) {
			VERIFY(vm->suspend > VM_SUSPEND_NONE &&
			    vm->suspend < VM_SUSPEND_LAST);

			vme->exitcode = VM_EXITCODE_SUSPENDED;
			vme->u.suspended.how = vm->suspend;
		} else {
			/*
			 * Handling VM suspend is complicated, so if that
			 * condition is detected outside of VM-entry itself,
			 * just emit a BOGUS exitcode so we take a lap to pick
			 * up the event during an entry and are directed into
			 * the vm_handle_suspend() logic.
			 */
			vme->exitcode = VM_EXITCODE_BOGUS;
		}
		bail = true;
	}
	if (vcpu->reqidle) {
		vme->exitcode = VM_EXITCODE_REQIDLE;
		vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);

		if (!on_entry) {
			/*
			 * A reqidle request detected outside of VM-entry can
			 * be handled directly by clearing the request (and
			 * taking a lap to userspace).
			 */
			vcpu_assert_locked(vcpu);
			vcpu->reqidle = 0;
		}
		bail = true;
	}
	if (vcpu_should_yield(vm, vcpuid)) {
		vme->exitcode = VM_EXITCODE_BOGUS;
		vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
		bail = true;
	}
	if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
		vme->exitcode = VM_EXITCODE_DEBUG;
		bail = true;
	}

	if (bail) {
		if (on_entry) {
			/*
			 * If bailing out during VM-entry, the current %rip
			 * must be recorded in the exitinfo.
			 */
			vme->rip = entry_rip;
		}
		vme->inst_length = 0;
	}
	return (bail);
}

static bool
vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
{
	/*
	 * Bail-out checks done prior to sleeping (in vCPU contexts like HLT
	 * or wait-for-SIPI) expect that %rip is already populated in the
	 * vm_exit structure, and we would only modify the exitcode.
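	 * Compare vcpu_entry_bailout_checks() below, where the caller passes
	 * in the %rip to be recorded if a bail-out condition is found.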
3349 */ 3350 return (vcpu_bailout_checks(vm, vcpuid, false, 0)); 3351 } 3352 3353 bool 3354 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip) 3355 { 3356 /* 3357 * Bail-out checks done as part of VM entry require an updated %rip to 3358 * populate the vm_exit struct if any of the conditions of interest are 3359 * matched in the check. 3360 */ 3361 return (vcpu_bailout_checks(vm, vcpuid, true, rip)); 3362 } 3363 3364 cpuset_t 3365 vm_active_cpus(struct vm *vm) 3366 { 3367 3368 return (vm->active_cpus); 3369 } 3370 3371 cpuset_t 3372 vm_debug_cpus(struct vm *vm) 3373 { 3374 3375 return (vm->debug_cpus); 3376 } 3377 3378 cpuset_t 3379 vm_suspended_cpus(struct vm *vm) 3380 { 3381 3382 return (vm->suspended_cpus); 3383 } 3384 3385 void * 3386 vcpu_stats(struct vm *vm, int vcpuid) 3387 { 3388 3389 return (vm->vcpu[vcpuid].stats); 3390 } 3391 3392 int 3393 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) 3394 { 3395 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3396 return (EINVAL); 3397 3398 *state = vm->vcpu[vcpuid].x2apic_state; 3399 3400 return (0); 3401 } 3402 3403 int 3404 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) 3405 { 3406 if (vcpuid < 0 || vcpuid >= vm->maxcpus) 3407 return (EINVAL); 3408 3409 if (state >= X2APIC_STATE_LAST) 3410 return (EINVAL); 3411 3412 vm->vcpu[vcpuid].x2apic_state = state; 3413 3414 vlapic_set_x2apic_state(vm, vcpuid, state); 3415 3416 return (0); 3417 } 3418 3419 /* 3420 * This function is called to ensure that a vcpu "sees" a pending event 3421 * as soon as possible: 3422 * - If the vcpu thread is sleeping then it is woken up. 3423 * - If the vcpu is running on a different host_cpu then an IPI will be directed 3424 * to the host_cpu to cause the vcpu to trap into the hypervisor. 3425 */ 3426 static void 3427 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype) 3428 { 3429 int hostcpu; 3430 3431 ASSERT(ntype == VCPU_NOTIFY_APIC || VCPU_NOTIFY_EXIT); 3432 3433 hostcpu = vcpu->hostcpu; 3434 if (vcpu->state == VCPU_RUNNING) { 3435 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); 3436 if (hostcpu != curcpu) { 3437 if (ntype == VCPU_NOTIFY_APIC) { 3438 vlapic_post_intr(vcpu->vlapic, hostcpu); 3439 } else { 3440 poke_cpu(hostcpu); 3441 } 3442 } else { 3443 /* 3444 * If the 'vcpu' is running on 'curcpu' then it must 3445 * be sending a notification to itself (e.g. SELF_IPI). 3446 * The pending event will be picked up when the vcpu 3447 * transitions back to guest context. 
3448 */ 3449 } 3450 } else { 3451 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " 3452 "with hostcpu %d", vcpu->state, hostcpu)); 3453 if (vcpu->state == VCPU_SLEEPING) { 3454 cv_signal(&vcpu->vcpu_cv); 3455 } 3456 } 3457 } 3458 3459 void 3460 vcpu_notify_event(struct vm *vm, int vcpuid) 3461 { 3462 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3463 3464 vcpu_lock(vcpu); 3465 vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); 3466 vcpu_unlock(vcpu); 3467 } 3468 3469 void 3470 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype) 3471 { 3472 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3473 3474 if (ntype == VCPU_NOTIFY_NONE) { 3475 return; 3476 } 3477 3478 vcpu_lock(vcpu); 3479 vcpu_notify_event_locked(vcpu, ntype); 3480 vcpu_unlock(vcpu); 3481 } 3482 3483 void 3484 vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate) 3485 { 3486 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3487 hrtime_t now = gethrtime(); 3488 3489 ASSERT3U(ustate, !=, vcpu->ustate); 3490 ASSERT3S(ustate, <, VU_MAX); 3491 ASSERT3S(ustate, >=, VU_INIT); 3492 3493 hrtime_t delta = now - vcpu->ustate_when; 3494 vcpu->ustate_total[vcpu->ustate] += delta; 3495 3496 membar_producer(); 3497 3498 vcpu->ustate_when = now; 3499 vcpu->ustate = ustate; 3500 } 3501 3502 struct vmspace * 3503 vm_get_vmspace(struct vm *vm) 3504 { 3505 3506 return (vm->vmspace); 3507 } 3508 3509 struct vm_client * 3510 vm_get_vmclient(struct vm *vm, int vcpuid) 3511 { 3512 return (vm->vcpu[vcpuid].vmclient); 3513 } 3514 3515 int 3516 vm_apicid2vcpuid(struct vm *vm, int apicid) 3517 { 3518 /* 3519 * XXX apic id is assumed to be numerically identical to vcpu id 3520 */ 3521 return (apicid); 3522 } 3523 3524 struct vatpic * 3525 vm_atpic(struct vm *vm) 3526 { 3527 return (vm->vatpic); 3528 } 3529 3530 struct vatpit * 3531 vm_atpit(struct vm *vm) 3532 { 3533 return (vm->vatpit); 3534 } 3535 3536 struct vpmtmr * 3537 vm_pmtmr(struct vm *vm) 3538 { 3539 3540 return (vm->vpmtmr); 3541 } 3542 3543 struct vrtc * 3544 vm_rtc(struct vm *vm) 3545 { 3546 3547 return (vm->vrtc); 3548 } 3549 3550 enum vm_reg_name 3551 vm_segment_name(int seg) 3552 { 3553 static enum vm_reg_name seg_names[] = { 3554 VM_REG_GUEST_ES, 3555 VM_REG_GUEST_CS, 3556 VM_REG_GUEST_SS, 3557 VM_REG_GUEST_DS, 3558 VM_REG_GUEST_FS, 3559 VM_REG_GUEST_GS 3560 }; 3561 3562 KASSERT(seg >= 0 && seg < nitems(seg_names), 3563 ("%s: invalid segment encoding %d", __func__, seg)); 3564 return (seg_names[seg]); 3565 } 3566 3567 void 3568 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, 3569 uint_t num_copyinfo) 3570 { 3571 for (uint_t idx = 0; idx < num_copyinfo; idx++) { 3572 if (copyinfo[idx].cookie != NULL) { 3573 (void) vmp_release((vm_page_t *)copyinfo[idx].cookie); 3574 } 3575 } 3576 bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo)); 3577 } 3578 3579 int 3580 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 3581 uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, 3582 uint_t num_copyinfo, int *fault) 3583 { 3584 uint_t idx, nused; 3585 size_t n, off, remaining; 3586 vm_client_t *vmc = vm_get_vmclient(vm, vcpuid); 3587 3588 bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo); 3589 3590 nused = 0; 3591 remaining = len; 3592 while (remaining > 0) { 3593 uint64_t gpa; 3594 int error; 3595 3596 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); 3597 error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault); 3598 if (error || *fault) 3599 return (error); 3600 off = gpa & PAGEOFFSET; 3601 n 
	    = min(remaining, PAGESIZE - off);
		copyinfo[nused].gpa = gpa;
		copyinfo[nused].len = n;
		remaining -= n;
		gla += n;
		nused++;
	}

	for (idx = 0; idx < nused; idx++) {
		vm_page_t *vmp;
		caddr_t hva;

		vmp = vmc_hold(vmc, copyinfo[idx].gpa & PAGEMASK, prot);
		if (vmp == NULL) {
			break;
		}
		if ((prot & PROT_WRITE) != 0) {
			hva = (caddr_t)vmp_get_writable(vmp);
		} else {
			hva = (caddr_t)vmp_get_readable(vmp);
		}
		copyinfo[idx].hva = hva + (copyinfo[idx].gpa & PAGEOFFSET);
		copyinfo[idx].cookie = vmp;
		copyinfo[idx].prot = prot;
	}

	if (idx != nused) {
		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
		return (EFAULT);
	} else {
		*fault = 0;
		return (0);
	}
}

void
vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
    size_t len)
{
	char *dst;
	int idx;

	dst = kaddr;
	idx = 0;
	while (len > 0) {
		ASSERT(copyinfo[idx].prot & PROT_READ);

		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
		len -= copyinfo[idx].len;
		dst += copyinfo[idx].len;
		idx++;
	}
}

void
vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
    struct vm_copyinfo *copyinfo, size_t len)
{
	const char *src;
	int idx;

	src = kaddr;
	idx = 0;
	while (len > 0) {
		ASSERT(copyinfo[idx].prot & PROT_WRITE);

		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
		len -= copyinfo[idx].len;
		src += copyinfo[idx].len;
		idx++;
	}
}

/*
 * Return the amount of in-use and wired memory for the VM.  Since
 * these are global stats, only return the values for vCPU 0.
 */
VMM_STAT_DECLARE(VMM_MEM_RESIDENT);

static void
vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{
	if (vcpu == 0) {
		vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
		    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
	}
}

VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);

int
vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port,
    uint8_t bytes, uint32_t *val)
{
	return (vm_inout_access(&vm->ioports, in, port, bytes, val));
}

/*
 * bhyve-internal interfaces to attach or detach IO port handlers.
 * Must be called with VM write lock held for safety.
 */
int
vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func,
    void *arg, void **cookie)
{
	int err;

	err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg);
	if (err == 0) {
		*cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
	}
	return (err);
}

int
vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func,
    void **old_arg)
{
	uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
	int err;

	err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg);
	if (err == 0) {
		*cookie = NULL;
	}
	return (err);
}

/*
 * External driver interfaces to attach or detach IO port handlers.
 * Must be called with VM write lock held for safety.
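 *
 * For illustration only (hypothetical handler and argument names), a driver
 * would typically pair these calls as:
 *
 *	void *cookie;
 *
 *	if (vm_ioport_hook(vm, MY_PORT, my_handler, my_arg, &cookie) == 0) {
 *		...
 *		vm_ioport_unhook(vm, &cookie);
 *	}
 *
 * Note that port 0 is rejected by vm_ioport_hook().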
3730 */ 3731 int 3732 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func, 3733 void *arg, void **cookie) 3734 { 3735 int err; 3736 3737 if (port == 0) { 3738 return (EINVAL); 3739 } 3740 3741 err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg); 3742 if (err == 0) { 3743 *cookie = (void *)IOP_GEN_COOKIE(func, arg, port); 3744 } 3745 return (err); 3746 } 3747 void 3748 vm_ioport_unhook(struct vm *vm, void **cookie) 3749 { 3750 uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie); 3751 ioport_handler_t old_func; 3752 void *old_arg; 3753 int err; 3754 3755 err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg); 3756 3757 /* ioport-hook-using drivers are expected to be well-behaved */ 3758 VERIFY0(err); 3759 VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie); 3760 3761 *cookie = NULL; 3762 } 3763 3764 int 3765 vmm_kstat_update_vcpu(struct kstat *ksp, int rw) 3766 { 3767 struct vm *vm = ksp->ks_private; 3768 vmm_vcpu_kstats_t *vvk = ksp->ks_data; 3769 const int vcpuid = vvk->vvk_vcpu.value.ui32; 3770 struct vcpu *vcpu = &vm->vcpu[vcpuid]; 3771 3772 ASSERT3U(vcpuid, <, VM_MAXCPU); 3773 3774 vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT]; 3775 vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN]; 3776 vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE]; 3777 vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN]; 3778 vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER]; 3779 vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED]; 3780 3781 return (0); 3782 } 3783