1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2011 NetApp, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 * $FreeBSD$ 29 */ 30 /* 31 * Copyright 2020 Joyent, Inc. 32 * Copyright 2021 Oxide Computer Company 33 */ 34 35 #include <sys/cdefs.h> 36 __FBSDID("$FreeBSD$"); 37 38 #include <sys/param.h> 39 #include <sys/systm.h> 40 #include <sys/proc.h> 41 42 #include <machine/clock.h> 43 #include <machine/cpufunc.h> 44 #include <machine/md_var.h> 45 #include <machine/specialreg.h> 46 #include <machine/vmm.h> 47 #include <sys/vmm_kernel.h> 48 49 #include "vmx.h" 50 #include "vmx_msr.h" 51 52 static bool 53 vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos) 54 { 55 56 return ((msr_val & (1UL << (bitpos + 32))) != 0); 57 } 58 59 static bool 60 vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos) 61 { 62 63 return ((msr_val & (1UL << bitpos)) == 0); 64 } 65 66 /* 67 * Generate a bitmask to be used for the VMCS execution control fields. 68 * 69 * The caller specifies what bits should be set to one in 'ones_mask' 70 * and what bits should be set to zero in 'zeros_mask'. The don't-care 71 * bits are set to the default value. The default values are obtained 72 * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining 73 * VMX Capabilities". 74 * 75 * Returns zero on success and non-zero on error. 76 */ 77 int 78 vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask, 79 uint32_t zeros_mask, uint32_t *retval) 80 { 81 int i; 82 uint64_t val, trueval; 83 bool true_ctls_avail, one_allowed, zero_allowed; 84 85 /* We cannot ask the same bit to be set to both '1' and '0' */ 86 if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask)) 87 return (EINVAL); 88 89 true_ctls_avail = (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) != 0; 90 91 val = rdmsr(ctl_reg); 92 if (true_ctls_avail) 93 trueval = rdmsr(true_ctl_reg); /* step c */ 94 else 95 trueval = val; /* step a */ 96 97 for (i = 0; i < 32; i++) { 98 one_allowed = vmx_ctl_allows_one_setting(trueval, i); 99 zero_allowed = vmx_ctl_allows_zero_setting(trueval, i); 100 101 KASSERT(one_allowed || zero_allowed, 102 ("invalid zero/one setting for bit %d of ctl 0x%0x, " 103 "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg)); 104 105 if (zero_allowed && !one_allowed) { /* b(i),c(i) */ 106 if (ones_mask & (1 << i)) 107 return (EINVAL); 108 *retval &= ~(1 << i); 109 } else if (one_allowed && !zero_allowed) { /* b(i),c(i) */ 110 if (zeros_mask & (1 << i)) 111 return (EINVAL); 112 *retval |= 1 << i; 113 } else { 114 if (zeros_mask & (1 << i)) { 115 /* b(ii),c(ii) */ 116 *retval &= ~(1 << i); 117 } else if (ones_mask & (1 << i)) { 118 /* b(ii), c(ii) */ 119 *retval |= 1 << i; 120 } else if (!true_ctls_avail) { 121 /* b(iii) */ 122 *retval &= ~(1 << i); 123 } else if (vmx_ctl_allows_zero_setting(val, i)) { 124 /* c(iii) */ 125 *retval &= ~(1 << i); 126 } else if (vmx_ctl_allows_one_setting(val, i)) { 127 /* c(iv) */ 128 *retval |= 1 << i; 129 } else { 130 panic("vmx_set_ctlreg: unable to determine " 131 "correct value of ctl bit %d for msr " 132 "0x%0x and true msr 0x%0x", i, ctl_reg, 133 true_ctl_reg); 134 } 135 } 136 } 137 138 return (0); 139 } 140 141 void 142 vmx_msr_bitmap_initialize(struct vmx *vmx) 143 { 144 for (uint_t i = 0; i < VM_MAXCPU; i++) { 145 uint8_t *bitmap; 146 147 bitmap = kmem_alloc(PAGESIZE, KM_SLEEP); 148 VERIFY3U((uintptr_t)bitmap & PAGEOFFSET, ==, 0); 149 memset(bitmap, 0xff, PAGESIZE); 150 151 vmx->msr_bitmap[i] = bitmap; 152 } 153 } 154 155 void 156 vmx_msr_bitmap_destroy(struct vmx *vmx) 157 { 158 for (uint_t i = 0; i < VM_MAXCPU; i++) { 159 VERIFY3P(vmx->msr_bitmap[i], !=, NULL); 160 kmem_free(vmx->msr_bitmap[i], PAGESIZE); 161 vmx->msr_bitmap[i] = NULL; 162 } 163 } 164 165 void 166 vmx_msr_bitmap_change_access(struct vmx *vmx, int vcpuid, uint_t msr, int acc) 167 { 168 uint8_t *bitmap = vmx->msr_bitmap[vcpuid]; 169 int byte, bit; 170 171 if (msr <= 0x00001FFF) { 172 byte = msr / 8; 173 } else if (msr >= 0xC0000000 && msr <= 0xC0001FFF) { 174 byte = 1024 + (msr - 0xC0000000) / 8; 175 } else { 176 panic("Invalid MSR for bitmap: %x", msr); 177 } 178 179 bit = msr & 0x7; 180 181 if (acc & MSR_BITMAP_ACCESS_READ) { 182 bitmap[byte] &= ~(1 << bit); 183 } else { 184 bitmap[byte] |= 1 << bit; 185 } 186 187 byte += 2048; 188 if (acc & MSR_BITMAP_ACCESS_WRITE) { 189 bitmap[byte] &= ~(1 << bit); 190 } else { 191 bitmap[byte] |= 1 << bit; 192 } 193 } 194 195 static uint64_t misc_enable; 196 static uint64_t platform_info; 197 static uint64_t turbo_ratio_limit; 198 199 static bool 200 nehalem_cpu(void) 201 { 202 uint_t family, model; 203 204 /* 205 * The family:model numbers belonging to the Nehalem microarchitecture 206 * are documented in Section 35.5, Intel SDM dated Feb 2014. 207 */ 208 family = CPUID_TO_FAMILY(cpu_id); 209 model = CPUID_TO_MODEL(cpu_id); 210 if (family == 0x6) { 211 switch (model) { 212 case 0x1A: 213 case 0x1E: 214 case 0x1F: 215 case 0x2E: 216 return (true); 217 default: 218 break; 219 } 220 } 221 return (false); 222 } 223 224 static bool 225 westmere_cpu(void) 226 { 227 uint_t family, model; 228 229 /* 230 * The family:model numbers belonging to the Westmere microarchitecture 231 * are documented in Section 35.6, Intel SDM dated Feb 2014. 232 */ 233 family = CPUID_TO_FAMILY(cpu_id); 234 model = CPUID_TO_MODEL(cpu_id); 235 if (family == 0x6) { 236 switch (model) { 237 case 0x25: 238 case 0x2C: 239 return (true); 240 default: 241 break; 242 } 243 } 244 return (false); 245 } 246 247 static bool 248 pat_valid(uint64_t val) 249 { 250 int i, pa; 251 252 /* 253 * From Intel SDM: Table "Memory Types That Can Be Encoded With PAT" 254 * 255 * Extract PA0 through PA7 and validate that each one encodes a 256 * valid memory type. 257 */ 258 for (i = 0; i < 8; i++) { 259 pa = (val >> (i * 8)) & 0xff; 260 if (pa == 2 || pa == 3 || pa >= 8) 261 return (false); 262 } 263 return (true); 264 } 265 266 void 267 vmx_msr_init(void) 268 { 269 uint64_t bus_freq, ratio; 270 int i; 271 272 /* 273 * Initialize emulated MSRs 274 */ 275 misc_enable = rdmsr(MSR_IA32_MISC_ENABLE); 276 /* 277 * Set mandatory bits 278 * 11: branch trace disabled 279 * 12: PEBS unavailable 280 * Clear unsupported features 281 * 16: SpeedStep enable 282 * 18: enable MONITOR FSM 283 */ 284 misc_enable |= (1 << 12) | (1 << 11); 285 misc_enable &= ~((1 << 18) | (1 << 16)); 286 287 if (nehalem_cpu() || westmere_cpu()) 288 bus_freq = 133330000; /* 133Mhz */ 289 else 290 bus_freq = 100000000; /* 100Mhz */ 291 292 /* 293 * XXXtime 294 * The ratio should really be based on the virtual TSC frequency as 295 * opposed to the host TSC. 296 */ 297 ratio = (tsc_freq / bus_freq) & 0xff; 298 299 /* 300 * The register definition is based on the micro-architecture 301 * but the following bits are always the same: 302 * [15:8] Maximum Non-Turbo Ratio 303 * [28] Programmable Ratio Limit for Turbo Mode 304 * [29] Programmable TDC-TDP Limit for Turbo Mode 305 * [47:40] Maximum Efficiency Ratio 306 * 307 * The other bits can be safely set to 0 on all 308 * micro-architectures up to Haswell. 309 */ 310 platform_info = (ratio << 8) | (ratio << 40); 311 312 /* 313 * The number of valid bits in the MSR_TURBO_RATIO_LIMITx register is 314 * dependent on the maximum cores per package supported by the micro- 315 * architecture. For e.g., Westmere supports 6 cores per package and 316 * uses the low 48 bits. Sandybridge support 8 cores per package and 317 * uses up all 64 bits. 318 * 319 * However, the unused bits are reserved so we pretend that all bits 320 * in this MSR are valid. 321 */ 322 for (i = 0; i < 8; i++) 323 turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio; 324 } 325 326 void 327 vmx_msr_guest_init(struct vmx *vmx, int vcpuid) 328 { 329 uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; 330 331 /* 332 * It is safe to allow direct access to MSR_GSBASE and 333 * MSR_FSBASE. The guest FSBASE and GSBASE are saved and 334 * restored during vm-exit and vm-entry respectively. The host 335 * FSBASE and GSBASE are always restored from the vmcs host 336 * state area on vm-exit. 337 * 338 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in 339 * how they are saved/restored so can be directly accessed by 340 * the guest. 341 * 342 * MSR_EFER is saved and restored in the guest VMCS area on a VM 343 * exit and entry respectively. It is also restored from the 344 * host VMCS area on a VM exit. 345 * 346 * The TSC MSR is exposed read-only. Writes are disallowed as 347 * that will impact the host TSC. If the guest does a write the 348 * "use TSC offsetting" execution control is enabled and the 349 * difference between the host TSC and the guest TSC is written 350 * into the TSC offset in the VMCS. 351 */ 352 guest_msr_rw(vmx, vcpuid, MSR_GSBASE); 353 guest_msr_rw(vmx, vcpuid, MSR_FSBASE); 354 guest_msr_rw(vmx, vcpuid, MSR_SYSENTER_CS_MSR); 355 guest_msr_rw(vmx, vcpuid, MSR_SYSENTER_ESP_MSR); 356 guest_msr_rw(vmx, vcpuid, MSR_SYSENTER_EIP_MSR); 357 guest_msr_rw(vmx, vcpuid, MSR_EFER); 358 guest_msr_ro(vmx, vcpuid, MSR_TSC); 359 360 /* 361 * The guest may have direct access to these MSRs as they are 362 * saved/restored in vmx_msr_guest_enter() and vmx_msr_guest_exit(). 363 */ 364 guest_msr_rw(vmx, vcpuid, MSR_LSTAR); 365 guest_msr_rw(vmx, vcpuid, MSR_CSTAR); 366 guest_msr_rw(vmx, vcpuid, MSR_STAR); 367 guest_msr_rw(vmx, vcpuid, MSR_SF_MASK); 368 guest_msr_rw(vmx, vcpuid, MSR_KGSBASE); 369 370 /* 371 * Initialize guest IA32_PAT MSR with default value after reset. 372 */ 373 guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) | 374 PAT_VALUE(1, PAT_WRITE_THROUGH) | 375 PAT_VALUE(2, PAT_UNCACHED) | 376 PAT_VALUE(3, PAT_UNCACHEABLE) | 377 PAT_VALUE(4, PAT_WRITE_BACK) | 378 PAT_VALUE(5, PAT_WRITE_THROUGH) | 379 PAT_VALUE(6, PAT_UNCACHED) | 380 PAT_VALUE(7, PAT_UNCACHEABLE); 381 } 382 383 void 384 vmx_msr_guest_enter(struct vmx *vmx, int vcpuid) 385 { 386 uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; 387 uint64_t *host_msrs = vmx->host_msrs[vcpuid]; 388 389 /* Save host MSRs */ 390 host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); 391 host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); 392 host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); 393 host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); 394 395 /* Save host MSRs (in particular, KGSBASE) and restore guest MSRs */ 396 wrmsr(MSR_LSTAR, guest_msrs[IDX_MSR_LSTAR]); 397 wrmsr(MSR_CSTAR, guest_msrs[IDX_MSR_CSTAR]); 398 wrmsr(MSR_STAR, guest_msrs[IDX_MSR_STAR]); 399 wrmsr(MSR_SF_MASK, guest_msrs[IDX_MSR_SF_MASK]); 400 wrmsr(MSR_KGSBASE, guest_msrs[IDX_MSR_KGSBASE]); 401 } 402 403 void 404 vmx_msr_guest_exit(struct vmx *vmx, int vcpuid) 405 { 406 uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; 407 uint64_t *host_msrs = vmx->host_msrs[vcpuid]; 408 409 /* Save guest MSRs */ 410 guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); 411 guest_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); 412 guest_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); 413 guest_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); 414 guest_msrs[IDX_MSR_KGSBASE] = rdmsr(MSR_KGSBASE); 415 416 /* Restore host MSRs */ 417 wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]); 418 wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]); 419 wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]); 420 wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]); 421 422 /* MSR_KGSBASE will be restored on the way back to userspace */ 423 } 424 425 vm_msr_result_t 426 vmx_rdmsr(struct vmx *vmx, int vcpuid, uint32_t num, uint64_t *val) 427 { 428 const uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; 429 430 switch (num) { 431 case MSR_IA32_FEATURE_CONTROL: 432 /* 433 * We currently don't support SGX support in guests, so 434 * always report those features as disabled with the MSR 435 * locked so the guest won't attempt to write to it. 436 */ 437 *val = IA32_FEATURE_CONTROL_LOCK; 438 break; 439 case MSR_IA32_MISC_ENABLE: 440 *val = misc_enable; 441 break; 442 case MSR_PLATFORM_INFO: 443 *val = platform_info; 444 break; 445 case MSR_TURBO_RATIO_LIMIT: 446 case MSR_TURBO_RATIO_LIMIT1: 447 *val = turbo_ratio_limit; 448 break; 449 case MSR_PAT: 450 *val = guest_msrs[IDX_MSR_PAT]; 451 break; 452 default: 453 return (VMR_UNHANLDED); 454 } 455 return (VMR_OK); 456 } 457 458 vm_msr_result_t 459 vmx_wrmsr(struct vmx *vmx, int vcpuid, uint32_t num, uint64_t val) 460 { 461 uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; 462 uint64_t changed; 463 464 switch (num) { 465 case MSR_IA32_MISC_ENABLE: 466 changed = val ^ misc_enable; 467 /* 468 * If the host has disabled the NX feature then the guest 469 * also cannot use it. However, a Linux guest will try to 470 * enable the NX feature by writing to the MISC_ENABLE MSR. 471 * 472 * This can be safely ignored because the memory management 473 * code looks at CPUID.80000001H:EDX.NX to check if the 474 * functionality is actually enabled. 475 */ 476 changed &= ~(1UL << 34); 477 478 /* 479 * Punt to userspace if any other bits are being modified. 480 */ 481 if (changed) { 482 return (VMR_UNHANLDED); 483 } 484 break; 485 case MSR_PAT: 486 if (!pat_valid(val)) { 487 return (VMR_GP); 488 } 489 guest_msrs[IDX_MSR_PAT] = val; 490 break; 491 default: 492 return (VMR_UNHANLDED); 493 } 494 495 return (VMR_OK); 496 } 497