1 /* $OpenBSD: x86_vm.c,v 1.5 2024/10/02 17:05:56 dv Exp $ */
2 /*
3 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18 #include <sys/stat.h>
19 #include <sys/types.h>
20
21 #include <dev/ic/i8253reg.h>
22 #include <dev/isa/isareg.h>
23
24 #include <machine/pte.h>
25 #include <machine/specialreg.h>
26 #include <machine/vmmvar.h>
27
28 #include <errno.h>
29 #include <string.h>
30 #include <unistd.h>
31
32 #include <zlib.h>
33
34 #include "atomicio.h"
35 #include "fw_cfg.h"
36 #include "i8253.h"
37 #include "i8259.h"
38 #include "loadfile.h"
39 #include "mc146818.h"
40 #include "ns8250.h"
41 #include "pci.h"
42 #include "virtio.h"
43
44 typedef uint8_t (*io_fn_t)(struct vm_run_params *);
45
46 #define MAX_PORTS 65536
47
48 io_fn_t ioports_map[MAX_PORTS];
49 extern char *__progname;
50
51 void create_memory_map(struct vm_create_params *);
52 int translate_gva(struct vm_exit*, uint64_t, uint64_t *, int);
53
54 static int loadfile_bios(gzFile, off_t, struct vcpu_reg_state *);
55 static int vcpu_exit_eptviolation(struct vm_run_params *);
56 static void vcpu_exit_inout(struct vm_run_params *);
57
58 extern struct vmd_vm *current_vm;
59 extern int con_fd;
60
61 /*
62 * Represents a standard register set for an OS to be booted
63 * as a flat 64 bit address space.
64 *
65 * NOT set here are:
66 * RIP
67 * RSP
68 * GDTR BASE
69 *
70 * Specific bootloaders should clone this structure and override
71 * those fields as needed.
72 *
73 * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
74 * features of the CPU in use.
75 */
76 static const struct vcpu_reg_state vcpu_init_flat64 = {
77 .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
78 .vrs_gprs[VCPU_REGS_RIP] = 0x0,
79 .vrs_gprs[VCPU_REGS_RSP] = 0x0,
80 .vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG,
81 .vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
82 .vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
83 .vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
84 .vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
85 .vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
86 .vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
87 .vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
88 .vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
89 .vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
90 .vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
91 .vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
92 .vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
93 .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
94 .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
95 .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
96 .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
97 .vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
98 .vrs_drs[VCPU_REGS_DR0] = 0x0,
99 .vrs_drs[VCPU_REGS_DR1] = 0x0,
100 .vrs_drs[VCPU_REGS_DR2] = 0x0,
101 .vrs_drs[VCPU_REGS_DR3] = 0x0,
102 .vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
103 .vrs_drs[VCPU_REGS_DR7] = 0x400,
104 .vrs_msrs[VCPU_REGS_STAR] = 0ULL,
105 .vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
106 .vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
107 .vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
108 .vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
109 .vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
110 .vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87
111 };
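
/*
 * Illustrative sketch (not code from this file): a loader would typically
 * copy vcpu_init_flat64 and then patch in its own entry point, stack and
 * GDT base before starting the vcpu, e.g.:
 *
 *	struct vcpu_reg_state vrs = vcpu_init_flat64;
 *	vrs.vrs_gprs[VCPU_REGS_RIP] = entry_gpa;
 *	vrs.vrs_gprs[VCPU_REGS_RSP] = stack_gpa;
 *	vrs.vrs_gdtr.vsi_base = gdt_gpa;
 *
 * entry_gpa, stack_gpa and gdt_gpa are placeholder names (and vsi_base is
 * assumed to be the segment-info base field); the real setup is performed
 * by loadfile_elf() in loadfile.c.
 */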
112
113 /*
114  * Represents a standard register set for a BIOS to be booted
115 * as a flat 16 bit address space.
116 */
117 static const struct vcpu_reg_state vcpu_init_flat16 = {
118 .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
119 .vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
120 .vrs_gprs[VCPU_REGS_RSP] = 0x0,
121 .vrs_crs[VCPU_REGS_CR0] = 0x60000010,
122 .vrs_crs[VCPU_REGS_CR3] = 0,
123 .vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
124 .vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
125 .vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
126 .vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
127 .vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
128 .vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
129 .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
130 .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
131 .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
132 .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
133 .vrs_msrs[VCPU_REGS_EFER] = 0ULL,
134 .vrs_drs[VCPU_REGS_DR0] = 0x0,
135 .vrs_drs[VCPU_REGS_DR1] = 0x0,
136 .vrs_drs[VCPU_REGS_DR2] = 0x0,
137 .vrs_drs[VCPU_REGS_DR3] = 0x0,
138 .vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
139 .vrs_drs[VCPU_REGS_DR7] = 0x400,
140 .vrs_msrs[VCPU_REGS_STAR] = 0ULL,
141 .vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
142 .vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
143 .vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
144 .vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
145 .vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87
146 };
147
148 /*
149 * create_memory_map
150 *
151 * Sets up the guest physical memory ranges that the VM can access.
152 *
153 * Parameters:
154 * vcp: VM create parameters describing the VM whose memory map
155 * is being created
156 *
157 * Return values:
158 * nothing
159 */
160 void
161 create_memory_map(struct vm_create_params *vcp)
162 {
163 size_t len, mem_bytes;
164 size_t above_1m = 0, above_4g = 0;
165
166 mem_bytes = vcp->vcp_memranges[0].vmr_size;
167 vcp->vcp_nmemranges = 0;
168 if (mem_bytes == 0 || mem_bytes > VMM_MAX_VM_MEM_SIZE)
169 return;
170
171 /* First memory region: 0 - LOWMEM_KB (DOS low mem) */
172 len = LOWMEM_KB * 1024;
173 vcp->vcp_memranges[0].vmr_gpa = 0x0;
174 vcp->vcp_memranges[0].vmr_size = len;
175 vcp->vcp_memranges[0].vmr_type = VM_MEM_RAM;
176 mem_bytes -= len;
177
178 /*
179 * Second memory region: LOWMEM_KB - 1MB.
180 *
181 * N.B. - Normally ROMs or parts of video RAM are mapped here.
182 * We have to add this region, because some systems
183 * unconditionally write to 0xb8000 (VGA RAM), and
184 * we need to make sure that vmm(4) permits accesses
185 * to it. So allocate guest memory for it.
186 */
187 len = MB(1) - (LOWMEM_KB * 1024);
188 vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
189 vcp->vcp_memranges[1].vmr_size = len;
190 vcp->vcp_memranges[1].vmr_type = VM_MEM_RESERVED;
191 mem_bytes -= len;
192
193 /* If we have less than 2MB remaining, still create a 2nd BIOS area. */
194 if (mem_bytes <= MB(2)) {
195 vcp->vcp_memranges[2].vmr_gpa = PCI_MMIO_BAR_END;
196 vcp->vcp_memranges[2].vmr_size = MB(2);
197 vcp->vcp_memranges[2].vmr_type = VM_MEM_RESERVED;
198 vcp->vcp_nmemranges = 3;
199 return;
200 }
201
202 /*
203 	 * Calculate how to split any remaining memory across the 4GB
204 * boundary while making sure we do not place physical memory into
205 * MMIO ranges.
206 */
207 if (mem_bytes > PCI_MMIO_BAR_BASE - MB(1)) {
208 above_1m = PCI_MMIO_BAR_BASE - MB(1);
209 above_4g = mem_bytes - above_1m;
210 } else {
211 above_1m = mem_bytes;
212 above_4g = 0;
213 }
214
215 /* Third memory region: area above 1MB to MMIO region */
216 vcp->vcp_memranges[2].vmr_gpa = MB(1);
217 vcp->vcp_memranges[2].vmr_size = above_1m;
218 vcp->vcp_memranges[2].vmr_type = VM_MEM_RAM;
219
220 /* Fourth region: PCI MMIO range */
221 vcp->vcp_memranges[3].vmr_gpa = PCI_MMIO_BAR_BASE;
222 vcp->vcp_memranges[3].vmr_size = PCI_MMIO_BAR_END -
223 PCI_MMIO_BAR_BASE + 1;
224 vcp->vcp_memranges[3].vmr_type = VM_MEM_MMIO;
225
226 /* Fifth region: 2nd copy of BIOS above MMIO ending at 4GB */
227 vcp->vcp_memranges[4].vmr_gpa = PCI_MMIO_BAR_END + 1;
228 vcp->vcp_memranges[4].vmr_size = MB(2);
229 vcp->vcp_memranges[4].vmr_type = VM_MEM_RESERVED;
230
231 /* Sixth region: any remainder above 4GB */
232 if (above_4g > 0) {
233 vcp->vcp_memranges[5].vmr_gpa = GB(4);
234 vcp->vcp_memranges[5].vmr_size = above_4g;
235 vcp->vcp_memranges[5].vmr_type = VM_MEM_RAM;
236 vcp->vcp_nmemranges = 6;
237 } else
238 vcp->vcp_nmemranges = 5;
239 }
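
/*
 * For reference, the resulting layout when the guest is large enough to
 * spill past the MMIO hole (symbolic values only; the exact addresses
 * depend on LOWMEM_KB and the PCI_MMIO_BAR_* constants):
 *
 *	[0] 0x0                .. LOWMEM_KB*1024     RAM      (DOS low mem)
 *	[1] LOWMEM_KB*1024     .. 1 MB               reserved (ROM/VGA hole)
 *	[2] 1 MB               .. PCI_MMIO_BAR_BASE  RAM
 *	[3] PCI_MMIO_BAR_BASE  .. PCI_MMIO_BAR_END   MMIO     (PCI BARs)
 *	[4] PCI_MMIO_BAR_END+1 .. +2 MB              reserved (BIOS copy)
 *	[5] 4 GB               .. 4 GB + remainder   RAM      (if any)
 */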
240
241 int
242 load_firmware(struct vmd_vm *vm, struct vcpu_reg_state *vrs)
243 {
244 int ret;
245 gzFile fp;
246 struct stat sb;
247
248 /*
249 * Set up default "flat 64 bit" register state - RIP, RSP, and
250 * GDT info will be set in bootloader
251 */
252 memcpy(vrs, &vcpu_init_flat64, sizeof(*vrs));
253
254 /* Find and open kernel image */
255 if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL)
256 fatalx("failed to open kernel - exiting");
257
258 /* Load kernel image */
259 ret = loadfile_elf(fp, vm, vrs, vm->vm_params.vmc_bootdevice);
260
261 /*
262 * Try BIOS as a fallback (only if it was provided as an image
263 * with vm->vm_kernel and the file is not compressed)
264 */
265 if (ret && errno == ENOEXEC && vm->vm_kernel != -1 &&
266 gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0)
267 ret = loadfile_bios(fp, sb.st_size, vrs);
268
269 gzclose(fp);
270
271 return (ret);
272 }
273
274
275 /*
276 * loadfile_bios
277 *
278  * As an alternative to loadfile_elf, this function loads a non-ELF BIOS image
279 * directly into memory.
280 *
281 * Parameters:
282 * fp: file of a kernel file to load
283 * size: uncompressed size of the image
284 * (out) vrs: register state to set on init for this kernel
285 *
286 * Return values:
287 * 0 if successful
288 * various error codes returned from read(2) or loadelf functions
289 */
290 int
291 loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs)
292 {
293 off_t off;
294
295 /* Set up a "flat 16 bit" register state for BIOS */
296 memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));
297
298 /* Seek to the beginning of the BIOS image */
299 if (gzseek(fp, 0, SEEK_SET) == -1)
300 return (-1);
301
302 /* The BIOS image must end at 1MB */
303 if ((off = MB(1) - size) < 0)
304 return (-1);
305
306 /* Read BIOS image into memory */
307 if (mread(fp, off, size) != (size_t)size) {
308 errno = EIO;
309 return (-1);
310 }
311
312 if (gzseek(fp, 0, SEEK_SET) == -1)
313 return (-1);
314
315 /* Read a second BIOS copy into memory ending at 4GB */
316 off = GB(4) - size;
317 if (mread(fp, off, size) != (size_t)size) {
318 errno = EIO;
319 return (-1);
320 }
321
322 log_debug("%s: loaded BIOS image", __func__);
323
324 return (0);
325 }
326
327 /*
328 * init_emulated_hw
329 *
330 * Initializes the userspace hardware emulation
331 */
332 void
333 init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
334 int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
335 {
336 struct vm_create_params *vcp = &vmc->vmc_params;
337 size_t i;
338 uint64_t memlo, memhi;
339
340 /* Calculate memory size for NVRAM registers */
341 memlo = memhi = 0;
342 for (i = 0; i < vcp->vcp_nmemranges; i++) {
343 if (vcp->vcp_memranges[i].vmr_gpa == MB(1) &&
344 vcp->vcp_memranges[i].vmr_size > (15 * MB(1)))
345 memlo = vcp->vcp_memranges[i].vmr_size - (15 * MB(1));
346 else if (vcp->vcp_memranges[i].vmr_gpa == GB(4))
347 memhi = vcp->vcp_memranges[i].vmr_size;
348 }
349
350 /* Reset the IO port map */
351 memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
352
353 /* Init i8253 PIT */
354 i8253_init(vcp->vcp_id);
355 ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
356 ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
357 ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
358 ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
359 ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc;
360
361 /* Init mc146818 RTC */
362 mc146818_init(vcp->vcp_id, memlo, memhi);
363 ioports_map[IO_RTC] = vcpu_exit_mc146818;
364 ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
365
366 /* Init master and slave PICs */
367 i8259_init();
368 ioports_map[IO_ICU1] = vcpu_exit_i8259;
369 ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
370 ioports_map[IO_ICU2] = vcpu_exit_i8259;
371 ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
372 ioports_map[ELCR0] = vcpu_exit_elcr;
373 ioports_map[ELCR1] = vcpu_exit_elcr;
374
375 /* Init ns8250 UART */
376 ns8250_init(con_fd, vcp->vcp_id);
377 for (i = COM1_DATA; i <= COM1_SCR; i++)
378 ioports_map[i] = vcpu_exit_com;
379
380 /* Initialize PCI */
381 for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++)
382 ioports_map[i] = vcpu_exit_pci;
383
384 ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
385 ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
386 ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
387 ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
388 ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
389 pci_init();
390
391 /* Initialize virtio devices */
392 virtio_init(current_vm, child_cdrom, child_disks, child_taps);
393
394 /*
395 * Init QEMU fw_cfg interface. Must be done last for pci hardware
396 * detection.
397 */
398 fw_cfg_init(vmc);
399 ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
400 ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
401 ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
402 ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
403 }
404
405 /*
406 * restore_emulated_hw
407 *
408 * Restores the userspace hardware emulation from fd
409 */
410 void
411 restore_emulated_hw(struct vm_create_params *vcp, int fd,
412 int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
413 {
414 /* struct vm_create_params *vcp = &vmc->vmc_params; */
415 int i;
416 memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
417
418 /* Init i8253 PIT */
419 i8253_restore(fd, vcp->vcp_id);
420 ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
421 ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
422 ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
423 ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
424
425 /* Init master and slave PICs */
426 i8259_restore(fd);
427 ioports_map[IO_ICU1] = vcpu_exit_i8259;
428 ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
429 ioports_map[IO_ICU2] = vcpu_exit_i8259;
430 ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
431
432 /* Init ns8250 UART */
433 ns8250_restore(fd, con_fd, vcp->vcp_id);
434 for (i = COM1_DATA; i <= COM1_SCR; i++)
435 ioports_map[i] = vcpu_exit_com;
436
437 /* Init mc146818 RTC */
438 mc146818_restore(fd, vcp->vcp_id);
439 ioports_map[IO_RTC] = vcpu_exit_mc146818;
440 ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
441
442 /* Init QEMU fw_cfg interface */
443 fw_cfg_restore(fd);
444 ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
445 ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
446 ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
447 ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
448
449 /* Initialize PCI */
450 for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++)
451 ioports_map[i] = vcpu_exit_pci;
452
453 ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
454 ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
455 ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
456 ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
457 ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
458 pci_restore(fd);
459 virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
460 }
461
462 void
463 pause_vm_md(struct vmd_vm *vm)
464 {
465 i8253_stop();
466 mc146818_stop();
467 ns8250_stop();
468 virtio_stop(vm);
469 }
470
471 void
472 unpause_vm_md(struct vmd_vm *vm)
473 {
474 i8253_start();
475 mc146818_start();
476 ns8250_start();
477 virtio_start(vm);
478 }
479
480 int
481 dump_devs(int fd)
482 {
483 int ret = 0;
484
485 if ((ret = i8253_dump(fd)))
486 return ret;
487 if ((ret = i8259_dump(fd)))
488 return ret;
489 if ((ret = ns8250_dump(fd)))
490 return ret;
491 if ((ret = mc146818_dump(fd)))
492 return ret;
493 ret = fw_cfg_dump(fd);
494
495 return ret;
496 }
497
498 int
499 dump_send_header(int fd) {
500 struct vm_dump_header vmh;
501 int i;
502
503 memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE,
504 sizeof(vmh.vmh_signature));
505
506 vmh.vmh_cpuids[0].code = 0x00;
507 vmh.vmh_cpuids[0].leaf = 0x00;
508
509 vmh.vmh_cpuids[1].code = 0x01;
510 vmh.vmh_cpuids[1].leaf = 0x00;
511
512 vmh.vmh_cpuids[2].code = 0x07;
513 vmh.vmh_cpuids[2].leaf = 0x00;
514
515 vmh.vmh_cpuids[3].code = 0x0d;
516 vmh.vmh_cpuids[3].leaf = 0x00;
517
518 vmh.vmh_cpuids[4].code = 0x80000001;
519 vmh.vmh_cpuids[4].leaf = 0x00;
520
521 vmh.vmh_version = VM_DUMP_VERSION;
522
523 for (i=0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
524 CPUID_LEAF(vmh.vmh_cpuids[i].code,
525 vmh.vmh_cpuids[i].leaf,
526 vmh.vmh_cpuids[i].a,
527 vmh.vmh_cpuids[i].b,
528 vmh.vmh_cpuids[i].c,
529 vmh.vmh_cpuids[i].d);
530 }
531
532 if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
533 return (-1);
534
535 return (0);
536 }
537
538
539 /*
540 * vcpu_exit_inout
541 *
542 * Handle all I/O exits that need to be emulated in vmd. This includes the
543 * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
544 *
545 * Parameters:
546 * vrp: vcpu run parameters containing guest state for this exit
547 */
548 void
549 vcpu_exit_inout(struct vm_run_params *vrp)
550 {
551 struct vm_exit *vei = vrp->vrp_exit;
552 uint8_t intr = 0xFF;
553
554 if (vei->vei.vei_rep || vei->vei.vei_string) {
555 #ifdef MMIO_DEBUG
556 log_info("%s: %s%s%s %d-byte, enc=%d, data=0x%08x, port=0x%04x",
557 __func__,
558 vei->vei.vei_rep == 0 ? "" : "REP ",
559 vei->vei.vei_dir == VEI_DIR_IN ? "IN" : "OUT",
560 vei->vei.vei_string == 0 ? "" : "S",
561 vei->vei.vei_size, vei->vei.vei_encoding,
562 vei->vei.vei_data, vei->vei.vei_port);
563 log_info("%s: ECX = 0x%llx, RDX = 0x%llx, RSI = 0x%llx",
564 __func__,
565 vei->vrs.vrs_gprs[VCPU_REGS_RCX],
566 vei->vrs.vrs_gprs[VCPU_REGS_RDX],
567 vei->vrs.vrs_gprs[VCPU_REGS_RSI]);
568 #endif /* MMIO_DEBUG */
569 fatalx("%s: can't emulate REP prefixed IN(S)/OUT(S)",
570 __func__);
571 }
572
573 if (ioports_map[vei->vei.vei_port] != NULL)
574 intr = ioports_map[vei->vei.vei_port](vrp);
575 else if (vei->vei.vei_dir == VEI_DIR_IN)
576 set_return_data(vei, 0xFFFFFFFF);
577
578 vei->vrs.vrs_gprs[VCPU_REGS_RIP] += vei->vei.vei_insn_len;
579
580 if (intr != 0xFF)
581 vcpu_assert_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
582 }
583
584 /*
585 * vcpu_exit
586 *
587 * Handle a vcpu exit. This function is called when it is determined that
588 * vmm(4) requires the assistance of vmd to support a particular guest
589 * exit type (eg, accessing an I/O port or device). Guest state is contained
590 * in 'vrp', and will be resent to vmm(4) on exit completion.
591 *
592 * Upon conclusion of handling the exit, the function determines if any
593 * interrupts should be injected into the guest, and asserts the proper
594 * IRQ line whose interrupt should be vectored.
595 *
596 * Parameters:
597 * vrp: vcpu run parameters containing guest state for this exit
598 *
599 * Return values:
600 * 0: the exit was handled successfully
601 * 1: an error occurred (eg, unknown exit reason passed in 'vrp')
602 */
603 int
604 vcpu_exit(struct vm_run_params *vrp)
605 {
606 int ret;
607
608 switch (vrp->vrp_exit_reason) {
609 case VMX_EXIT_INT_WINDOW:
610 case SVM_VMEXIT_VINTR:
611 case VMX_EXIT_CPUID:
612 case VMX_EXIT_EXTINT:
613 case SVM_VMEXIT_INTR:
614 case SVM_VMEXIT_MSR:
615 case SVM_VMEXIT_CPUID:
616 /*
617 * We may be exiting to vmd to handle a pending interrupt but
618 * at the same time the last exit type may have been one of
619 * these. In this case, there's nothing extra to be done
620 * here (and falling through to the default case below results
621 * in more vmd log spam).
622 */
623 break;
624 case SVM_VMEXIT_NPF:
625 case VMX_EXIT_EPT_VIOLATION:
626 ret = vcpu_exit_eptviolation(vrp);
627 if (ret)
628 return (ret);
629 break;
630 case VMX_EXIT_IO:
631 case SVM_VMEXIT_IOIO:
632 vcpu_exit_inout(vrp);
633 break;
634 case VMX_EXIT_HLT:
635 case SVM_VMEXIT_HLT:
636 vcpu_halt(vrp->vrp_vcpu_id);
637 break;
638 case VMX_EXIT_TRIPLE_FAULT:
639 case SVM_VMEXIT_SHUTDOWN:
640 /* reset VM */
641 return (EAGAIN);
642 default:
643 log_debug("%s: unknown exit reason 0x%x",
644 __progname, vrp->vrp_exit_reason);
645 }
646
647 return (0);
648 }
649
650 /*
651 * vcpu_exit_eptviolation
652 *
653 * handle an EPT Violation
654 *
655 * Parameters:
656 * vrp: vcpu run parameters containing guest state for this exit
657 *
658 * Return values:
659 * 0: no action required
660  *  EFAULT: a protection fault occurred, kill the vm.
661 */
662 static int
663 vcpu_exit_eptviolation(struct vm_run_params *vrp)
664 {
665 struct vm_exit *ve = vrp->vrp_exit;
666 int ret = 0;
667 #if MMIO_NOTYET
668 struct x86_insn insn;
669 uint64_t va, pa;
670 size_t len = 15; /* Max instruction length in x86. */
671 #endif /* MMIO_NOTYET */
672 switch (ve->vee.vee_fault_type) {
673 case VEE_FAULT_HANDLED:
674 break;
675
676 #if MMIO_NOTYET
677 case VEE_FAULT_MMIO_ASSIST:
678 /* Intel VMX might give us the length of the instruction. */
679 if (ve->vee.vee_insn_info & VEE_LEN_VALID)
680 len = ve->vee.vee_insn_len;
681
682 if (len > 15)
683 fatalx("%s: invalid instruction length %lu", __func__,
684 len);
685
686 /* If we weren't given instruction bytes, we need to fetch. */
687 if (!(ve->vee.vee_insn_info & VEE_BYTES_VALID)) {
688 memset(ve->vee.vee_insn_bytes, 0,
689 sizeof(ve->vee.vee_insn_bytes));
690 va = ve->vrs.vrs_gprs[VCPU_REGS_RIP];
691
692 /* XXX Only support instructions that fit on 1 page. */
693 if ((va & PAGE_MASK) + len > PAGE_SIZE) {
694 log_warnx("%s: instruction might cross page "
695 "boundary", __func__);
696 ret = EINVAL;
697 break;
698 }
699
700 ret = translate_gva(ve, va, &pa, PROT_EXEC);
701 if (ret != 0) {
702 log_warnx("%s: failed gva translation",
703 __func__);
704 break;
705 }
706
707 ret = read_mem(pa, ve->vee.vee_insn_bytes, len);
708 if (ret != 0) {
709 log_warnx("%s: failed to fetch instruction "
710 "bytes from 0x%llx", __func__, pa);
711 break;
712 }
713 }
714
715 ret = insn_decode(ve, &insn);
716 if (ret == 0)
717 ret = insn_emulate(ve, &insn);
718 break;
719 #endif /* MMIO_NOTYET */
720
721 case VEE_FAULT_PROTECT:
722 log_debug("%s: EPT Violation: rip=0x%llx", __progname,
723 ve->vrs.vrs_gprs[VCPU_REGS_RIP]);
724 ret = EFAULT;
725 break;
726
727 default:
728 fatalx("%s: invalid fault_type %d", __progname,
729 ve->vee.vee_fault_type);
730 /* UNREACHED */
731 }
732
733 return (ret);
734 }
735
736 /*
737 * vcpu_exit_pci
738 *
739 * Handle all I/O to the emulated PCI subsystem.
740 *
741 * Parameters:
742 * vrp: vcpu run parameters containing guest state for this exit
743 *
744 * Return value:
745 * Interrupt to inject to the guest VM, or 0xFF if no interrupt should
746 * be injected.
747 */
748 uint8_t
749 vcpu_exit_pci(struct vm_run_params *vrp)
750 {
751 struct vm_exit *vei = vrp->vrp_exit;
752 uint8_t intr;
753
754 intr = 0xFF;
755
756 switch (vei->vei.vei_port) {
757 case PCI_MODE1_ADDRESS_REG:
758 pci_handle_address_reg(vrp);
759 break;
760 case PCI_MODE1_DATA_REG:
761 case PCI_MODE1_DATA_REG + 1:
762 case PCI_MODE1_DATA_REG + 2:
763 case PCI_MODE1_DATA_REG + 3:
764 pci_handle_data_reg(vrp);
765 break;
766 case VM_PCI_IO_BAR_BASE ... VM_PCI_IO_BAR_END:
767 intr = pci_handle_io(vrp);
768 break;
769 default:
770 log_warnx("%s: unknown PCI register 0x%llx",
771 __progname, (uint64_t)vei->vei.vei_port);
772 break;
773 }
774
775 return (intr);
776 }
777
778 /*
779 * find_gpa_range
780 *
781 * Search for a contiguous guest physical mem range.
782 *
783 * Parameters:
784 * vcp: VM create parameters that contain the memory map to search in
785 * gpa: the starting guest physical address
786 * len: the length of the memory range
787 *
788 * Return values:
789 * NULL: on failure if there is no memory range as described by the parameters
790 * Pointer to vm_mem_range that contains the start of the range otherwise.
791 */
792 struct vm_mem_range *
793 find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
794 {
795 size_t i, n;
796 struct vm_mem_range *vmr;
797
798 /* Find the first vm_mem_range that contains gpa */
799 for (i = 0; i < vcp->vcp_nmemranges; i++) {
800 vmr = &vcp->vcp_memranges[i];
801 if (gpa < vmr->vmr_gpa + vmr->vmr_size)
802 break;
803 }
804
805 /* No range found. */
806 if (i == vcp->vcp_nmemranges)
807 return (NULL);
808
809 /*
810 * vmr may cover the range [gpa, gpa + len) only partly. Make
811 * sure that the following vm_mem_ranges are contiguous and
812 * cover the rest.
813 */
814 n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
815 if (len < n)
816 len = 0;
817 else
818 len -= n;
819 gpa = vmr->vmr_gpa + vmr->vmr_size;
820 for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
821 vmr = &vcp->vcp_memranges[i];
822 if (gpa != vmr->vmr_gpa)
823 return (NULL);
824 if (len <= vmr->vmr_size)
825 len = 0;
826 else
827 len -= vmr->vmr_size;
828
829 gpa = vmr->vmr_gpa + vmr->vmr_size;
830 }
831
832 if (len != 0)
833 return (NULL);
834
835 return (vmr);
836 }
837 /*
838 * write_mem
839 *
840 * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
841 *
842 * Parameters:
843 * dst: the destination paddr_t in the guest VM
844 * buf: data to copy (or NULL to zero the data)
845 * len: number of bytes to copy
846 *
847 * Return values:
848 * 0: success
849 * EINVAL: if the guest physical memory range [dst, dst + len) does not
850 * exist in the guest.
851 */
852 int
853 write_mem(paddr_t dst, const void *buf, size_t len)
854 {
855 const char *from = buf;
856 char *to;
857 size_t n, off;
858 struct vm_mem_range *vmr;
859
860 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
861 if (vmr == NULL) {
862 errno = EINVAL;
863 log_warn("%s: failed - invalid memory range dst = 0x%lx, "
864 "len = 0x%zx", __func__, dst, len);
865 return (EINVAL);
866 }
867
868 off = dst - vmr->vmr_gpa;
869 while (len != 0) {
870 n = vmr->vmr_size - off;
871 if (len < n)
872 n = len;
873
874 to = (char *)vmr->vmr_va + off;
875 if (buf == NULL)
876 memset(to, 0, n);
877 else {
878 memcpy(to, from, n);
879 from += n;
880 }
881 len -= n;
882 off = 0;
883 vmr++;
884 }
885
886 return (0);
887 }
888
889 /*
890 * read_mem
891 *
892 * Reads memory at guest paddr 'src' into 'buf'.
893 *
894 * Parameters:
895 * src: the source paddr_t in the guest VM to read from.
896 * buf: destination (local) buffer
897 * len: number of bytes to read
898 *
899 * Return values:
900 * 0: success
901 * EINVAL: if the guest physical memory range [dst, dst + len) does not
902 * exist in the guest.
903 */
904 int
905 read_mem(paddr_t src, void *buf, size_t len)
906 {
907 char *from, *to = buf;
908 size_t n, off;
909 struct vm_mem_range *vmr;
910
911 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
912 if (vmr == NULL) {
913 errno = EINVAL;
914 log_warn("%s: failed - invalid memory range src = 0x%lx, "
915 "len = 0x%zx", __func__, src, len);
916 return (EINVAL);
917 }
918
919 off = src - vmr->vmr_gpa;
920 while (len != 0) {
921 n = vmr->vmr_size - off;
922 if (len < n)
923 n = len;
924
925 from = (char *)vmr->vmr_va + off;
926 memcpy(to, from, n);
927
928 to += n;
929 len -= n;
930 off = 0;
931 vmr++;
932 }
933
934 return (0);
935 }
936
937 /*
938 * hvaddr_mem
939 *
940 * Translate a guest physical address to a host virtual address, checking the
941 * provided memory range length to confirm it's contiguous within the same
942 * guest memory range (vm_mem_range).
943 *
944 * Parameters:
945 * gpa: guest physical address to translate
946 * len: number of bytes in the intended range
947 *
948 * Return values:
949 * void* to host virtual memory on success
950 * NULL on error, setting errno to:
951 * EFAULT: gpa falls outside guest memory ranges
952 * EINVAL: requested len extends beyond memory range
953 */
954 void *
955 hvaddr_mem(paddr_t gpa, size_t len)
956 {
957 struct vm_mem_range *vmr;
958 size_t off;
959
960 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, gpa, len);
961 if (vmr == NULL) {
962 log_warnx("%s: failed - invalid gpa: 0x%lx\n", __func__, gpa);
963 errno = EFAULT;
964 return (NULL);
965 }
966
967 off = gpa - vmr->vmr_gpa;
968 if (len > (vmr->vmr_size - off)) {
969 log_warnx("%s: failed - invalid memory range: gpa=0x%lx, "
970 "len=%zu", __func__, gpa, len);
971 errno = EINVAL;
972 return (NULL);
973 }
974
975 return ((char *)vmr->vmr_va + off);
976 }
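
/*
 * Usage sketch (illustrative only, not part of vmd): device emulation code
 * can copy through read_mem()/write_mem(), zero a range by passing a NULL
 * buffer to write_mem(), or map a host pointer with hvaddr_mem() for
 * in-place access within a single range.  desc_gpa and desc_len are
 * placeholders for values taken from a guest-provided descriptor.
 */
#if 0
	uint8_t *p;
	uint32_t hdr;

	if (read_mem(desc_gpa, &hdr, sizeof(hdr)))		/* copy out */
		return (-1);
	if (write_mem(desc_gpa, NULL, desc_len))		/* zero range */
		return (-1);
	if ((p = hvaddr_mem(desc_gpa, desc_len)) == NULL)	/* map */
		return (-1);					/* errno is set */
#endif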
977
978 /*
979 * vcpu_assert_irq
980 *
981 * Injects the specified IRQ on the supplied vcpu/vm
982 *
983 * Parameters:
984 * vm_id: VM ID to inject to
985 * vcpu_id: VCPU ID to inject to
986 * irq: IRQ to inject
987 */
988 void
989 vcpu_assert_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
990 {
991 i8259_assert_irq(irq);
992
993 if (i8259_is_pending()) {
994 if (vcpu_intr(vm_id, vcpu_id, 1))
995 fatalx("%s: can't assert INTR", __func__);
996
997 vcpu_unhalt(vcpu_id);
998 vcpu_signal_run(vcpu_id);
999 }
1000 }
1001
1002 /*
1003  * vcpu_deassert_irq
1004 *
1005 * Clears the specified IRQ on the supplied vcpu/vm
1006 *
1007 * Parameters:
1008 * vm_id: VM ID to clear in
1009 * vcpu_id: VCPU ID to clear in
1010 * irq: IRQ to clear
1011 */
1012 void
1013 vcpu_deassert_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
1014 {
1015 i8259_deassert_irq(irq);
1016
1017 if (!i8259_is_pending()) {
1018 if (vcpu_intr(vm_id, vcpu_id, 0))
1019 fatalx("%s: can't deassert INTR for vm_id %d, "
1020 "vcpu_id %d", __func__, vm_id, vcpu_id);
1021 }
1022 }
1023 /*
1024 * set_return_data
1025 *
1026 * Utility function for manipulating register data in vm exit info structs. This
1027 * function ensures that the data is copied to the vei->vei.vei_data field with
1028 * the proper size for the operation being performed.
1029 *
1030 * Parameters:
1031 * vei: exit information
1032 * data: return data
1033 */
1034 void
1035 set_return_data(struct vm_exit *vei, uint32_t data)
1036 {
1037 switch (vei->vei.vei_size) {
1038 case 1:
1039 vei->vei.vei_data &= ~0xFF;
1040 vei->vei.vei_data |= (uint8_t)data;
1041 break;
1042 case 2:
1043 vei->vei.vei_data &= ~0xFFFF;
1044 vei->vei.vei_data |= (uint16_t)data;
1045 break;
1046 case 4:
1047 vei->vei.vei_data = data;
1048 break;
1049 }
1050 }
1051
1052 /*
1053 * get_input_data
1054 *
1055 * Utility function for manipulating register data in vm exit info
1056 * structs. This function ensures that the data is copied from the
1057 * vei->vei.vei_data field with the proper size for the operation being
1058 * performed.
1059 *
1060 * Parameters:
1061 * vei: exit information
1062 * data: location to store the result
1063 */
1064 void
1065 get_input_data(struct vm_exit *vei, uint32_t *data)
1066 {
1067 switch (vei->vei.vei_size) {
1068 case 1:
1069 *data &= 0xFFFFFF00;
1070 *data |= (uint8_t)vei->vei.vei_data;
1071 break;
1072 case 2:
1073 *data &= 0xFFFF0000;
1074 *data |= (uint16_t)vei->vei.vei_data;
1075 break;
1076 case 4:
1077 *data = vei->vei.vei_data;
1078 break;
1079 default:
1080 log_warnx("%s: invalid i/o size %d", __func__,
1081 vei->vei.vei_size);
1082 }
1083
1084 }
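
/*
 * Example handler (an illustration, not registered anywhere in vmd): a
 * minimal io_fn_t for a hypothetical scratch register, showing the intended
 * pairing of get_input_data() for guest OUT and set_return_data() for guest
 * IN.  EXAMPLE_PORT, example_reg and vcpu_exit_example are made-up names; a
 * real device hooks itself up in init_emulated_hw() via
 * ioports_map[port] = handler.
 */
#if 0
#define EXAMPLE_PORT	0x510			/* assumed free i/o port */

static uint32_t example_reg;

static uint8_t
vcpu_exit_example(struct vm_run_params *vrp)
{
	struct vm_exit *vei = vrp->vrp_exit;

	if (vei->vei.vei_dir == VEI_DIR_OUT)
		get_input_data(vei, &example_reg);	/* guest wrote */
	else
		set_return_data(vei, example_reg);	/* guest read */

	return (0xFF);				/* no interrupt to assert */
}
#endif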
1085
1086 /*
1087 * translate_gva
1088 *
1089 * Translates a guest virtual address to a guest physical address by walking
1090 * the currently active page table (if needed).
1091 *
1092 * XXX ensure translate_gva updates the A bit in the PTE
1093 * XXX ensure translate_gva respects segment base and limits in i386 mode
1094 * XXX ensure translate_gva respects segment wraparound in i8086 mode
1095 * XXX ensure translate_gva updates the A bit in the segment selector
1096 * XXX ensure translate_gva respects CR4.LMSLE if available
1097 *
1098 * Parameters:
1099 * exit: The VCPU this translation should be performed for (guest MMU settings
1100 * are gathered from this VCPU)
1101 * va: virtual address to translate
1102 * pa: pointer to paddr_t variable that will receive the translated physical
1103 * address. 'pa' is unchanged on error.
1104 * mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
1105 * the address should be translated
1106 *
1107 * Return values:
1108 * 0: the address was successfully translated - 'pa' contains the physical
1109 * address currently mapped by 'va'.
1110 * EFAULT: the PTE for 'VA' is unmapped. A #PF will be injected in this case
1111 * and %cr2 set in the vcpu structure.
1112 * EINVAL: an error occurred reading paging table structures
1113 */
1114 int
1115 translate_gva(struct vm_exit *exit, uint64_t va, uint64_t *pa, int mode)
1116 {
1117 int level, shift, pdidx;
1118 uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
1119 uint64_t shift_width, pte_size;
1120 struct vcpu_reg_state *vrs;
1121
1122 vrs = &exit->vrs;
1123
1124 if (!pa)
1125 return (EINVAL);
1126
1127 if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
1128 log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
1129 *pa = va;
1130 return (0);
1131 }
1132
1133 pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];
1134
1135 log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
1136 vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);
1137
1138 if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
1139 if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
1140 pte_size = sizeof(uint64_t);
1141 shift_width = 9;
1142
1143 if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
1144 /* 4 level paging */
1145 level = 4;
1146 mask = L4_MASK;
1147 shift = L4_SHIFT;
1148 } else {
1149 /* 32 bit with PAE paging */
1150 level = 3;
1151 mask = L3_MASK;
1152 shift = L3_SHIFT;
1153 }
1154 } else {
1155 /* 32 bit paging */
1156 level = 2;
1157 shift_width = 10;
1158 mask = 0xFFC00000;
1159 shift = 22;
1160 pte_size = sizeof(uint32_t);
1161 }
1162 } else
1163 return (EINVAL);
1164
1165 /* XXX: Check for R bit in segment selector and set A bit */
1166
1167 for (;level > 0; level--) {
1168 pdidx = (va & mask) >> shift;
1169 pte_paddr = (pt_paddr) + (pdidx * pte_size);
1170
1171 log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
1172 level, pte_paddr);
1173 if (read_mem(pte_paddr, &pte, pte_size)) {
1174 log_warn("%s: failed to read pte", __func__);
1175 return (EFAULT);
1176 }
1177
1178 log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
1179 pte);
1180
1181 /* XXX: Set CR2 */
1182 if (!(pte & PG_V))
1183 return (EFAULT);
1184
1185 /* XXX: Check for SMAP */
1186 if ((mode == PROT_WRITE) && !(pte & PG_RW))
1187 return (EPERM);
1188
1189 if ((exit->cpl > 0) && !(pte & PG_u))
1190 return (EPERM);
1191
1192 pte = pte | PG_U;
1193 if (mode == PROT_WRITE)
1194 pte = pte | PG_M;
1195 if (write_mem(pte_paddr, &pte, pte_size)) {
1196 log_warn("%s: failed to write back flags to pte",
1197 __func__);
1198 return (EIO);
1199 }
1200
1201 /* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */
1202 if (pte & PG_PS)
1203 break;
1204
1205 if (level > 1) {
1206 pt_paddr = pte & PG_FRAME;
1207 shift -= shift_width;
1208 mask = mask >> shift_width;
1209 }
1210 }
1211
1212 low_mask = (1 << shift) - 1;
1213 high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
1214 *pa = (pte & high_mask) | (va & low_mask);
1215
1216 log_debug("%s: final GPA for GVA 0x%llx = 0x%llx\n", __func__, va, *pa);
1217
1218 return (0);
1219 }
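
/*
 * Worked example (assuming the amd64 pte.h constants, where L4_SHIFT is 39
 * and each level narrows by shift_width == 9): with 4-level paging the loop
 * takes pdidx from bits 47..39 of va, then 38..30, 29..21 and 20..12.  If
 * the walk ends on a 4 KB leaf, shift is 12, low_mask is 0xFFF and *pa
 * combines the PTE frame with the page offset; a PG_PS entry terminates the
 * walk earlier, so low_mask then spans the 2 MB (or 1 GB) large-page offset
 * instead.
 */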
1220
1221 int
1222 intr_pending(struct vmd_vm *vm)
1223 {
1224 /* XXX select active interrupt controller */
1225 return i8259_is_pending();
1226 }
1227
1228 int
1229 intr_ack(struct vmd_vm *vm)
1230 {
1231 /* XXX select active interrupt controller */
1232 return i8259_ack();
1233 }
1234
1235 void
1236 intr_toggle_el(struct vmd_vm *vm, int irq, int val)
1237 {
1238 /* XXX select active interrupt controller */
1239 pic_set_elcr(irq, val);
1240 }
1241
1242 int
1243 vmd_check_vmh(struct vm_dump_header *vmh)
1244 {
1245 int i;
1246 unsigned int code, leaf;
1247 unsigned int a, b, c, d;
1248
1249 if (strncmp(vmh->vmh_signature, VM_DUMP_SIGNATURE, strlen(VM_DUMP_SIGNATURE)) != 0) {
1250 log_warnx("%s: incompatible dump signature", __func__);
1251 return (-1);
1252 }
1253
1254 if (vmh->vmh_version != VM_DUMP_VERSION) {
1255 log_warnx("%s: incompatible dump version", __func__);
1256 return (-1);
1257 }
1258
1259 for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
1260 code = vmh->vmh_cpuids[i].code;
1261 leaf = vmh->vmh_cpuids[i].leaf;
1262 if (leaf != 0x00) {
1263 log_debug("%s: invalid leaf 0x%x for code 0x%x",
1264 __func__, leaf, code);
1265 return (-1);
1266 }
1267
1268 switch (code) {
1269 case 0x00:
1270 CPUID_LEAF(code, leaf, a, b, c, d);
1271 if (vmh->vmh_cpuids[i].a > a) {
1272 log_debug("%s: incompatible cpuid level",
1273 __func__);
1274 return (-1);
1275 }
1276 if (!(vmh->vmh_cpuids[i].b == b &&
1277 vmh->vmh_cpuids[i].c == c &&
1278 vmh->vmh_cpuids[i].d == d)) {
1279 log_debug("%s: incompatible cpu brand",
1280 __func__);
1281 return (-1);
1282 }
1283 break;
1284
1285 case 0x01:
1286 CPUID_LEAF(code, leaf, a, b, c, d);
1287 if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) !=
1288 (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) {
1289 log_debug("%s: incompatible cpu features "
1290 "code: 0x%x leaf: 0x%x reg: c", __func__,
1291 code, leaf);
1292 return (-1);
1293 }
1294 if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) !=
1295 (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) {
1296 log_debug("%s: incompatible cpu features "
1297 "code: 0x%x leaf: 0x%x reg: d", __func__,
1298 code, leaf);
1299 return (-1);
1300 }
1301 break;
1302
1303 case 0x07:
1304 CPUID_LEAF(code, leaf, a, b, c, d);
1305 if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) !=
1306 (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) {
1307 log_debug("%s: incompatible cpu features "
1308 				    "code: 0x%x leaf: 0x%x reg: b", __func__,
1309 code, leaf);
1310 return (-1);
1311 }
1312 if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) !=
1313 (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) {
1314 log_debug("%s: incompatible cpu features "
1315 				    "code: 0x%x leaf: 0x%x reg: c", __func__,
1316 code, leaf);
1317 return (-1);
1318 }
1319 break;
1320
1321 case 0x0d:
1322 CPUID_LEAF(code, leaf, a, b, c, d);
1323 if (vmh->vmh_cpuids[i].b > b) {
1324 log_debug("%s: incompatible cpu: insufficient "
1325 "max save area for enabled XCR0 features",
1326 __func__);
1327 return (-1);
1328 }
1329 if (vmh->vmh_cpuids[i].c > c) {
1330 log_debug("%s: incompatible cpu: insufficient "
1331 "max save area for supported XCR0 features",
1332 __func__);
1333 return (-1);
1334 }
1335 break;
1336
1337 case 0x80000001:
1338 CPUID_LEAF(code, leaf, a, b, c, d);
1339 if ((vmh->vmh_cpuids[i].a & a) !=
1340 vmh->vmh_cpuids[i].a) {
1341 log_debug("%s: incompatible cpu features "
1342 "code: 0x%x leaf: 0x%x reg: a", __func__,
1343 code, leaf);
1344 return (-1);
1345 }
1346 if ((vmh->vmh_cpuids[i].c & c) !=
1347 vmh->vmh_cpuids[i].c) {
1348 log_debug("%s: incompatible cpu features "
1349 "code: 0x%x leaf: 0x%x reg: c", __func__,
1350 code, leaf);
1351 return (-1);
1352 }
1353 if ((vmh->vmh_cpuids[i].d & d) !=
1354 vmh->vmh_cpuids[i].d) {
1355 log_debug("%s: incompatible cpu features "
1356 "code: 0x%x leaf: 0x%x reg: d", __func__,
1357 code, leaf);
1358 return (-1);
1359 }
1360 break;
1361
1362 default:
1363 log_debug("%s: unknown code 0x%x", __func__, code);
1364 return (-1);
1365 }
1366 }
1367
1368 return (0);
1369 }
1370