1 /*	$OpenBSD: x86_vm.c,v 1.5 2024/10/02 17:05:56 dv Exp $	*/
2 /*
3  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 #include <sys/stat.h>
19 #include <sys/types.h>
20 
21 #include <dev/ic/i8253reg.h>
22 #include <dev/isa/isareg.h>
23 
24 #include <machine/pte.h>
25 #include <machine/specialreg.h>
26 #include <machine/vmmvar.h>
27 
28 #include <errno.h>
29 #include <string.h>
30 #include <unistd.h>
31 
32 #include <zlib.h>
33 
34 #include "atomicio.h"
35 #include "fw_cfg.h"
36 #include "i8253.h"
37 #include "i8259.h"
38 #include "loadfile.h"
39 #include "mc146818.h"
40 #include "ns8250.h"
41 #include "pci.h"
42 #include "virtio.h"
43 
44 typedef uint8_t (*io_fn_t)(struct vm_run_params *);
45 
46 #define MAX_PORTS 65536
47 
48 io_fn_t	ioports_map[MAX_PORTS];
49 extern char *__progname;
50 
51 void	 create_memory_map(struct vm_create_params *);
52 int	 translate_gva(struct vm_exit*, uint64_t, uint64_t *, int);
53 
54 static int	loadfile_bios(gzFile, off_t, struct vcpu_reg_state *);
55 static int	vcpu_exit_eptviolation(struct vm_run_params *);
56 static void	vcpu_exit_inout(struct vm_run_params *);
57 
58 extern struct vmd_vm	*current_vm;
59 extern int		 con_fd;
60 
61 /*
62  * Represents a standard register set for an OS to be booted
63  * as a flat 64 bit address space.
64  *
65  * NOT set here are:
66  *  RIP
67  *  RSP
68  *  GDTR BASE
69  *
70  * Specific bootloaders should clone this structure and override
71  * those fields as needed.
72  *
73  * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
74  *        features of the CPU in use.
75  */
76 static const struct vcpu_reg_state vcpu_init_flat64 = {
77 	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
78 	.vrs_gprs[VCPU_REGS_RIP] = 0x0,
79 	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
80 	.vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG,
81 	.vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
82 	.vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
83 	.vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
84 	.vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
85 	.vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
86 	.vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
87 	.vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
88 	.vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
89 	.vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
90 	.vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
91 	.vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
92 	.vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
93 	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
94 	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
95 	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
96 	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
97 	.vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
98 	.vrs_drs[VCPU_REGS_DR0] = 0x0,
99 	.vrs_drs[VCPU_REGS_DR1] = 0x0,
100 	.vrs_drs[VCPU_REGS_DR2] = 0x0,
101 	.vrs_drs[VCPU_REGS_DR3] = 0x0,
102 	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
103 	.vrs_drs[VCPU_REGS_DR7] = 0x400,
104 	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
105 	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
106 	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
107 	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
108 	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
109 	.vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
110 	.vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87
111 };
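/*
 * Illustrative sketch (not part of the emulation) of how a bootloader is
 * expected to consume the template above: copy it, then fill in the fields
 * deliberately left unset.  Field names follow vmmvar.h; kernel_entry,
 * stack_top and gdt_gpa are hypothetical placeholders.
 *
 *	struct vcpu_reg_state vrs;
 *
 *	memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs));
 *	vrs.vrs_gprs[VCPU_REGS_RIP] = kernel_entry;
 *	vrs.vrs_gprs[VCPU_REGS_RSP] = stack_top;
 *	vrs.vrs_gdtr.vsi_base = gdt_gpa;
 */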
112 
113 /*
114  * Represents a standard register set for a BIOS to be booted
115  * as a flat 16 bit address space.
116  */
117 static const struct vcpu_reg_state vcpu_init_flat16 = {
118 	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
119 	.vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
120 	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
121 	.vrs_crs[VCPU_REGS_CR0] = 0x60000010,
122 	.vrs_crs[VCPU_REGS_CR3] = 0,
123 	.vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
124 	.vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
125 	.vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
126 	.vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
127 	.vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
128 	.vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
129 	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
130 	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
131 	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
132 	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
133 	.vrs_msrs[VCPU_REGS_EFER] = 0ULL,
134 	.vrs_drs[VCPU_REGS_DR0] = 0x0,
135 	.vrs_drs[VCPU_REGS_DR1] = 0x0,
136 	.vrs_drs[VCPU_REGS_DR2] = 0x0,
137 	.vrs_drs[VCPU_REGS_DR3] = 0x0,
138 	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
139 	.vrs_drs[VCPU_REGS_DR7] = 0x400,
140 	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
141 	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
142 	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
143 	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
144 	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
145 	.vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87
146 };
147 
148 /*
149  * create_memory_map
150  *
151  * Sets up the guest physical memory ranges that the VM can access.
152  *
153  * Parameters:
154  *  vcp: VM create parameters describing the VM whose memory map
155  *       is being created
156  *
157  * Return values:
158  *  nothing
159  */
160 void
161 create_memory_map(struct vm_create_params *vcp)
162 {
163 	size_t len, mem_bytes;
164 	size_t above_1m = 0, above_4g = 0;
165 
166 	mem_bytes = vcp->vcp_memranges[0].vmr_size;
167 	vcp->vcp_nmemranges = 0;
168 	if (mem_bytes == 0 || mem_bytes > VMM_MAX_VM_MEM_SIZE)
169 		return;
170 
171 	/* First memory region: 0 - LOWMEM_KB (DOS low mem) */
172 	len = LOWMEM_KB * 1024;
173 	vcp->vcp_memranges[0].vmr_gpa = 0x0;
174 	vcp->vcp_memranges[0].vmr_size = len;
175 	vcp->vcp_memranges[0].vmr_type = VM_MEM_RAM;
176 	mem_bytes -= len;
177 
178 	/*
179 	 * Second memory region: LOWMEM_KB - 1MB.
180 	 *
181 	 * N.B. - Normally ROMs or parts of video RAM are mapped here.
182 	 * We have to add this region, because some systems
183 	 * unconditionally write to 0xb8000 (VGA RAM), and
184 	 * we need to make sure that vmm(4) permits accesses
185 	 * to it. So allocate guest memory for it.
186 	 */
187 	len = MB(1) - (LOWMEM_KB * 1024);
188 	vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
189 	vcp->vcp_memranges[1].vmr_size = len;
190 	vcp->vcp_memranges[1].vmr_type = VM_MEM_RESERVED;
191 	mem_bytes -= len;
192 
193 	/* If we have less than 2MB remaining, still create a 2nd BIOS area. */
194 	if (mem_bytes <= MB(2)) {
195 		vcp->vcp_memranges[2].vmr_gpa = PCI_MMIO_BAR_END;
196 		vcp->vcp_memranges[2].vmr_size = MB(2);
197 		vcp->vcp_memranges[2].vmr_type = VM_MEM_RESERVED;
198 		vcp->vcp_nmemranges = 3;
199 		return;
200 	}
201 
202 	/*
203 	 * Calculate how to split any remaining memory across the 4GB
204 	 * boundary while making sure we do not place physical memory into
205 	 * MMIO ranges.
206 	 */
207 	if (mem_bytes > PCI_MMIO_BAR_BASE - MB(1)) {
208 		above_1m = PCI_MMIO_BAR_BASE - MB(1);
209 		above_4g = mem_bytes - above_1m;
210 	} else {
211 		above_1m = mem_bytes;
212 		above_4g = 0;
213 	}
214 
215 	/* Third memory region: area above 1MB to MMIO region */
216 	vcp->vcp_memranges[2].vmr_gpa = MB(1);
217 	vcp->vcp_memranges[2].vmr_size = above_1m;
218 	vcp->vcp_memranges[2].vmr_type = VM_MEM_RAM;
219 
220 	/* Fourth region: PCI MMIO range */
221 	vcp->vcp_memranges[3].vmr_gpa = PCI_MMIO_BAR_BASE;
222 	vcp->vcp_memranges[3].vmr_size = PCI_MMIO_BAR_END -
223 	    PCI_MMIO_BAR_BASE + 1;
224 	vcp->vcp_memranges[3].vmr_type = VM_MEM_MMIO;
225 
226 	/* Fifth region: 2nd copy of BIOS above MMIO ending at 4GB */
227 	vcp->vcp_memranges[4].vmr_gpa = PCI_MMIO_BAR_END + 1;
228 	vcp->vcp_memranges[4].vmr_size = MB(2);
229 	vcp->vcp_memranges[4].vmr_type = VM_MEM_RESERVED;
230 
231 	/* Sixth region: any remainder above 4GB */
232 	if (above_4g > 0) {
233 		vcp->vcp_memranges[5].vmr_gpa = GB(4);
234 		vcp->vcp_memranges[5].vmr_size = above_4g;
235 		vcp->vcp_memranges[5].vmr_type = VM_MEM_RAM;
236 		vcp->vcp_nmemranges = 6;
237 	} else
238 		vcp->vcp_nmemranges = 5;
239 }
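/*
 * Illustrative summary of the map built above for a guest with more RAM
 * than fits below the PCI MMIO hole (boundaries are the symbolic constants
 * used in the code, not literal addresses):
 *
 *   [0] 0x0                -> LOWMEM_KB*1024     VM_MEM_RAM      (DOS low mem)
 *   [1] LOWMEM_KB*1024     -> 1MB                VM_MEM_RESERVED (VGA/ROM hole)
 *   [2] 1MB                -> PCI_MMIO_BAR_BASE  VM_MEM_RAM
 *   [3] PCI_MMIO_BAR_BASE  -> PCI_MMIO_BAR_END   VM_MEM_MMIO     (PCI BARs)
 *   [4] PCI_MMIO_BAR_END+1 -> 4GB                VM_MEM_RESERVED (2MB BIOS copy)
 *   [5] 4GB                -> 4GB + above_4g     VM_MEM_RAM      (if any remains)
 */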
240 
241 int
242 load_firmware(struct vmd_vm *vm, struct vcpu_reg_state *vrs)
243 {
244 	int		ret;
245 	gzFile		fp;
246 	struct stat	sb;
247 
248 	/*
249 	 * Set up default "flat 64 bit" register state - RIP, RSP, and
250 	 * GDT info will be set in bootloader
251 	 */
252 	memcpy(vrs, &vcpu_init_flat64, sizeof(*vrs));
253 
254 	/* Find and open kernel image */
255 	if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL)
256 		fatalx("failed to open kernel - exiting");
257 
258 	/* Load kernel image */
259 	ret = loadfile_elf(fp, vm, vrs, vm->vm_params.vmc_bootdevice);
260 
261 	/*
262 	 * Try BIOS as a fallback (only if it was provided as an image
263 	 * with vm->vm_kernel and the file is not compressed)
264 	 */
265 	if (ret && errno == ENOEXEC && vm->vm_kernel != -1 &&
266 	    gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0)
267 		ret = loadfile_bios(fp, sb.st_size, vrs);
268 
269 	gzclose(fp);
270 
271 	return (ret);
272 }
273 
274 
275 /*
276  * loadfile_bios
277  *
278  * As an alternative to loadfile_elf, this function loads a non-ELF BIOS image
279  * directly into memory.
280  *
281  * Parameters:
282  *  fp: open gzFile handle to the BIOS image to load
283  *  size: uncompressed size of the image
284  *  (out) vrs: register state to set on init for this kernel
285  *
286  * Return values:
287  *  0 if successful
288  *  -1 on error, with errno set to indicate the failure (e.g. EIO)
289  */
290 int
291 loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs)
292 {
293 	off_t	 off;
294 
295 	/* Set up a "flat 16 bit" register state for BIOS */
296 	memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));
297 
298 	/* Seek to the beginning of the BIOS image */
299 	if (gzseek(fp, 0, SEEK_SET) == -1)
300 		return (-1);
301 
302 	/* The BIOS image must end at 1MB */
303 	if ((off = MB(1) - size) < 0)
304 		return (-1);
305 
306 	/* Read BIOS image into memory */
307 	if (mread(fp, off, size) != (size_t)size) {
308 		errno = EIO;
309 		return (-1);
310 	}
311 
312 	if (gzseek(fp, 0, SEEK_SET) == -1)
313 		return (-1);
314 
315 	/* Read a second BIOS copy into memory ending at 4GB */
316 	off = GB(4) - size;
317 	if (mread(fp, off, size) != (size_t)size) {
318 		errno = EIO;
319 		return (-1);
320 	}
321 
322 	log_debug("%s: loaded BIOS image", __func__);
323 
324 	return (0);
325 }
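/*
 * Illustrative example (hypothetical image size): a 128KB BIOS image is
 * copied twice by loadfile_bios() above - once ending at the 1MB boundary
 * (so it starts at 0xE0000) and once ending at the 4GB boundary (starting
 * at 0xFFFE0000), matching the reserved ranges set up in create_memory_map().
 */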
326 
327 /*
328  * init_emulated_hw
329  *
330  * Initializes the userspace hardware emulation
331  */
332 void
333 init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
334     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
335 {
336 	struct vm_create_params *vcp = &vmc->vmc_params;
337 	size_t i;
338 	uint64_t memlo, memhi;
339 
340 	/* Calculate memory size for NVRAM registers */
341 	memlo = memhi = 0;
342 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
343 		if (vcp->vcp_memranges[i].vmr_gpa == MB(1) &&
344 		    vcp->vcp_memranges[i].vmr_size > (15 * MB(1)))
345 			memlo = vcp->vcp_memranges[i].vmr_size - (15 * MB(1));
346 		else if (vcp->vcp_memranges[i].vmr_gpa == GB(4))
347 			memhi = vcp->vcp_memranges[i].vmr_size;
348 	}
349 
350 	/* Reset the IO port map */
351 	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
352 
353 	/* Init i8253 PIT */
354 	i8253_init(vcp->vcp_id);
355 	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
356 	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
357 	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
358 	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
359 	ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc;
360 
361 	/* Init mc146818 RTC */
362 	mc146818_init(vcp->vcp_id, memlo, memhi);
363 	ioports_map[IO_RTC] = vcpu_exit_mc146818;
364 	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
365 
366 	/* Init master and slave PICs */
367 	i8259_init();
368 	ioports_map[IO_ICU1] = vcpu_exit_i8259;
369 	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
370 	ioports_map[IO_ICU2] = vcpu_exit_i8259;
371 	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
372 	ioports_map[ELCR0] = vcpu_exit_elcr;
373 	ioports_map[ELCR1] = vcpu_exit_elcr;
374 
375 	/* Init ns8250 UART */
376 	ns8250_init(con_fd, vcp->vcp_id);
377 	for (i = COM1_DATA; i <= COM1_SCR; i++)
378 		ioports_map[i] = vcpu_exit_com;
379 
380 	/* Initialize PCI */
381 	for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++)
382 		ioports_map[i] = vcpu_exit_pci;
383 
384 	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
385 	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
386 	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
387 	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
388 	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
389 	pci_init();
390 
391 	/* Initialize virtio devices */
392 	virtio_init(current_vm, child_cdrom, child_disks, child_taps);
393 
394 	/*
395 	 * Init QEMU fw_cfg interface. Must be done last for pci hardware
396 	 * detection.
397 	 */
398 	fw_cfg_init(vmc);
399 	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
400 	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
401 	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
402 	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
403 }
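/*
 * Rough sketch of the NVRAM sizing computed above, assuming the map from
 * create_memory_map(): memlo is the guest RAM above 16MB but below the PCI
 * MMIO hole, memhi the RAM above 4GB; both are handed to mc146818_init()
 * so the emulated RTC/NVRAM can report the guest's memory size.  E.g. for
 * a hypothetical 1GB guest, memlo = 1GB - 16MB and memhi = 0.
 */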
404 
405 /*
406  * restore_emulated_hw
407  *
408  * Restores the userspace hardware emulation from fd
409  */
410 void
411 restore_emulated_hw(struct vm_create_params *vcp, int fd,
412     int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
413 {
414 	/* struct vm_create_params *vcp = &vmc->vmc_params; */
415 	int i;
416 	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
417 
418 	/* Init i8253 PIT */
419 	i8253_restore(fd, vcp->vcp_id);
420 	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
421 	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
422 	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
423 	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
424 
425 	/* Init master and slave PICs */
426 	i8259_restore(fd);
427 	ioports_map[IO_ICU1] = vcpu_exit_i8259;
428 	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
429 	ioports_map[IO_ICU2] = vcpu_exit_i8259;
430 	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
431 
432 	/* Init ns8250 UART */
433 	ns8250_restore(fd, con_fd, vcp->vcp_id);
434 	for (i = COM1_DATA; i <= COM1_SCR; i++)
435 		ioports_map[i] = vcpu_exit_com;
436 
437 	/* Init mc146818 RTC */
438 	mc146818_restore(fd, vcp->vcp_id);
439 	ioports_map[IO_RTC] = vcpu_exit_mc146818;
440 	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
441 
442 	/* Init QEMU fw_cfg interface */
443 	fw_cfg_restore(fd);
444 	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
445 	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
446 	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
447 	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
448 
449 	/* Initialize PCI */
450 	for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++)
451 		ioports_map[i] = vcpu_exit_pci;
452 
453 	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
454 	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
455 	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
456 	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
457 	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
458 	pci_restore(fd);
459 	virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
460 }
461 
462 void
463 pause_vm_md(struct vmd_vm *vm)
464 {
465 	i8253_stop();
466 	mc146818_stop();
467 	ns8250_stop();
468 	virtio_stop(vm);
469 }
470 
471 void
472 unpause_vm_md(struct vmd_vm *vm)
473 {
474 	i8253_start();
475 	mc146818_start();
476 	ns8250_start();
477 	virtio_start(vm);
478 }
479 
480 int
481 dump_devs(int fd)
482 {
483 	int ret = 0;
484 
485 	if ((ret = i8253_dump(fd)))
486 		return ret;
487 	if ((ret = i8259_dump(fd)))
488 		return ret;
489 	if ((ret = ns8250_dump(fd)))
490 		return ret;
491 	if ((ret = mc146818_dump(fd)))
492 		return ret;
493 	ret = fw_cfg_dump(fd);
494 
495 	return ret;
496 }
497 
498 int
499 dump_send_header(int fd) {
500 	struct vm_dump_header	   vmh;
501 	int			   i;
502 
503 	memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE,
504 	    sizeof(vmh.vmh_signature));
505 
506 	vmh.vmh_cpuids[0].code = 0x00;
507 	vmh.vmh_cpuids[0].leaf = 0x00;
508 
509 	vmh.vmh_cpuids[1].code = 0x01;
510 	vmh.vmh_cpuids[1].leaf = 0x00;
511 
512 	vmh.vmh_cpuids[2].code = 0x07;
513 	vmh.vmh_cpuids[2].leaf = 0x00;
514 
515 	vmh.vmh_cpuids[3].code = 0x0d;
516 	vmh.vmh_cpuids[3].leaf = 0x00;
517 
518 	vmh.vmh_cpuids[4].code = 0x80000001;
519 	vmh.vmh_cpuids[4].leaf = 0x00;
520 
521 	vmh.vmh_version = VM_DUMP_VERSION;
522 
523 	for (i=0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
524 		CPUID_LEAF(vmh.vmh_cpuids[i].code,
525 		    vmh.vmh_cpuids[i].leaf,
526 		    vmh.vmh_cpuids[i].a,
527 		    vmh.vmh_cpuids[i].b,
528 		    vmh.vmh_cpuids[i].c,
529 		    vmh.vmh_cpuids[i].d);
530 	}
531 
532 	if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
533 		return (-1);
534 
535 	return (0);
536 }
537 
538 
539 /*
540  * vcpu_exit_inout
541  *
542  * Handle all I/O exits that need to be emulated in vmd. This includes the
543  * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
544  *
545  * Parameters:
546  *  vrp: vcpu run parameters containing guest state for this exit
547  */
548 void
549 vcpu_exit_inout(struct vm_run_params *vrp)
550 {
551 	struct vm_exit *vei = vrp->vrp_exit;
552 	uint8_t intr = 0xFF;
553 
554 	if (vei->vei.vei_rep || vei->vei.vei_string) {
555 #ifdef MMIO_DEBUG
556 		log_info("%s: %s%s%s %d-byte, enc=%d, data=0x%08x, port=0x%04x",
557 		    __func__,
558 		    vei->vei.vei_rep == 0 ? "" : "REP ",
559 		    vei->vei.vei_dir == VEI_DIR_IN ? "IN" : "OUT",
560 		    vei->vei.vei_string == 0 ? "" : "S",
561 		    vei->vei.vei_size, vei->vei.vei_encoding,
562 		    vei->vei.vei_data, vei->vei.vei_port);
563 		log_info("%s: ECX = 0x%llx, RDX = 0x%llx, RSI = 0x%llx",
564 		    __func__,
565 		    vei->vrs.vrs_gprs[VCPU_REGS_RCX],
566 		    vei->vrs.vrs_gprs[VCPU_REGS_RDX],
567 		    vei->vrs.vrs_gprs[VCPU_REGS_RSI]);
568 #endif /* MMIO_DEBUG */
569 		fatalx("%s: can't emulate REP prefixed IN(S)/OUT(S)",
570 		    __func__);
571 	}
572 
573 	if (ioports_map[vei->vei.vei_port] != NULL)
574 		intr = ioports_map[vei->vei.vei_port](vrp);
575 	else if (vei->vei.vei_dir == VEI_DIR_IN)
576 		set_return_data(vei, 0xFFFFFFFF);
577 
578 	vei->vrs.vrs_gprs[VCPU_REGS_RIP] += vei->vei.vei_insn_len;
579 
580 	if (intr != 0xFF)
581 		vcpu_assert_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
582 }
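/*
 * Dispatch example (illustrative): init_emulated_hw() registered
 * vcpu_exit_i8253 for the PIT ports, so a guest OUT to TIMER_CTRL reaches
 * vcpu_exit_inout() above, which calls ioports_map[vei->vei.vei_port](vrp)
 * and, if the handler returns an IRQ number other than 0xFF, asserts that
 * IRQ via vcpu_assert_irq().
 */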
583 
584 /*
585  * vcpu_exit
586  *
587  * Handle a vcpu exit. This function is called when it is determined that
588  * vmm(4) requires the assistance of vmd to support a particular guest
589  * exit type (eg, accessing an I/O port or device). Guest state is contained
590  * in 'vrp', and will be resent to vmm(4) on exit completion.
591  *
592  * Upon conclusion of handling the exit, the function determines if any
593  * interrupts should be injected into the guest, and asserts the proper
594  * IRQ line whose interrupt should be vectored.
595  *
596  * Parameters:
597  *  vrp: vcpu run parameters containing guest state for this exit
598  *
599  * Return values:
600  *  0: the exit was handled successfully
601  *  1: an error occurred (eg, unknown exit reason passed in 'vrp')
602  */
603 int
604 vcpu_exit(struct vm_run_params *vrp)
605 {
606 	int ret;
607 
608 	switch (vrp->vrp_exit_reason) {
609 	case VMX_EXIT_INT_WINDOW:
610 	case SVM_VMEXIT_VINTR:
611 	case VMX_EXIT_CPUID:
612 	case VMX_EXIT_EXTINT:
613 	case SVM_VMEXIT_INTR:
614 	case SVM_VMEXIT_MSR:
615 	case SVM_VMEXIT_CPUID:
616 		/*
617 		 * We may be exiting to vmd to handle a pending interrupt but
618 		 * at the same time the last exit type may have been one of
619 		 * these. In this case, there's nothing extra to be done
620 		 * here (and falling through to the default case below results
621 		 * in more vmd log spam).
622 		 */
623 		break;
624 	case SVM_VMEXIT_NPF:
625 	case VMX_EXIT_EPT_VIOLATION:
626 		ret = vcpu_exit_eptviolation(vrp);
627 		if (ret)
628 			return (ret);
629 		break;
630 	case VMX_EXIT_IO:
631 	case SVM_VMEXIT_IOIO:
632 		vcpu_exit_inout(vrp);
633 		break;
634 	case VMX_EXIT_HLT:
635 	case SVM_VMEXIT_HLT:
636 		vcpu_halt(vrp->vrp_vcpu_id);
637 		break;
638 	case VMX_EXIT_TRIPLE_FAULT:
639 	case SVM_VMEXIT_SHUTDOWN:
640 		/* reset VM */
641 		return (EAGAIN);
642 	default:
643 		log_debug("%s: unknown exit reason 0x%x",
644 		    __progname, vrp->vrp_exit_reason);
645 	}
646 
647 	return (0);
648 }
649 
650 /*
651  * vcpu_exit_eptviolation
652  *
653  * handle an EPT Violation
654  *
655  * Parameters:
656  *  vrp: vcpu run parameters containing guest state for this exit
657  *
658  * Return values:
659  *  0: no action required
660  *  EFAULT: a protection fault occurred, kill the vm.
661  */
662 static int
663 vcpu_exit_eptviolation(struct vm_run_params *vrp)
664 {
665 	struct vm_exit *ve = vrp->vrp_exit;
666 	int ret = 0;
667 #if MMIO_NOTYET
668 	struct x86_insn insn;
669 	uint64_t va, pa;
670 	size_t len = 15;		/* Max instruction length in x86. */
671 #endif /* MMIO_NOTYET */
672 	switch (ve->vee.vee_fault_type) {
673 	case VEE_FAULT_HANDLED:
674 		break;
675 
676 #if MMIO_NOTYET
677 	case VEE_FAULT_MMIO_ASSIST:
678 		/* Intel VMX might give us the length of the instruction. */
679 		if (ve->vee.vee_insn_info & VEE_LEN_VALID)
680 			len = ve->vee.vee_insn_len;
681 
682 		if (len > 15)
683 			fatalx("%s: invalid instruction length %lu", __func__,
684 			    len);
685 
686 		/* If we weren't given instruction bytes, we need to fetch. */
687 		if (!(ve->vee.vee_insn_info & VEE_BYTES_VALID)) {
688 			memset(ve->vee.vee_insn_bytes, 0,
689 			    sizeof(ve->vee.vee_insn_bytes));
690 			va = ve->vrs.vrs_gprs[VCPU_REGS_RIP];
691 
692 			/* XXX Only support instructions that fit on 1 page. */
693 			if ((va & PAGE_MASK) + len > PAGE_SIZE) {
694 				log_warnx("%s: instruction might cross page "
695 				    "boundary", __func__);
696 				ret = EINVAL;
697 				break;
698 			}
699 
700 			ret = translate_gva(ve, va, &pa, PROT_EXEC);
701 			if (ret != 0) {
702 				log_warnx("%s: failed gva translation",
703 				    __func__);
704 				break;
705 			}
706 
707 			ret = read_mem(pa, ve->vee.vee_insn_bytes, len);
708 			if (ret != 0) {
709 				log_warnx("%s: failed to fetch instruction "
710 				    "bytes from 0x%llx", __func__, pa);
711 				break;
712 			}
713 		}
714 
715 		ret = insn_decode(ve, &insn);
716 		if (ret == 0)
717 			ret = insn_emulate(ve, &insn);
718 		break;
719 #endif /* MMIO_NOTYET */
720 
721 	case VEE_FAULT_PROTECT:
722 		log_debug("%s: EPT Violation: rip=0x%llx", __progname,
723 		    ve->vrs.vrs_gprs[VCPU_REGS_RIP]);
724 		ret = EFAULT;
725 		break;
726 
727 	default:
728 		fatalx("%s: invalid fault_type %d", __progname,
729 		    ve->vee.vee_fault_type);
730 		/* UNREACHED */
731 	}
732 
733 	return (ret);
734 }
735 
736 /*
737  * vcpu_exit_pci
738  *
739  * Handle all I/O to the emulated PCI subsystem.
740  *
741  * Parameters:
742  *  vrp: vcpu run parameters containing guest state for this exit
743  *
744  * Return value:
745  *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
746  *      be injected.
747  */
748 uint8_t
749 vcpu_exit_pci(struct vm_run_params *vrp)
750 {
751 	struct vm_exit *vei = vrp->vrp_exit;
752 	uint8_t intr;
753 
754 	intr = 0xFF;
755 
756 	switch (vei->vei.vei_port) {
757 	case PCI_MODE1_ADDRESS_REG:
758 		pci_handle_address_reg(vrp);
759 		break;
760 	case PCI_MODE1_DATA_REG:
761 	case PCI_MODE1_DATA_REG + 1:
762 	case PCI_MODE1_DATA_REG + 2:
763 	case PCI_MODE1_DATA_REG + 3:
764 		pci_handle_data_reg(vrp);
765 		break;
766 	case VM_PCI_IO_BAR_BASE ... VM_PCI_IO_BAR_END:
767 		intr = pci_handle_io(vrp);
768 		break;
769 	default:
770 		log_warnx("%s: unknown PCI register 0x%llx",
771 		    __progname, (uint64_t)vei->vei.vei_port);
772 		break;
773 	}
774 
775 	return (intr);
776 }
777 
778 /*
779  * find_gpa_range
780  *
781  * Search for a contiguous guest physical mem range.
782  *
783  * Parameters:
784  *  vcp: VM create parameters that contain the memory map to search in
785  *  gpa: the starting guest physical address
786  *  len: the length of the memory range
787  *
788  * Return values:
789  *  NULL: on failure if there is no memory range as described by the parameters
790  *  Pointer to vm_mem_range that contains the start of the range otherwise.
791  */
792 struct vm_mem_range *
793 find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
794 {
795 	size_t i, n;
796 	struct vm_mem_range *vmr;
797 
798 	/* Find the first vm_mem_range that contains gpa */
799 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
800 		vmr = &vcp->vcp_memranges[i];
801 		if (gpa < vmr->vmr_gpa + vmr->vmr_size)
802 			break;
803 	}
804 
805 	/* No range found. */
806 	if (i == vcp->vcp_nmemranges)
807 		return (NULL);
808 
809 	/*
810 	 * vmr may cover the range [gpa, gpa + len) only partly. Make
811 	 * sure that the following vm_mem_ranges are contiguous and
812 	 * cover the rest.
813 	 */
814 	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
815 	if (len < n)
816 		len = 0;
817 	else
818 		len -= n;
819 	gpa = vmr->vmr_gpa + vmr->vmr_size;
820 	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
821 		vmr = &vcp->vcp_memranges[i];
822 		if (gpa != vmr->vmr_gpa)
823 			return (NULL);
824 		if (len <= vmr->vmr_size)
825 			len = 0;
826 		else
827 			len -= vmr->vmr_size;
828 
829 		gpa = vmr->vmr_gpa + vmr->vmr_size;
830 	}
831 
832 	if (len != 0)
833 		return (NULL);
834 
835 	return (vmr);
836 }
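/*
 * Illustrative example (hypothetical ranges): with memranges [0, 640KB)
 * and [640KB, 1MB), a lookup of gpa = 639KB, len = 2KB starts in the first
 * range and succeeds because the second range begins exactly where the
 * first ends; the returned pointer is the first vm_mem_range.  A lookup
 * crossing a gap between ranges returns NULL.
 */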
837 /*
838  * write_mem
839  *
840  * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
841  *
842  * Parameters:
843  *  dst: the destination paddr_t in the guest VM
844  *  buf: data to copy (or NULL to zero the data)
845  *  len: number of bytes to copy
846  *
847  * Return values:
848  *  0: success
849  *  EINVAL: if the guest physical memory range [dst, dst + len) does not
850  *      exist in the guest.
851  */
852 int
853 write_mem(paddr_t dst, const void *buf, size_t len)
854 {
855 	const char *from = buf;
856 	char *to;
857 	size_t n, off;
858 	struct vm_mem_range *vmr;
859 
860 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
861 	if (vmr == NULL) {
862 		errno = EINVAL;
863 		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
864 		    "len = 0x%zx", __func__, dst, len);
865 		return (EINVAL);
866 	}
867 
868 	off = dst - vmr->vmr_gpa;
869 	while (len != 0) {
870 		n = vmr->vmr_size - off;
871 		if (len < n)
872 			n = len;
873 
874 		to = (char *)vmr->vmr_va + off;
875 		if (buf == NULL)
876 			memset(to, 0, n);
877 		else {
878 			memcpy(to, from, n);
879 			from += n;
880 		}
881 		len -= n;
882 		off = 0;
883 		vmr++;
884 	}
885 
886 	return (0);
887 }
888 
889 /*
890  * read_mem
891  *
892  * Reads memory at guest paddr 'src' into 'buf'.
893  *
894  * Parameters:
895  *  src: the source paddr_t in the guest VM to read from.
896  *  buf: destination (local) buffer
897  *  len: number of bytes to read
898  *
899  * Return values:
900  *  0: success
901  *  EINVAL: if the guest physical memory range [src, src + len) does not
902  *      exist in the guest.
903  */
904 int
905 read_mem(paddr_t src, void *buf, size_t len)
906 {
907 	char *from, *to = buf;
908 	size_t n, off;
909 	struct vm_mem_range *vmr;
910 
911 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
912 	if (vmr == NULL) {
913 		errno = EINVAL;
914 		log_warn("%s: failed - invalid memory range src = 0x%lx, "
915 		    "len = 0x%zx", __func__, src, len);
916 		return (EINVAL);
917 	}
918 
919 	off = src - vmr->vmr_gpa;
920 	while (len != 0) {
921 		n = vmr->vmr_size - off;
922 		if (len < n)
923 			n = len;
924 
925 		from = (char *)vmr->vmr_va + off;
926 		memcpy(to, from, n);
927 
928 		to += n;
929 		len -= n;
930 		off = 0;
931 		vmr++;
932 	}
933 
934 	return (0);
935 }
936 
937 /*
938  * hvaddr_mem
939  *
940  * Translate a guest physical address to a host virtual address, checking the
941  * provided memory range length to confirm it's contiguous within the same
942  * guest memory range (vm_mem_range).
943  *
944  * Parameters:
945  *  gpa: guest physical address to translate
946  *  len: number of bytes in the intended range
947  *
948  * Return values:
949  *  void* to host virtual memory on success
950  *  NULL on error, setting errno to:
951  *    EFAULT: gpa falls outside guest memory ranges
952  *    EINVAL: requested len extends beyond memory range
953  */
954 void *
955 hvaddr_mem(paddr_t gpa, size_t len)
956 {
957 	struct vm_mem_range *vmr;
958 	size_t off;
959 
960 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, gpa, len);
961 	if (vmr == NULL) {
962 		log_warnx("%s: failed - invalid gpa: 0x%lx\n", __func__, gpa);
963 		errno = EFAULT;
964 		return (NULL);
965 	}
966 
967 	off = gpa - vmr->vmr_gpa;
968 	if (len > (vmr->vmr_size - off)) {
969 		log_warnx("%s: failed - invalid memory range: gpa=0x%lx, "
970 		    "len=%zu", __func__, gpa, len);
971 		errno = EINVAL;
972 		return (NULL);
973 	}
974 
975 	return ((char *)vmr->vmr_va + off);
976 }
977 
978 /*
979  * vcpu_assert_irq
980  *
981  * Injects the specified IRQ on the supplied vcpu/vm
982  *
983  * Parameters:
984  *  vm_id: VM ID to inject to
985  *  vcpu_id: VCPU ID to inject to
986  *  irq: IRQ to inject
987  */
988 void
989 vcpu_assert_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
990 {
991 	i8259_assert_irq(irq);
992 
993 	if (i8259_is_pending()) {
994 		if (vcpu_intr(vm_id, vcpu_id, 1))
995 			fatalx("%s: can't assert INTR", __func__);
996 
997 		vcpu_unhalt(vcpu_id);
998 		vcpu_signal_run(vcpu_id);
999 	}
1000 }
1001 
1002 /*
1003  * vcpu_deassert_irq
1004  *
1005  * Clears the specified IRQ on the supplied vcpu/vm
1006  *
1007  * Parameters:
1008  *  vm_id: VM ID to clear in
1009  *  vcpu_id: VCPU ID to clear in
1010  *  irq: IRQ to clear
1011  */
1012 void
1013 vcpu_deassert_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
1014 {
1015 	i8259_deassert_irq(irq);
1016 
1017 	if (!i8259_is_pending()) {
1018 		if (vcpu_intr(vm_id, vcpu_id, 0))
1019 			fatalx("%s: can't deassert INTR for vm_id %d, "
1020 			    "vcpu_id %d", __func__, vm_id, vcpu_id);
1021 	}
1022 }
1023 /*
1024  * set_return_data
1025  *
1026  * Utility function for manipulating register data in vm exit info structs. This
1027  * function ensures that the data is copied to the vei->vei.vei_data field with
1028  * the proper size for the operation being performed.
1029  *
1030  * Parameters:
1031  *  vei: exit information
1032  *  data: return data
1033  */
1034 void
1035 set_return_data(struct vm_exit *vei, uint32_t data)
1036 {
1037 	switch (vei->vei.vei_size) {
1038 	case 1:
1039 		vei->vei.vei_data &= ~0xFF;
1040 		vei->vei.vei_data |= (uint8_t)data;
1041 		break;
1042 	case 2:
1043 		vei->vei.vei_data &= ~0xFFFF;
1044 		vei->vei.vei_data |= (uint16_t)data;
1045 		break;
1046 	case 4:
1047 		vei->vei.vei_data = data;
1048 		break;
1049 	}
1050 }
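/*
 * Illustrative example: for a 1-byte IN (vei_size == 1),
 * set_return_data(vei, 0xFFFFFFFF) above replaces only the low byte of
 * vei_data and leaves the upper bytes untouched; a 4-byte IN replaces the
 * whole field.  This mirrors how partial-width port reads land in
 * %al/%ax/%eax.
 */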
1051 
1052 /*
1053  * get_input_data
1054  *
1055  * Utility function for manipulating register data in vm exit info
1056  * structs. This function ensures that the data is copied from the
1057  * vei->vei.vei_data field with the proper size for the operation being
1058  * performed.
1059  *
1060  * Parameters:
1061  *  vei: exit information
1062  *  data: location to store the result
1063  */
1064 void
1065 get_input_data(struct vm_exit *vei, uint32_t *data)
1066 {
1067 	switch (vei->vei.vei_size) {
1068 	case 1:
1069 		*data &= 0xFFFFFF00;
1070 		*data |= (uint8_t)vei->vei.vei_data;
1071 		break;
1072 	case 2:
1073 		*data &= 0xFFFF0000;
1074 		*data |= (uint16_t)vei->vei.vei_data;
1075 		break;
1076 	case 4:
1077 		*data = vei->vei.vei_data;
1078 		break;
1079 	default:
1080 		log_warnx("%s: invalid i/o size %d", __func__,
1081 		    vei->vei.vei_size);
1082 	}
1083 
1084 }
1085 
1086 /*
1087  * translate_gva
1088  *
1089  * Translates a guest virtual address to a guest physical address by walking
1090  * the currently active page table (if needed).
1091  *
1092  * XXX ensure translate_gva updates the A bit in the PTE
1093  * XXX ensure translate_gva respects segment base and limits in i386 mode
1094  * XXX ensure translate_gva respects segment wraparound in i8086 mode
1095  * XXX ensure translate_gva updates the A bit in the segment selector
1096  * XXX ensure translate_gva respects CR4.LMSLE if available
1097  *
1098  * Parameters:
1099  *  exit: The VCPU this translation should be performed for (guest MMU settings
1100  *   are gathered from this VCPU)
1101  *  va: virtual address to translate
1102  *  pa: pointer to paddr_t variable that will receive the translated physical
1103  *   address. 'pa' is unchanged on error.
1104  *  mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
1105  *   the address should be translated
1106  *
1107  * Return values:
1108  *  0: the address was successfully translated - 'pa' contains the physical
1109  *     address currently mapped by 'va'.
1110  *  EFAULT: the PTE for 'VA' is unmapped. A #PF will be injected in this case
1111  *     and %cr2 set in the vcpu structure.
1112  *  EINVAL: an error occurred reading paging table structures
1113  */
1114 int
1115 translate_gva(struct vm_exit* exit, uint64_t va, uint64_t* pa, int mode)
1116 {
1117 	int level, shift, pdidx;
1118 	uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
1119 	uint64_t shift_width, pte_size;
1120 	struct vcpu_reg_state *vrs;
1121 
1122 	vrs = &exit->vrs;
1123 
1124 	if (!pa)
1125 		return (EINVAL);
1126 
1127 	if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
1128 		log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
1129 		*pa = va;
1130 		return (0);
1131 	}
1132 
1133 	pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];
1134 
1135 	log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
1136 	    vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);
1137 
1138 	if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
1139 		if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
1140 			pte_size = sizeof(uint64_t);
1141 			shift_width = 9;
1142 
1143 			if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
1144 				/* 4 level paging */
1145 				level = 4;
1146 				mask = L4_MASK;
1147 				shift = L4_SHIFT;
1148 			} else {
1149 				/* 32 bit with PAE paging */
1150 				level = 3;
1151 				mask = L3_MASK;
1152 				shift = L3_SHIFT;
1153 			}
1154 		} else {
1155 			/* 32 bit paging */
1156 			level = 2;
1157 			shift_width = 10;
1158 			mask = 0xFFC00000;
1159 			shift = 22;
1160 			pte_size = sizeof(uint32_t);
1161 		}
1162 	} else
1163 		return (EINVAL);
1164 
1165 	/* XXX: Check for R bit in segment selector and set A bit */
1166 
1167 	for (;level > 0; level--) {
1168 		pdidx = (va & mask) >> shift;
1169 		pte_paddr = (pt_paddr) + (pdidx * pte_size);
1170 
1171 		log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
1172 		    level, pte_paddr);
1173 		if (read_mem(pte_paddr, &pte, pte_size)) {
1174 			log_warn("%s: failed to read pte", __func__);
1175 			return (EFAULT);
1176 		}
1177 
1178 		log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
1179 		    pte);
1180 
1181 		/* XXX: Set CR2  */
1182 		if (!(pte & PG_V))
1183 			return (EFAULT);
1184 
1185 		/* XXX: Check for SMAP */
1186 		if ((mode == PROT_WRITE) && !(pte & PG_RW))
1187 			return (EPERM);
1188 
1189 		if ((exit->cpl > 0) && !(pte & PG_u))
1190 			return (EPERM);
1191 
1192 		pte = pte | PG_U;
1193 		if (mode == PROT_WRITE)
1194 			pte = pte | PG_M;
1195 		if (write_mem(pte_paddr, &pte, pte_size)) {
1196 			log_warn("%s: failed to write back flags to pte",
1197 			    __func__);
1198 			return (EIO);
1199 		}
1200 
1201 		/* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */
1202 		if (pte & PG_PS)
1203 			break;
1204 
1205 		if (level > 1) {
1206 			pt_paddr = pte & PG_FRAME;
1207 			shift -= shift_width;
1208 			mask = mask >> shift_width;
1209 		}
1210 	}
1211 
1212 	low_mask = (1 << shift) - 1;
1213 	high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
1214 	*pa = (pte & high_mask) | (va & low_mask);
1215 
1216 	log_debug("%s: final GPA for GVA 0x%llx = 0x%llx\n", __func__, va, *pa);
1217 
1218 	return (0);
1219 }
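/*
 * Illustrative walk (long mode, assuming the amd64 pte.h constants): the
 * loop above starts with level = 4, shift = L4_SHIFT and mask = L4_MASK,
 * so the first index is (va & L4_MASK) >> L4_SHIFT; each iteration then
 * moves one level down by subtracting shift_width (9) from the shift and
 * narrowing the mask, until an unmapped PTE (EFAULT), a large page
 * (PG_PS), or the final level is reached.
 */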
1220 
1221 int
1222 intr_pending(struct vmd_vm *vm)
1223 {
1224 	/* XXX select active interrupt controller */
1225 	return i8259_is_pending();
1226 }
1227 
1228 int
1229 intr_ack(struct vmd_vm *vm)
1230 {
1231 	/* XXX select active interrupt controller */
1232 	return i8259_ack();
1233 }
1234 
1235 void
1236 intr_toggle_el(struct vmd_vm *vm, int irq, int val)
1237 {
1238 	/* XXX select active interrupt controller */
1239 	pic_set_elcr(irq, val);
1240 }
1241 
1242 int
1243 vmd_check_vmh(struct vm_dump_header *vmh)
1244 {
1245 	int i;
1246 	unsigned int code, leaf;
1247 	unsigned int a, b, c, d;
1248 
1249 	if (strncmp(vmh->vmh_signature, VM_DUMP_SIGNATURE, strlen(VM_DUMP_SIGNATURE)) != 0) {
1250 		log_warnx("%s: incompatible dump signature", __func__);
1251 		return (-1);
1252 	}
1253 
1254 	if (vmh->vmh_version != VM_DUMP_VERSION) {
1255 		log_warnx("%s: incompatible dump version", __func__);
1256 		return (-1);
1257 	}
1258 
1259 	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
1260 		code = vmh->vmh_cpuids[i].code;
1261 		leaf = vmh->vmh_cpuids[i].leaf;
1262 		if (leaf != 0x00) {
1263 			log_debug("%s: invalid leaf 0x%x for code 0x%x",
1264 			    __func__, leaf, code);
1265 			return (-1);
1266 		}
1267 
1268 		switch (code) {
1269 		case 0x00:
1270 			CPUID_LEAF(code, leaf, a, b, c, d);
1271 			if (vmh->vmh_cpuids[i].a > a) {
1272 				log_debug("%s: incompatible cpuid level",
1273 				    __func__);
1274 				return (-1);
1275 			}
1276 			if (!(vmh->vmh_cpuids[i].b == b &&
1277 			    vmh->vmh_cpuids[i].c == c &&
1278 			    vmh->vmh_cpuids[i].d == d)) {
1279 				log_debug("%s: incompatible cpu brand",
1280 				    __func__);
1281 				return (-1);
1282 			}
1283 			break;
1284 
1285 		case 0x01:
1286 			CPUID_LEAF(code, leaf, a, b, c, d);
1287 			if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) !=
1288 			    (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) {
1289 				log_debug("%s: incompatible cpu features "
1290 				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
1291 				    code, leaf);
1292 				return (-1);
1293 			}
1294 			if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) !=
1295 			    (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) {
1296 				log_debug("%s: incompatible cpu features "
1297 				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
1298 				    code, leaf);
1299 				return (-1);
1300 			}
1301 			break;
1302 
1303 		case 0x07:
1304 			CPUID_LEAF(code, leaf, a, b, c, d);
1305 			if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) !=
1306 			    (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) {
1307 				log_debug("%s: incompatible cpu features "
1308 				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
1309 				    code, leaf);
1310 				return (-1);
1311 			}
1312 			if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) !=
1313 			    (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) {
1314 				log_debug("%s: incompatible cpu features "
1315 				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
1316 				    code, leaf);
1317 				return (-1);
1318 			}
1319 			break;
1320 
1321 		case 0x0d:
1322 			CPUID_LEAF(code, leaf, a, b, c, d);
1323 			if (vmh->vmh_cpuids[i].b > b) {
1324 				log_debug("%s: incompatible cpu: insufficient "
1325 				    "max save area for enabled XCR0 features",
1326 				    __func__);
1327 				return (-1);
1328 			}
1329 			if (vmh->vmh_cpuids[i].c > c) {
1330 				log_debug("%s: incompatible cpu: insufficient "
1331 				    "max save area for supported XCR0 features",
1332 				    __func__);
1333 				return (-1);
1334 			}
1335 			break;
1336 
1337 		case 0x80000001:
1338 			CPUID_LEAF(code, leaf, a, b, c, d);
1339 			if ((vmh->vmh_cpuids[i].a & a) !=
1340 			    vmh->vmh_cpuids[i].a) {
1341 				log_debug("%s: incompatible cpu features "
1342 				    "code: 0x%x leaf: 0x%x  reg: a", __func__,
1343 				    code, leaf);
1344 				return (-1);
1345 			}
1346 			if ((vmh->vmh_cpuids[i].c & c) !=
1347 			    vmh->vmh_cpuids[i].c) {
1348 				log_debug("%s: incompatible cpu features "
1349 				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
1350 				    code, leaf);
1351 				return (-1);
1352 			}
1353 			if ((vmh->vmh_cpuids[i].d & d) !=
1354 			    vmh->vmh_cpuids[i].d) {
1355 				log_debug("%s: incompatible cpu features "
1356 				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
1357 				    code, leaf);
1358 				return (-1);
1359 			}
1360 			break;
1361 
1362 		default:
1363 			log_debug("%s: unknown code 0x%x", __func__, code);
1364 			return (-1);
1365 		}
1366 	}
1367 
1368 	return (0);
1369 }
1370