1 /*	$OpenBSD: x86_vm.c,v 1.2 2024/07/12 13:51:12 dv Exp $	*/
2 /*
3  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 #include <sys/stat.h>
19 #include <sys/types.h>
20 
21 #include <dev/ic/i8253reg.h>
22 #include <dev/isa/isareg.h>
23 
24 #include <machine/psl.h>
25 #include <machine/pte.h>
26 #include <machine/specialreg.h>
27 #include <machine/vmmvar.h>
28 
29 #include <errno.h>
30 #include <string.h>
31 #include <unistd.h>
32 
33 #include <zlib.h>
34 
35 #include "atomicio.h"
36 #include "fw_cfg.h"
37 #include "i8253.h"
38 #include "i8259.h"
39 #include "loadfile.h"
40 #include "mc146818.h"
41 #include "ns8250.h"
42 #include "pci.h"
43 #include "virtio.h"
44 
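/*
 * An I/O port emulation handler returns the IRQ to assert once the exit has
 * been handled, or 0xFF if no interrupt should be injected.
 */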
45 typedef uint8_t (*io_fn_t)(struct vm_run_params *);
46 
47 #define MAX_PORTS 65536
48 
49 io_fn_t	ioports_map[MAX_PORTS];
50 extern char *__progname;
51 
52 void	 create_memory_map(struct vm_create_params *);
53 int	 translate_gva(struct vm_exit*, uint64_t, uint64_t *, int);
54 
55 static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
56     size_t);
57 static int	loadfile_bios(gzFile, off_t, struct vcpu_reg_state *);
58 static int	vcpu_exit_eptviolation(struct vm_run_params *);
59 static void	vcpu_exit_inout(struct vm_run_params *);
60 
61 extern struct vmd_vm	*current_vm;
62 extern int		 con_fd;
63 
64 /*
65  * Represents a standard register set for an OS to be booted
66  * as a flat 64 bit address space.
67  *
68  * NOT set here are:
69  *  RIP
70  *  RSP
71  *  GDTR BASE
72  *
73  * Specific bootloaders should clone this structure and override
74  * those fields as needed.
75  *
76  * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
77  *        features of the CPU in use.
78  */
79 static const struct vcpu_reg_state vcpu_init_flat64 = {
80 	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
81 	.vrs_gprs[VCPU_REGS_RIP] = 0x0,
82 	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
83 	.vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG,
84 	.vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
85 	.vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
86 	.vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
87 	.vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
88 	.vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
89 	.vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
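	/* Segment fields below are: selector, limit, access rights, base. */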
90 	.vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
91 	.vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
92 	.vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
93 	.vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
94 	.vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
95 	.vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
96 	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
97 	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
98 	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
99 	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
100 	.vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
101 	.vrs_drs[VCPU_REGS_DR0] = 0x0,
102 	.vrs_drs[VCPU_REGS_DR1] = 0x0,
103 	.vrs_drs[VCPU_REGS_DR2] = 0x0,
104 	.vrs_drs[VCPU_REGS_DR3] = 0x0,
105 	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
106 	.vrs_drs[VCPU_REGS_DR7] = 0x400,
107 	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
108 	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
109 	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
110 	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
111 	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
112 	.vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
113 	.vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87
114 };
115 
116 /*
117  * Represents a standard register set for a BIOS to be booted
118  * as a flat 16 bit address space.
119  */
120 static const struct vcpu_reg_state vcpu_init_flat16 = {
121 	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
122 	.vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
123 	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
124 	.vrs_crs[VCPU_REGS_CR0] = 0x60000010,
125 	.vrs_crs[VCPU_REGS_CR3] = 0,
126 	.vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
127 	.vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
128 	.vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
129 	.vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
130 	.vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
131 	.vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
132 	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
133 	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
134 	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
135 	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
136 	.vrs_msrs[VCPU_REGS_EFER] = 0ULL,
137 	.vrs_drs[VCPU_REGS_DR0] = 0x0,
138 	.vrs_drs[VCPU_REGS_DR1] = 0x0,
139 	.vrs_drs[VCPU_REGS_DR2] = 0x0,
140 	.vrs_drs[VCPU_REGS_DR3] = 0x0,
141 	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
142 	.vrs_drs[VCPU_REGS_DR7] = 0x400,
143 	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
144 	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
145 	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
146 	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
147 	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
148 	.vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87
149 };
150 
151 /*
152  * create_memory_map
153  *
154  * Sets up the guest physical memory ranges that the VM can access.
155  *
156  * Parameters:
157  *  vcp: VM create parameters describing the VM whose memory map
158  *       is being created
159  *
160  * Return values:
161  *  nothing
162  */
163 void
164 create_memory_map(struct vm_create_params *vcp)
165 {
166 	size_t len, mem_bytes;
167 	size_t above_1m = 0, above_4g = 0;
168 
169 	mem_bytes = vcp->vcp_memranges[0].vmr_size;
170 	vcp->vcp_nmemranges = 0;
171 	if (mem_bytes == 0 || mem_bytes > VMM_MAX_VM_MEM_SIZE)
172 		return;
173 
174 	/* First memory region: 0 - LOWMEM_KB (DOS low mem) */
175 	len = LOWMEM_KB * 1024;
176 	vcp->vcp_memranges[0].vmr_gpa = 0x0;
177 	vcp->vcp_memranges[0].vmr_size = len;
178 	vcp->vcp_memranges[0].vmr_type = VM_MEM_RAM;
179 	mem_bytes -= len;
180 
181 	/*
182 	 * Second memory region: LOWMEM_KB - 1MB.
183 	 *
184 	 * N.B. - Normally ROMs or parts of video RAM are mapped here.
185 	 * We have to add this region, because some systems
186 	 * unconditionally write to 0xb8000 (VGA RAM), and
187 	 * we need to make sure that vmm(4) permits accesses
188 	 * to it. So allocate guest memory for it.
189 	 */
190 	len = MB(1) - (LOWMEM_KB * 1024);
191 	vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
192 	vcp->vcp_memranges[1].vmr_size = len;
193 	vcp->vcp_memranges[1].vmr_type = VM_MEM_RESERVED;
194 	mem_bytes -= len;
195 
196 	/* If we have less than 2MB remaining, still create a 2nd BIOS area. */
197 	if (mem_bytes <= MB(2)) {
198 		vcp->vcp_memranges[2].vmr_gpa = VMM_PCI_MMIO_BAR_END;
199 		vcp->vcp_memranges[2].vmr_size = MB(2);
200 		vcp->vcp_memranges[2].vmr_type = VM_MEM_RESERVED;
201 		vcp->vcp_nmemranges = 3;
202 		return;
203 	}
204 
205 	/*
206 	 * Calculate how to split any remaining memory across the 4GB
207 	 * boundary while making sure we do not place physical memory into
208 	 * MMIO ranges.
209 	 */
210 	if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - MB(1)) {
211 		above_1m = VMM_PCI_MMIO_BAR_BASE - MB(1);
212 		above_4g = mem_bytes - above_1m;
213 	} else {
214 		above_1m = mem_bytes;
215 		above_4g = 0;
216 	}
217 
218 	/* Third memory region: area above 1MB to MMIO region */
219 	vcp->vcp_memranges[2].vmr_gpa = MB(1);
220 	vcp->vcp_memranges[2].vmr_size = above_1m;
221 	vcp->vcp_memranges[2].vmr_type = VM_MEM_RAM;
222 
223 	/* Fourth region: PCI MMIO range */
224 	vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_BASE;
225 	vcp->vcp_memranges[3].vmr_size = VMM_PCI_MMIO_BAR_END -
226 	    VMM_PCI_MMIO_BAR_BASE + 1;
227 	vcp->vcp_memranges[3].vmr_type = VM_MEM_MMIO;
228 
229 	/* Fifth region: 2nd copy of BIOS above MMIO ending at 4GB */
230 	vcp->vcp_memranges[4].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
231 	vcp->vcp_memranges[4].vmr_size = MB(2);
232 	vcp->vcp_memranges[4].vmr_type = VM_MEM_RESERVED;
233 
234 	/* Sixth region: any remainder above 4GB */
235 	if (above_4g > 0) {
236 		vcp->vcp_memranges[5].vmr_gpa = GB(4);
237 		vcp->vcp_memranges[5].vmr_size = above_4g;
238 		vcp->vcp_memranges[5].vmr_type = VM_MEM_RAM;
239 		vcp->vcp_nmemranges = 6;
240 	} else
241 		vcp->vcp_nmemranges = 5;
242 }
243 
244 int
245 load_firmware(struct vmd_vm *vm, struct vcpu_reg_state *vrs)
246 {
247 	int		ret;
248 	gzFile		fp;
249 	struct stat	sb;
250 
251 	/*
252 	 * Set up default "flat 64 bit" register state - RIP, RSP, and
253 	 * GDT info will be set in bootloader
254 	 */
255 	memcpy(vrs, &vcpu_init_flat64, sizeof(*vrs));
256 
257 	/* Find and open kernel image */
258 	if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL)
259 		fatalx("failed to open kernel - exiting");
260 
261 	/* Load kernel image */
262 	ret = loadfile_elf(fp, vm, vrs, vm->vm_params.vmc_bootdevice);
263 
264 	/*
265 	 * Try BIOS as a fallback (only if it was provided as an image
266 	 * with vm->vm_kernel and the file is not compressed)
267 	 */
268 	if (ret && errno == ENOEXEC && vm->vm_kernel != -1 &&
269 	    gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0)
270 		ret = loadfile_bios(fp, sb.st_size, vrs);
271 
272 	gzclose(fp);
273 
274 	return (ret);
275 }
276 
277 
278 /*
279  * loadfile_bios
280  *
281  * As an alternative to loadfile_elf, this function loads a non-ELF BIOS image
282  * directly into memory.
283  *
284  * Parameters:
285  *  fp: gzFile handle of the BIOS image to load
286  *  size: uncompressed size of the image
287  *  (out) vrs: register state to set on init for this kernel
288  *
289  * Return values:
290  *  0 if successful
291  *  -1 if the image could not be loaded
292  */
293 int
294 loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs)
295 {
296 	off_t	 off;
297 
298 	/* Set up a "flat 16 bit" register state for BIOS */
299 	memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));
300 
301 	/* Seek to the beginning of the BIOS image */
302 	if (gzseek(fp, 0, SEEK_SET) == -1)
303 		return (-1);
304 
305 	/* The BIOS image must end at 1MB */
306 	if ((off = MB(1) - size) < 0)
307 		return (-1);
308 
309 	/* Read BIOS image into memory */
310 	if (mread(fp, off, size) != (size_t)size) {
311 		errno = EIO;
312 		return (-1);
313 	}
314 
315 	if (gzseek(fp, 0, SEEK_SET) == -1)
316 		return (-1);
317 
318 	/* Read a second BIOS copy into memory ending at 4GB */
319 	off = GB(4) - size;
320 	if (mread(fp, off, size) != (size_t)size) {
321 		errno = EIO;
322 		return (-1);
323 	}
324 
325 	log_debug("%s: loaded BIOS image", __func__);
326 
327 	return (0);
328 }
329 
330 /*
331  * init_emulated_hw
332  *
333  * Initializes the userspace hardware emulation
334  */
335 void
336 init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
337     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
338 {
339 	struct vm_create_params *vcp = &vmc->vmc_params;
340 	size_t i;
341 	uint64_t memlo, memhi;
342 
343 	/* Calculate memory size for NVRAM registers */
344 	memlo = memhi = 0;
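	/*
	 * memlo is RAM beyond the first 16MB (the range starting at 1MB less
	 * 15MB) and memhi is RAM above 4GB.
	 */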
345 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
346 		if (vcp->vcp_memranges[i].vmr_gpa == MB(1) &&
347 		    vcp->vcp_memranges[i].vmr_size > (15 * MB(1)))
348 			memlo = vcp->vcp_memranges[i].vmr_size - (15 * MB(1));
349 		else if (vcp->vcp_memranges[i].vmr_gpa == GB(4))
350 			memhi = vcp->vcp_memranges[i].vmr_size;
351 	}
352 
353 	/* Reset the IO port map */
354 	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
355 
356 	/* Init i8253 PIT */
357 	i8253_init(vcp->vcp_id);
358 	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
359 	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
360 	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
361 	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
362 	ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc;
363 
364 	/* Init mc146818 RTC */
365 	mc146818_init(vcp->vcp_id, memlo, memhi);
366 	ioports_map[IO_RTC] = vcpu_exit_mc146818;
367 	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
368 
369 	/* Init master and slave PICs */
370 	i8259_init();
371 	ioports_map[IO_ICU1] = vcpu_exit_i8259;
372 	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
373 	ioports_map[IO_ICU2] = vcpu_exit_i8259;
374 	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
375 	ioports_map[ELCR0] = vcpu_exit_elcr;
376 	ioports_map[ELCR1] = vcpu_exit_elcr;
377 
378 	/* Init ns8250 UART */
379 	ns8250_init(con_fd, vcp->vcp_id);
380 	for (i = COM1_DATA; i <= COM1_SCR; i++)
381 		ioports_map[i] = vcpu_exit_com;
382 
383 	/* Initialize PCI */
384 	for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++)
385 		ioports_map[i] = vcpu_exit_pci;
386 
387 	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
388 	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
389 	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
390 	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
391 	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
392 	pci_init();
393 
394 	/* Initialize virtio devices */
395 	virtio_init(current_vm, child_cdrom, child_disks, child_taps);
396 
397 	/*
398 	 * Init QEMU fw_cfg interface. Must be done last for pci hardware
399 	 * detection.
400 	 */
401 	fw_cfg_init(vmc);
402 	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
403 	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
404 	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
405 	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
406 }
407 
408 /*
409  * restore_emulated_hw
410  *
411  * Restores the userspace hardware emulation from fd
412  */
413 void
414 restore_emulated_hw(struct vm_create_params *vcp, int fd,
415     int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
416 {
417 	/* struct vm_create_params *vcp = &vmc->vmc_params; */
418 	int i;
419 	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
420 
421 	/* Init i8253 PIT */
422 	i8253_restore(fd, vcp->vcp_id);
423 	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
424 	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
425 	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
426 	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
427 
428 	/* Init master and slave PICs */
429 	i8259_restore(fd);
430 	ioports_map[IO_ICU1] = vcpu_exit_i8259;
431 	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
432 	ioports_map[IO_ICU2] = vcpu_exit_i8259;
433 	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
434 
435 	/* Init ns8250 UART */
436 	ns8250_restore(fd, con_fd, vcp->vcp_id);
437 	for (i = COM1_DATA; i <= COM1_SCR; i++)
438 		ioports_map[i] = vcpu_exit_com;
439 
440 	/* Init mc146818 RTC */
441 	mc146818_restore(fd, vcp->vcp_id);
442 	ioports_map[IO_RTC] = vcpu_exit_mc146818;
443 	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
444 
445 	/* Init QEMU fw_cfg interface */
446 	fw_cfg_restore(fd);
447 	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
448 	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
449 	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
450 	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
451 
452 	/* Initialize PCI */
453 	for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++)
454 		ioports_map[i] = vcpu_exit_pci;
455 
456 	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
457 	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
458 	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
459 	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
460 	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
461 	pci_restore(fd);
462 	virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
463 }
464 
465 void
466 pause_vm_md(struct vmd_vm *vm)
467 {
468 	i8253_stop();
469 	mc146818_stop();
470 	ns8250_stop();
471 	virtio_stop(vm);
472 }
473 
474 void
475 unpause_vm_md(struct vmd_vm *vm)
476 {
477 	i8253_start();
478 	mc146818_start();
479 	ns8250_start();
480 	virtio_start(vm);
481 }
482 
483 int
484 dump_devs(int fd)
485 {
486 	int ret = 0;
487 
488 	if ((ret = i8253_dump(fd)))
489 		return ret;
490 	if ((ret = i8259_dump(fd)))
491 		return ret;
492 	if ((ret = ns8250_dump(fd)))
493 		return ret;
494 	if ((ret = mc146818_dump(fd)))
495 		return ret;
496 	ret = fw_cfg_dump(fd);
497 
498 	return ret;
499 }
500 
501 int
502 dump_send_header(int fd) {
503 	struct vm_dump_header	   vmh;
504 	int			   i;
505 
506 	memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE,
507 	    sizeof(vmh.vmh_signature));
508 
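	/*
	 * Record the host CPUID leaves (basic info, feature flags, structured
	 * extended features, XSAVE and extended features) so vmd_check_vmh()
	 * can verify compatibility when the dump is loaded.
	 */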
509 	vmh.vmh_cpuids[0].code = 0x00;
510 	vmh.vmh_cpuids[0].leaf = 0x00;
511 
512 	vmh.vmh_cpuids[1].code = 0x01;
513 	vmh.vmh_cpuids[1].leaf = 0x00;
514 
515 	vmh.vmh_cpuids[2].code = 0x07;
516 	vmh.vmh_cpuids[2].leaf = 0x00;
517 
518 	vmh.vmh_cpuids[3].code = 0x0d;
519 	vmh.vmh_cpuids[3].leaf = 0x00;
520 
521 	vmh.vmh_cpuids[4].code = 0x80000001;
522 	vmh.vmh_cpuids[4].leaf = 0x00;
523 
524 	vmh.vmh_version = VM_DUMP_VERSION;
525 
526 	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
527 		CPUID_LEAF(vmh.vmh_cpuids[i].code,
528 		    vmh.vmh_cpuids[i].leaf,
529 		    vmh.vmh_cpuids[i].a,
530 		    vmh.vmh_cpuids[i].b,
531 		    vmh.vmh_cpuids[i].c,
532 		    vmh.vmh_cpuids[i].d);
533 	}
534 
535 	if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
536 		return (-1);
537 
538 	return (0);
539 }
540 
541 
542 /*
543  * vcpu_exit_inout
544  *
545  * Handle all I/O exits that need to be emulated in vmd. This includes the
546  * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
547  *
548  * Parameters:
549  *  vrp: vcpu run parameters containing guest state for this exit
550  */
551 void
552 vcpu_exit_inout(struct vm_run_params *vrp)
553 {
554 	struct vm_exit *vei = vrp->vrp_exit;
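	/* 0xFF means no interrupt will be asserted after this exit. */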
555 	uint8_t intr = 0xFF;
556 
557 	if (vei->vei.vei_rep || vei->vei.vei_string) {
558 #ifdef MMIO_DEBUG
559 		log_info("%s: %s%s%s %d-byte, enc=%d, data=0x%08x, port=0x%04x",
560 		    __func__,
561 		    vei->vei.vei_rep == 0 ? "" : "REP ",
562 		    vei->vei.vei_dir == VEI_DIR_IN ? "IN" : "OUT",
563 		    vei->vei.vei_string == 0 ? "" : "S",
564 		    vei->vei.vei_size, vei->vei.vei_encoding,
565 		    vei->vei.vei_data, vei->vei.vei_port);
566 		log_info("%s: ECX = 0x%llx, RDX = 0x%llx, RSI = 0x%llx",
567 		    __func__,
568 		    vei->vrs.vrs_gprs[VCPU_REGS_RCX],
569 		    vei->vrs.vrs_gprs[VCPU_REGS_RDX],
570 		    vei->vrs.vrs_gprs[VCPU_REGS_RSI]);
571 #endif /* MMIO_DEBUG */
572 		fatalx("%s: can't emulate REP prefixed IN(S)/OUT(S)",
573 		    __func__);
574 	}
575 
576 	if (ioports_map[vei->vei.vei_port] != NULL)
577 		intr = ioports_map[vei->vei.vei_port](vrp);
578 	else if (vei->vei.vei_dir == VEI_DIR_IN)
579 		set_return_data(vei, 0xFFFFFFFF);
580 
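	/* Advance the guest's RIP past the emulated IN/OUT instruction. */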
581 	vei->vrs.vrs_gprs[VCPU_REGS_RIP] += vei->vei.vei_insn_len;
582 
583 	if (intr != 0xFF)
584 		vcpu_assert_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
585 }
586 
587 /*
588  * vcpu_exit
589  *
590  * Handle a vcpu exit. This function is called when it is determined that
591  * vmm(4) requires the assistance of vmd to support a particular guest
592  * exit type (eg, accessing an I/O port or device). Guest state is contained
593  * in 'vrp', and will be resent to vmm(4) on exit completion.
594  *
595  * Upon conclusion of handling the exit, the function determines if any
596  * interrupts should be injected into the guest, and asserts the proper
597  * IRQ line whose interrupt should be vectored.
598  *
599  * Parameters:
600  *  vrp: vcpu run parameters containing guest state for this exit
601  *
602  * Return values:
603  *  0: the exit was handled successfully
604  *  1: an error occurred (eg, unknown exit reason passed in 'vrp')
605  */
606 int
607 vcpu_exit(struct vm_run_params *vrp)
608 {
609 	int ret;
610 
611 	switch (vrp->vrp_exit_reason) {
612 	case VMX_EXIT_INT_WINDOW:
613 	case SVM_VMEXIT_VINTR:
614 	case VMX_EXIT_CPUID:
615 	case VMX_EXIT_EXTINT:
616 	case SVM_VMEXIT_INTR:
617 	case SVM_VMEXIT_MSR:
618 	case SVM_VMEXIT_CPUID:
619 		/*
620 		 * We may be exiting to vmd to handle a pending interrupt but
621 		 * at the same time the last exit type may have been one of
622 		 * these. In this case, there's nothing extra to be done
623 		 * here (and falling through to the default case below results
624 		 * in more vmd log spam).
625 		 */
626 		break;
627 	case SVM_VMEXIT_NPF:
628 	case VMX_EXIT_EPT_VIOLATION:
629 		ret = vcpu_exit_eptviolation(vrp);
630 		if (ret)
631 			return (ret);
632 		break;
633 	case VMX_EXIT_IO:
634 	case SVM_VMEXIT_IOIO:
635 		vcpu_exit_inout(vrp);
636 		break;
637 	case VMX_EXIT_HLT:
638 	case SVM_VMEXIT_HLT:
639 		vcpu_halt(vrp->vrp_vcpu_id);
640 		break;
641 	case VMX_EXIT_TRIPLE_FAULT:
642 	case SVM_VMEXIT_SHUTDOWN:
643 		/* reset VM */
644 		return (EAGAIN);
645 	default:
646 		log_debug("%s: unknown exit reason 0x%x",
647 		    __progname, vrp->vrp_exit_reason);
648 	}
649 
650 	return (0);
651 }
652 
653 /*
654  * vcpu_exit_eptviolation
655  *
656  * Handle an EPT violation (Intel VMX) or nested page fault (AMD SVM).
657  *
658  * Parameters:
659  *  vrp: vcpu run parameters containing guest state for this exit
660  *
661  * Return values:
662  *  0: no action required
663  *  EFAULT: a protection fault occurred, kill the VM.
664  */
665 static int
666 vcpu_exit_eptviolation(struct vm_run_params *vrp)
667 {
668 	struct vm_exit *ve = vrp->vrp_exit;
669 	int ret = 0;
670 #if MMIO_NOTYET
671 	struct x86_insn insn;
672 	uint64_t va, pa;
673 	size_t len = 15;		/* Max instruction length in x86. */
674 #endif /* MMIO_NOTYET */
675 	switch (ve->vee.vee_fault_type) {
676 	case VEE_FAULT_HANDLED:
677 		break;
678 
679 #if MMIO_NOTYET
680 	case VEE_FAULT_MMIO_ASSIST:
681 		/* Intel VMX might give us the length of the instruction. */
682 		if (ve->vee.vee_insn_info & VEE_LEN_VALID)
683 			len = ve->vee.vee_insn_len;
684 
685 		if (len > 15)
686 			fatalx("%s: invalid instruction length %lu", __func__,
687 			    len);
688 
689 		/* If we weren't given instruction bytes, we need to fetch. */
690 		if (!(ve->vee.vee_insn_info & VEE_BYTES_VALID)) {
691 			memset(ve->vee.vee_insn_bytes, 0,
692 			    sizeof(ve->vee.vee_insn_bytes));
693 			va = ve->vrs.vrs_gprs[VCPU_REGS_RIP];
694 
695 			/* XXX Only support instructions that fit on 1 page. */
696 			if ((va & PAGE_MASK) + len > PAGE_SIZE) {
697 				log_warnx("%s: instruction might cross page "
698 				    "boundary", __func__);
699 				ret = EINVAL;
700 				break;
701 			}
702 
703 			ret = translate_gva(ve, va, &pa, PROT_EXEC);
704 			if (ret != 0) {
705 				log_warnx("%s: failed gva translation",
706 				    __func__);
707 				break;
708 			}
709 
710 			ret = read_mem(pa, ve->vee.vee_insn_bytes, len);
711 			if (ret != 0) {
712 				log_warnx("%s: failed to fetch instruction "
713 				    "bytes from 0x%llx", __func__, pa);
714 				break;
715 			}
716 		}
717 
718 		ret = insn_decode(ve, &insn);
719 		if (ret == 0)
720 			ret = insn_emulate(ve, &insn);
721 		break;
722 #endif /* MMIO_NOTYET */
723 
724 	case VEE_FAULT_PROTECT:
725 		log_debug("%s: EPT Violation: rip=0x%llx", __progname,
726 		    ve->vrs.vrs_gprs[VCPU_REGS_RIP]);
727 		ret = EFAULT;
728 		break;
729 
730 	default:
731 		fatalx("%s: invalid fault_type %d", __progname,
732 		    ve->vee.vee_fault_type);
733 		/* UNREACHED */
734 	}
735 
736 	return (ret);
737 }
738 
739 /*
740  * vcpu_exit_pci
741  *
742  * Handle all I/O to the emulated PCI subsystem.
743  *
744  * Parameters:
745  *  vrp: vcpu run parameters containing guest state for this exit
746  *
747  * Return value:
748  *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
749  *      be injected.
750  */
751 uint8_t
752 vcpu_exit_pci(struct vm_run_params *vrp)
753 {
754 	struct vm_exit *vei = vrp->vrp_exit;
755 	uint8_t intr;
756 
757 	intr = 0xFF;
758 
759 	switch (vei->vei.vei_port) {
760 	case PCI_MODE1_ADDRESS_REG:
761 		pci_handle_address_reg(vrp);
762 		break;
763 	case PCI_MODE1_DATA_REG:
764 	case PCI_MODE1_DATA_REG + 1:
765 	case PCI_MODE1_DATA_REG + 2:
766 	case PCI_MODE1_DATA_REG + 3:
767 		pci_handle_data_reg(vrp);
768 		break;
769 	case VM_PCI_IO_BAR_BASE ... VM_PCI_IO_BAR_END:
770 		intr = pci_handle_io(vrp);
771 		break;
772 	default:
773 		log_warnx("%s: unknown PCI register 0x%llx",
774 		    __progname, (uint64_t)vei->vei.vei_port);
775 		break;
776 	}
777 
778 	return (intr);
779 }
780 
781 /*
782  * find_gpa_range
783  *
784  * Search for a contiguous guest physical mem range.
785  *
786  * Parameters:
787  *  vcp: VM create parameters that contain the memory map to search in
788  *  gpa: the starting guest physical address
789  *  len: the length of the memory range
790  *
791  * Return values:
792  *  NULL: on failure if there is no memory range as described by the parameters
793  *  Pointer to vm_mem_range that contains the start of the range otherwise.
794  */
795 static struct vm_mem_range *
796 find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
797 {
798 	size_t i, n;
799 	struct vm_mem_range *vmr;
800 
801 	/* Find the first vm_mem_range that contains gpa */
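	/* (Assumes the ranges are sorted by ascending guest physical address.) */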
802 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
803 		vmr = &vcp->vcp_memranges[i];
804 		if (gpa < vmr->vmr_gpa + vmr->vmr_size)
805 			break;
806 	}
807 
808 	/* No range found. */
809 	if (i == vcp->vcp_nmemranges)
810 		return (NULL);
811 
812 	/*
813 	 * vmr may cover the range [gpa, gpa + len) only partly. Make
814 	 * sure that the following vm_mem_ranges are contiguous and
815 	 * cover the rest.
816 	 */
817 	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
818 	if (len < n)
819 		len = 0;
820 	else
821 		len -= n;
822 	gpa = vmr->vmr_gpa + vmr->vmr_size;
823 	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
824 		vmr = &vcp->vcp_memranges[i];
825 		if (gpa != vmr->vmr_gpa)
826 			return (NULL);
827 		if (len <= vmr->vmr_size)
828 			len = 0;
829 		else
830 			len -= vmr->vmr_size;
831 
832 		gpa = vmr->vmr_gpa + vmr->vmr_size;
833 	}
834 
835 	if (len != 0)
836 		return (NULL);
837 
838 	return (vmr);
839 }
840 /*
841  * write_mem
842  *
843  * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
844  *
845  * Parameters:
846  *  dst: the destination paddr_t in the guest VM
847  *  buf: data to copy (or NULL to zero the data)
848  *  len: number of bytes to copy
849  *
850  * Return values:
851  *  0: success
852  *  EINVAL: if the guest physical memory range [dst, dst + len) does not
853  *      exist in the guest.
854  */
855 int
856 write_mem(paddr_t dst, const void *buf, size_t len)
857 {
858 	const char *from = buf;
859 	char *to;
860 	size_t n, off;
861 	struct vm_mem_range *vmr;
862 
863 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
864 	if (vmr == NULL) {
865 		errno = EINVAL;
866 		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
867 		    "len = 0x%zx", __func__, dst, len);
868 		return (EINVAL);
869 	}
870 
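	/* The destination may span several contiguous vm_mem_ranges. */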
871 	off = dst - vmr->vmr_gpa;
872 	while (len != 0) {
873 		n = vmr->vmr_size - off;
874 		if (len < n)
875 			n = len;
876 
877 		to = (char *)vmr->vmr_va + off;
878 		if (buf == NULL)
879 			memset(to, 0, n);
880 		else {
881 			memcpy(to, from, n);
882 			from += n;
883 		}
884 		len -= n;
885 		off = 0;
886 		vmr++;
887 	}
888 
889 	return (0);
890 }
891 
892 /*
893  * read_mem
894  *
895  * Reads memory at guest paddr 'src' into 'buf'.
896  *
897  * Parameters:
898  *  src: the source paddr_t in the guest VM to read from.
899  *  buf: destination (local) buffer
900  *  len: number of bytes to read
901  *
902  * Return values:
903  *  0: success
904  *  EINVAL: if the guest physical memory range [src, src + len) does not
905  *      exist in the guest.
906  */
907 int
908 read_mem(paddr_t src, void *buf, size_t len)
909 {
910 	char *from, *to = buf;
911 	size_t n, off;
912 	struct vm_mem_range *vmr;
913 
914 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
915 	if (vmr == NULL) {
916 		errno = EINVAL;
917 		log_warn("%s: failed - invalid memory range src = 0x%lx, "
918 		    "len = 0x%zx", __func__, src, len);
919 		return (EINVAL);
920 	}
921 
922 	off = src - vmr->vmr_gpa;
923 	while (len != 0) {
924 		n = vmr->vmr_size - off;
925 		if (len < n)
926 			n = len;
927 
928 		from = (char *)vmr->vmr_va + off;
929 		memcpy(to, from, n);
930 
931 		to += n;
932 		len -= n;
933 		off = 0;
934 		vmr++;
935 	}
936 
937 	return (0);
938 }
939 
940 /*
941  * hvaddr_mem
942  *
943  * Translate a guest physical address to a host virtual address, checking the
944  * provided memory range length to confirm it's contiguous within the same
945  * guest memory range (vm_mem_range).
946  *
947  * Parameters:
948  *  gpa: guest physical address to translate
949  *  len: number of bytes in the intended range
950  *
951  * Return values:
952  *  void* to host virtual memory on success
953  *  NULL on error, setting errno to:
954  *    EFAULT: gpa falls outside guest memory ranges
955  *    EINVAL: requested len extends beyond memory range
956  */
957 void *
958 hvaddr_mem(paddr_t gpa, size_t len)
959 {
960 	struct vm_mem_range *vmr;
961 	size_t off;
962 
963 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, gpa, len);
964 	if (vmr == NULL) {
965 		log_warnx("%s: failed - invalid gpa: 0x%lx", __func__, gpa);
966 		errno = EFAULT;
967 		return (NULL);
968 	}
969 
970 	off = gpa - vmr->vmr_gpa;
971 	if (len > (vmr->vmr_size - off)) {
972 		log_warnx("%s: failed - invalid memory range: gpa=0x%lx, "
973 		    "len=%zu", __func__, gpa, len);
974 		errno = EINVAL;
975 		return (NULL);
976 	}
977 
978 	return ((char *)vmr->vmr_va + off);
979 }
980 
981 /*
982  * vcpu_assert_irq
983  *
984  * Injects the specified IRQ on the supplied vcpu/vm
985  *
986  * Parameters:
987  *  vm_id: VM ID to inject to
988  *  vcpu_id: VCPU ID to inject to
989  *  irq: IRQ to inject
990  */
991 void
992 vcpu_assert_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
993 {
994 	i8259_assert_irq(irq);
995 
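	/*
	 * If the PIC now has an interrupt pending, raise the INTR line and
	 * wake the (possibly halted) vcpu.
	 */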
996 	if (i8259_is_pending()) {
997 		if (vcpu_intr(vm_id, vcpu_id, 1))
998 			fatalx("%s: can't assert INTR", __func__);
999 
1000 		vcpu_unhalt(vcpu_id);
1001 		vcpu_signal_run(vcpu_id);
1002 	}
1003 }
1004 
1005 /*
1006  * vcpu_deassert_irq
1007  *
1008  * Clears the specified IRQ on the supplied vcpu/vm
1009  *
1010  * Parameters:
1011  *  vm_id: VM ID to clear in
1012  *  vcpu_id: VCPU ID to clear in
1013  *  irq: IRQ to clear
1014  */
1015 void
1016 vcpu_deassert_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
1017 {
1018 	i8259_deassert_irq(irq);
1019 
1020 	if (!i8259_is_pending()) {
1021 		if (vcpu_intr(vm_id, vcpu_id, 0))
1022 			fatalx("%s: can't deassert INTR for vm_id %d, "
1023 			    "vcpu_id %d", __func__, vm_id, vcpu_id);
1024 	}
1025 }
1026 /*
1027  * set_return_data
1028  *
1029  * Utility function for manipulating register data in vm exit info structs. This
1030  * function ensures that the data is copied to the vei->vei.vei_data field with
1031  * the proper size for the operation being performed.
1032  *
1033  * Parameters:
1034  *  vei: exit information
1035  *  data: return data
1036  */
1037 void
1038 set_return_data(struct vm_exit *vei, uint32_t data)
1039 {
1040 	switch (vei->vei.vei_size) {
1041 	case 1:
1042 		vei->vei.vei_data &= ~0xFF;
1043 		vei->vei.vei_data |= (uint8_t)data;
1044 		break;
1045 	case 2:
1046 		vei->vei.vei_data &= ~0xFFFF;
1047 		vei->vei.vei_data |= (uint16_t)data;
1048 		break;
1049 	case 4:
1050 		vei->vei.vei_data = data;
1051 		break;
1052 	}
1053 }
1054 
1055 /*
1056  * get_input_data
1057  *
1058  * Utility function for manipulating register data in vm exit info
1059  * structs. This function ensures that the data is copied from the
1060  * vei->vei.vei_data field with the proper size for the operation being
1061  * performed.
1062  *
1063  * Parameters:
1064  *  vei: exit information
1065  *  data: location to store the result
1066  */
1067 void
1068 get_input_data(struct vm_exit *vei, uint32_t *data)
1069 {
1070 	switch (vei->vei.vei_size) {
1071 	case 1:
1072 		*data &= 0xFFFFFF00;
1073 		*data |= (uint8_t)vei->vei.vei_data;
1074 		break;
1075 	case 2:
1076 		*data &= 0xFFFF0000;
1077 		*data |= (uint16_t)vei->vei.vei_data;
1078 		break;
1079 	case 4:
1080 		*data = vei->vei.vei_data;
1081 		break;
1082 	default:
1083 		log_warnx("%s: invalid i/o size %d", __func__,
1084 		    vei->vei.vei_size);
1085 	}
1086 
1087 }
1088 
1089 /*
1090  * translate_gva
1091  *
1092  * Translates a guest virtual address to a guest physical address by walking
1093  * the currently active page table (if needed).
1094  *
1095  * XXX ensure translate_gva updates the A bit in the PTE
1096  * XXX ensure translate_gva respects segment base and limits in i386 mode
1097  * XXX ensure translate_gva respects segment wraparound in i8086 mode
1098  * XXX ensure translate_gva updates the A bit in the segment selector
1099  * XXX ensure translate_gva respects CR4.LMSLE if available
1100  *
1101  * Parameters:
1102  *  exit: The VCPU this translation should be performed for (guest MMU settings
1103  *   are gathered from this VCPU)
1104  *  va: virtual address to translate
1105  *  pa: pointer to paddr_t variable that will receive the translated physical
1106  *   address. 'pa' is unchanged on error.
1107  *  mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
1108  *   the address should be translated
1109  *
1110  * Return values:
1111  *  0: the address was successfully translated - 'pa' contains the physical
1112  *     address currently mapped by 'va'.
1113  *  EFAULT: the PTE for 'va' is not present. (A #PF should be injected and
1114  *     %cr2 set in that case, but that is not yet done here; see the XXX notes below.)
1115  *  EINVAL: an error occurred reading paging table structures
1116  */
1117 int
1118 translate_gva(struct vm_exit* exit, uint64_t va, uint64_t* pa, int mode)
1119 {
1120 	int level, shift, pdidx;
1121 	uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
1122 	uint64_t shift_width, pte_size;
1123 	struct vcpu_reg_state *vrs;
1124 
1125 	vrs = &exit->vrs;
1126 
1127 	if (!pa)
1128 		return (EINVAL);
1129 
1130 	if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
1131 		log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
1132 		*pa = va;
1133 		return (0);
1134 	}
1135 
1136 	pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];
1137 
1138 	log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
1139 	    vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);
1140 
1141 	if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
1142 		if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
1143 			pte_size = sizeof(uint64_t);
1144 			shift_width = 9;
1145 
1146 			if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
1147 				/* 4 level paging */
1148 				level = 4;
1149 				mask = L4_MASK;
1150 				shift = L4_SHIFT;
1151 			} else {
1152 				/* 32 bit with PAE paging */
1153 				level = 3;
1154 				mask = L3_MASK;
1155 				shift = L3_SHIFT;
1156 			}
1157 		} else {
1158 			/* 32 bit paging */
1159 			level = 2;
1160 			shift_width = 10;
1161 			mask = 0xFFC00000;
1162 			shift = 22;
1163 			pte_size = sizeof(uint32_t);
1164 		}
1165 	} else
1166 		return (EINVAL);
1167 
1168 	/* XXX: Check for R bit in segment selector and set A bit */
1169 
1170 	for (; level > 0; level--) {
1171 		pdidx = (va & mask) >> shift;
1172 		pte_paddr = (pt_paddr) + (pdidx * pte_size);
1173 
1174 		log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
1175 		    level, pte_paddr);
1176 		if (read_mem(pte_paddr, &pte, pte_size)) {
1177 			log_warn("%s: failed to read pte", __func__);
1178 			return (EFAULT);
1179 		}
1180 
1181 		log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
1182 		    pte);
1183 
1184 		/* XXX: Set CR2  */
1185 		if (!(pte & PG_V))
1186 			return (EFAULT);
1187 
1188 		/* XXX: Check for SMAP */
1189 		if ((mode == PROT_WRITE) && !(pte & PG_RW))
1190 			return (EPERM);
1191 
1192 		if ((exit->cpl > 0) && !(pte & PG_u))
1193 			return (EPERM);
1194 
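		/*
		 * Mark the PTE accessed (PG_U, the "used" bit, distinct from
		 * the PG_u user bit tested above) and, for writes, modified.
		 */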
1195 		pte = pte | PG_U;
1196 		if (mode == PROT_WRITE)
1197 			pte = pte | PG_M;
1198 		if (write_mem(pte_paddr, &pte, pte_size)) {
1199 			log_warn("%s: failed to write back flags to pte",
1200 			    __func__);
1201 			return (EIO);
1202 		}
1203 
1204 		/* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */
1205 		if (pte & PG_PS)
1206 			break;
1207 
1208 		if (level > 1) {
1209 			pt_paddr = pte & PG_FRAME;
1210 			shift -= shift_width;
1211 			mask = mask >> shift_width;
1212 		}
1213 	}
1214 
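	/* Combine the page frame from the PTE with the offset bits from va. */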
1215 	low_mask = (1 << shift) - 1;
1216 	high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
1217 	*pa = (pte & high_mask) | (va & low_mask);
1218 
1219 	log_debug("%s: final GPA for GVA 0x%llx = 0x%llx", __func__, va, *pa);
1220 
1221 	return (0);
1222 }
1223 
1224 int
1225 intr_pending(struct vmd_vm *vm)
1226 {
1227 	/* XXX select active interrupt controller */
1228 	return i8259_is_pending();
1229 }
1230 
1231 int
1232 intr_ack(struct vmd_vm *vm)
1233 {
1234 	/* XXX select active interrupt controller */
1235 	return i8259_ack();
1236 }
1237 
1238 void
1239 intr_toggle_el(struct vmd_vm *vm, int irq, int val)
1240 {
1241 	/* XXX select active interrupt controller */
1242 	pic_set_elcr(irq, val);
1243 }
1244 
1245 int
1246 vmd_check_vmh(struct vm_dump_header *vmh)
1247 {
1248 	int i;
1249 	unsigned int code, leaf;
1250 	unsigned int a, b, c, d;
1251 
1252 	if (strncmp(vmh->vmh_signature, VM_DUMP_SIGNATURE, strlen(VM_DUMP_SIGNATURE)) != 0) {
1253 		log_warnx("%s: incompatible dump signature", __func__);
1254 		return (-1);
1255 	}
1256 
1257 	if (vmh->vmh_version != VM_DUMP_VERSION) {
1258 		log_warnx("%s: incompatible dump version", __func__);
1259 		return (-1);
1260 	}
1261 
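	/*
	 * Compare the dumped CPUID leaves against the host: basic level and
	 * vendor (0x00), feature flags (0x01), structured extended features
	 * (0x07), XSAVE save areas (0x0d) and extended features (0x80000001).
	 */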
1262 	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
1263 		code = vmh->vmh_cpuids[i].code;
1264 		leaf = vmh->vmh_cpuids[i].leaf;
1265 		if (leaf != 0x00) {
1266 			log_debug("%s: invalid leaf 0x%x for code 0x%x",
1267 			    __func__, leaf, code);
1268 			return (-1);
1269 		}
1270 
1271 		switch (code) {
1272 		case 0x00:
1273 			CPUID_LEAF(code, leaf, a, b, c, d);
1274 			if (vmh->vmh_cpuids[i].a > a) {
1275 				log_debug("%s: incompatible cpuid level",
1276 				    __func__);
1277 				return (-1);
1278 			}
1279 			if (!(vmh->vmh_cpuids[i].b == b &&
1280 			    vmh->vmh_cpuids[i].c == c &&
1281 			    vmh->vmh_cpuids[i].d == d)) {
1282 				log_debug("%s: incompatible cpu brand",
1283 				    __func__);
1284 				return (-1);
1285 			}
1286 			break;
1287 
1288 		case 0x01:
1289 			CPUID_LEAF(code, leaf, a, b, c, d);
1290 			if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) !=
1291 			    (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) {
1292 				log_debug("%s: incompatible cpu features "
1293 				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
1294 				    code, leaf);
1295 				return (-1);
1296 			}
1297 			if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) !=
1298 			    (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) {
1299 				log_debug("%s: incompatible cpu features "
1300 				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
1301 				    code, leaf);
1302 				return (-1);
1303 			}
1304 			break;
1305 
1306 		case 0x07:
1307 			CPUID_LEAF(code, leaf, a, b, c, d);
1308 			if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) !=
1309 			    (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) {
1310 				log_debug("%s: incompatible cpu features "
1311 				    "code: 0x%x leaf: 0x%x  reg: b", __func__,
1312 				    code, leaf);
1313 				return (-1);
1314 			}
1315 			if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) !=
1316 			    (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) {
1317 				log_debug("%s: incompatible cpu features "
1318 				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
1319 				    code, leaf);
1320 				return (-1);
1321 			}
1322 			break;
1323 
1324 		case 0x0d:
1325 			CPUID_LEAF(code, leaf, a, b, c, d);
1326 			if (vmh->vmh_cpuids[i].b > b) {
1327 				log_debug("%s: incompatible cpu: insufficient "
1328 				    "max save area for enabled XCR0 features",
1329 				    __func__);
1330 				return (-1);
1331 			}
1332 			if (vmh->vmh_cpuids[i].c > c) {
1333 				log_debug("%s: incompatible cpu: insufficient "
1334 				    "max save area for supported XCR0 features",
1335 				    __func__);
1336 				return (-1);
1337 			}
1338 			break;
1339 
1340 		case 0x80000001:
1341 			CPUID_LEAF(code, leaf, a, b, c, d);
1342 			if ((vmh->vmh_cpuids[i].a & a) !=
1343 			    vmh->vmh_cpuids[i].a) {
1344 				log_debug("%s: incompatible cpu features "
1345 				    "code: 0x%x leaf: 0x%x  reg: a", __func__,
1346 				    code, leaf);
1347 				return (-1);
1348 			}
1349 			if ((vmh->vmh_cpuids[i].c & c) !=
1350 			    vmh->vmh_cpuids[i].c) {
1351 				log_debug("%s: incompatible cpu features "
1352 				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
1353 				    code, leaf);
1354 				return (-1);
1355 			}
1356 			if ((vmh->vmh_cpuids[i].d & d) !=
1357 			    vmh->vmh_cpuids[i].d) {
1358 				log_debug("%s: incompatible cpu features "
1359 				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
1360 				    code, leaf);
1361 				return (-1);
1362 			}
1363 			break;
1364 
1365 		default:
1366 			log_debug("%s: unknown code 0x%x", __func__, code);
1367 			return (-1);
1368 		}
1369 	}
1370 
1371 	return (0);
1372 }
1373