xref: /openbsd/usr.sbin/vmd/vm.c (revision 9ea232b5)
1 /*	$OpenBSD: vm.c,v 1.96 2024/01/18 14:49:59 claudio Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/param.h>	/* PAGE_SIZE, MAXCOMLEN */
20 #include <sys/types.h>
21 #include <sys/ioctl.h>
22 #include <sys/queue.h>
23 #include <sys/wait.h>
24 #include <sys/uio.h>
25 #include <sys/stat.h>
26 #include <sys/socket.h>
27 #include <sys/time.h>
28 #include <sys/mman.h>
29 #include <sys/resource.h>
30 
31 #include <dev/ic/i8253reg.h>
32 #include <dev/isa/isareg.h>
33 #include <dev/pci/pcireg.h>
34 
35 #include <machine/psl.h>
36 #include <machine/pte.h>
37 #include <machine/specialreg.h>
38 #include <machine/vmmvar.h>
39 
40 #include <net/if.h>
41 
42 #include <errno.h>
43 #include <event.h>
44 #include <fcntl.h>
45 #include <imsg.h>
46 #include <limits.h>
47 #include <poll.h>
48 #include <pthread.h>
49 #include <pthread_np.h>
50 #include <stddef.h>
51 #include <stdio.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <unistd.h>
55 #include <util.h>
56 
57 #include "atomicio.h"
58 #include "fw_cfg.h"
59 #include "i8253.h"
60 #include "i8259.h"
61 #include "loadfile.h"
62 #include "mc146818.h"
63 #include "mmio.h"
64 #include "ns8250.h"
65 #include "pci.h"
66 #include "virtio.h"
67 #include "vmd.h"
68 #include "vmm.h"
69 
70 #define MB(x)	((x) * 1024UL * 1024UL)
71 #define GB(x)	((x) * 1024UL * 1024UL * 1024UL)
72 
73 #define MMIO_NOTYET 0
74 
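/*
 * I/O port dispatch table: indexed by guest port number, each entry is the
 * vcpu exit handler for that port. It is populated by init_emulated_hw()
 * and restore_emulated_hw(); a NULL entry means the port is unhandled.
 */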
75 io_fn_t ioports_map[MAX_PORTS];
76 
77 static int run_vm(struct vmop_create_params *, struct vcpu_reg_state *);
78 void vm_dispatch_vmm(int, short, void *);
79 void *event_thread(void *);
80 void *vcpu_run_loop(void *);
81 int vcpu_exit(struct vm_run_params *);
82 int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
83 void create_memory_map(struct vm_create_params *);
84 static int vmm_create_vm(struct vmd_vm *);
85 int alloc_guest_mem(struct vmd_vm *);
86 void init_emulated_hw(struct vmop_create_params *, int,
87     int[][VM_MAX_BASE_PER_DISK], int *);
88 void restore_emulated_hw(struct vm_create_params *, int, int *,
89     int[][VM_MAX_BASE_PER_DISK], int);
90 void vcpu_exit_inout(struct vm_run_params *);
91 int vcpu_exit_eptviolation(struct vm_run_params *);
92 uint8_t vcpu_exit_pci(struct vm_run_params *);
93 int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
94 int loadfile_bios(gzFile, off_t, struct vcpu_reg_state *);
95 static int send_vm(int, struct vmd_vm *);
96 int dump_send_header(int);
97 static int dump_vmr(int, struct vm_mem_range *);
98 static int dump_mem(int, struct vmd_vm *);
99 void restore_vmr(int, struct vm_mem_range *);
100 void restore_mem(int, struct vm_create_params *);
101 int restore_vm_params(int, struct vm_create_params *);
102 static void pause_vm(struct vmd_vm *);
103 static void unpause_vm(struct vmd_vm *);
104 
105 int translate_gva(struct vm_exit *, uint64_t, uint64_t *, int);
106 
107 static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
108     size_t);
109 
110 int con_fd;
111 struct vmd_vm *current_vm;
112 
113 extern struct vmd *env;
114 
115 extern char *__progname;
116 
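/*
 * Thread-coordination state shared between the main run_vm() loop, the
 * event thread and the per-vcpu run threads.
 */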
117 pthread_mutex_t threadmutex;
118 pthread_cond_t threadcond;
119 
120 pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
121 pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
122 pthread_barrier_t vm_pause_barrier;
123 pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM];
124 pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM];
125 uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
126 uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];
127 
128 /*
129  * Represents a standard register set for an OS to be booted
130  * as a flat 64-bit address space.
131  *
132  * NOT set here are:
133  *  RIP
134  *  RSP
135  *  GDTR BASE
136  *
137  * Specific bootloaders should clone this structure and override
138  * those fields as needed.
139  *
140  * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
141  *        features of the CPU in use.
142  */
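/*
 * Note: each vrs_sregs, vrs_gdtr and vrs_idtr initializer below is of the
 * form { selector, limit, access rights, base }, following the field order
 * assumed for struct vcpu_segment_info in vmmvar.h.
 */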
143 static const struct vcpu_reg_state vcpu_init_flat64 = {
144 	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
145 	.vrs_gprs[VCPU_REGS_RIP] = 0x0,
146 	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
147 	.vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG,
148 	.vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
149 	.vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
150 	.vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
151 	.vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
152 	.vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
153 	.vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
154 	.vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
155 	.vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
156 	.vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
157 	.vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
158 	.vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
159 	.vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
160 	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
161 	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
162 	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
163 	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
164 	.vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
165 	.vrs_drs[VCPU_REGS_DR0] = 0x0,
166 	.vrs_drs[VCPU_REGS_DR1] = 0x0,
167 	.vrs_drs[VCPU_REGS_DR2] = 0x0,
168 	.vrs_drs[VCPU_REGS_DR3] = 0x0,
169 	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
170 	.vrs_drs[VCPU_REGS_DR7] = 0x400,
171 	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
172 	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
173 	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
174 	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
175 	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
176 	.vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
177 	.vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87
178 };
179 
180 /*
181  * Represents a standard register set for a BIOS to be booted
182  * as a flat 16-bit address space.
183  */
184 static const struct vcpu_reg_state vcpu_init_flat16 = {
185 	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
186 	.vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
187 	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
188 	.vrs_crs[VCPU_REGS_CR0] = 0x60000010,
189 	.vrs_crs[VCPU_REGS_CR3] = 0,
190 	.vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
191 	.vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
192 	.vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
193 	.vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
194 	.vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
195 	.vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
196 	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
197 	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
198 	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
199 	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
200 	.vrs_msrs[VCPU_REGS_EFER] = 0ULL,
201 	.vrs_drs[VCPU_REGS_DR0] = 0x0,
202 	.vrs_drs[VCPU_REGS_DR1] = 0x0,
203 	.vrs_drs[VCPU_REGS_DR2] = 0x0,
204 	.vrs_drs[VCPU_REGS_DR3] = 0x0,
205 	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
206 	.vrs_drs[VCPU_REGS_DR7] = 0x400,
207 	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
208 	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
209 	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
210 	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
211 	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
212 	.vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87
213 };
214 
215 /*
216  * vm_main
217  *
218  * Primary entrypoint for launching a vm. Does not return.
219  *
220  * fd: file descriptor for communicating with vmm process.
221  * fd_vmm: file descriptor for communicating with vmm(4) device
222  */
223 void
224 vm_main(int fd, int vmm_fd)
225 {
226 	struct vm_create_params	*vcp = NULL;
227 	struct vmd_vm		 vm;
228 	size_t			 sz = 0;
229 	int			 ret = 0;
230 
231 	/*
232 	 * We aren't root, so we can't chroot(2). Use unveil(2) instead.
233 	 */
234 	if (unveil(env->argv0, "x") == -1)
235 		fatal("unveil %s", env->argv0);
236 	if (unveil(NULL, NULL) == -1)
237 		fatal("unveil lock");
238 
239 	/*
240 	 * pledge in the vm processes:
241 	 * stdio - for malloc and basic I/O including events.
242 	 * vmm - for the vmm ioctls and operations.
243 	 * proc exec - fork/exec for launching devices.
244 	 * recvfd - for vm send/recv and sending fd to devices.
245 	 */
246 	if (pledge("stdio vmm proc exec recvfd", NULL) == -1)
247 		fatal("pledge");
248 
249 	/* Receive our vm configuration. */
250 	memset(&vm, 0, sizeof(vm));
251 	sz = atomicio(read, fd, &vm, sizeof(vm));
252 	if (sz != sizeof(vm)) {
253 		log_warnx("failed to receive start message");
254 		_exit(EIO);
255 	}
256 
257 	/* Update process with the vm name. */
258 	vcp = &vm.vm_params.vmc_params;
259 	setproctitle("%s", vcp->vcp_name);
260 	log_procinit("vm/%s", vcp->vcp_name);
261 
262 	/* Receive the local prefix settings. */
263 	sz = atomicio(read, fd, &env->vmd_cfg.cfg_localprefix,
264 	    sizeof(env->vmd_cfg.cfg_localprefix));
265 	if (sz != sizeof(env->vmd_cfg.cfg_localprefix)) {
266 		log_warnx("failed to receive local prefix");
267 		_exit(EIO);
268 	}
269 
270 	/*
271 	 * We need, at minimum, a vm_kernel fd to boot a vm. This is either a
272 	 * kernel or a BIOS image.
273 	 */
274 	if (!(vm.vm_state & VM_STATE_RECEIVED)) {
275 		if (vm.vm_kernel == -1) {
276 			log_warnx("%s: failed to receive boot fd",
277 			    vcp->vcp_name);
278 			_exit(EINVAL);
279 		}
280 		if (fcntl(vm.vm_kernel, F_SETFL, O_NONBLOCK) == -1) {
281 			ret = errno;
282 			log_warn("failed to set nonblocking mode on boot fd");
283 			_exit(ret);
284 		}
285 	}
286 
287 	ret = start_vm(&vm, fd);
288 	_exit(ret);
289 }
290 
291 /*
292  * loadfile_bios
293  *
294  * As an alternative to loadfile_elf, this function loads a non-ELF BIOS image
295  * directly into memory.
296  *
297  * Parameters:
298  *  fp: gzFile handle of the BIOS image to load
299  *  size: uncompressed size of the image
300  *  (out) vrs: register state to set on init for this kernel
301  *
302  * Return values:
303  *  0 if successful
304  *  -1 on failure; errno is set to EIO on a short read
305  */
306 int
307 loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs)
308 {
309 	off_t	 off;
310 
311 	/* Set up a "flat 16 bit" register state for BIOS */
312 	memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));
313 
314 	/* Seek to the beginning of the BIOS image */
315 	if (gzseek(fp, 0, SEEK_SET) == -1)
316 		return (-1);
317 
318 	/* The BIOS image must end at 1MB */
319 	if ((off = MB(1) - size) < 0)
320 		return (-1);
321 
322 	/* Read BIOS image into memory */
323 	if (mread(fp, off, size) != (size_t)size) {
324 		errno = EIO;
325 		return (-1);
326 	}
327 
328 	if (gzseek(fp, 0, SEEK_SET) == -1)
329 		return (-1);
330 
331 	/* Read a second BIOS copy into memory ending at 4GB */
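	/*
	 * The x86 reset vector sits just below 4GB (0xFFFFFFF0), so the guest
	 * fetches its first instructions from this high copy; the copy below
	 * 1MB serves legacy BIOS callers.
	 */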
332 	off = GB(4) - size;
333 	if (mread(fp, off, size) != (size_t)size) {
334 		errno = EIO;
335 		return (-1);
336 	}
337 
338 	log_debug("%s: loaded BIOS image", __func__);
339 
340 	return (0);
341 }
342 
343 /*
344  * start_vm
345  *
346  * After forking a new VM process, starts the new VM with the creation
347  * parameters supplied (in the incoming vm->vm_params field). This
348  * function performs a basic sanity check on the incoming parameters
349  * and then performs the following steps to complete the creation of the VM:
350  *
351  * 1. validates and creates the new VM
352  * 2. opens the imsg control channel to the parent and drops more privilege
353  * 3. drops additional privileges by calling pledge(2)
354  * 4. loads the kernel from the disk image or file descriptor
355  * 5. runs the VM's VCPU loops.
356  *
357  * Parameters:
358  *  vm: the VM data structure, including the VM create parameters.
359  *  fd: the imsg socket connected to the parent process.
360  *
361  * Return values:
362  *  0: success
363  *  !0 : failure - typically an errno indicating the source of the failure
364  */
365 int
366 start_vm(struct vmd_vm *vm, int fd)
367 {
368 	struct vmop_create_params *vmc = &vm->vm_params;
369 	struct vm_create_params	*vcp = &vmc->vmc_params;
370 	struct vcpu_reg_state	 vrs;
371 	int			 nicfds[VM_MAX_NICS_PER_VM];
372 	int			 ret;
373 	gzFile			 fp;
374 	size_t			 i;
375 	struct vm_rwregs_params  vrp;
376 	struct stat		 sb;
377 
378 	/*
379 	 * We first try to initialize and allocate memory before bothering
380 	 * vmm(4) with a request to create a new vm.
381 	 */
382 	if (!(vm->vm_state & VM_STATE_RECEIVED))
383 		create_memory_map(vcp);
384 
385 	ret = alloc_guest_mem(vm);
386 	if (ret) {
387 		struct rlimit lim;
388 		char buf[FMT_SCALED_STRSIZE];
389 		if (ret == ENOMEM && getrlimit(RLIMIT_DATA, &lim) == 0) {
390 			if (fmt_scaled(lim.rlim_cur, buf) == 0)
391 				fatalx("could not allocate guest memory (data "
392 				    "limit is %s)", buf);
393 		}
394 		errno = ret;
395 		log_warn("could not allocate guest memory");
396 		return (ret);
397 	}
398 
399 	/* We've allocated guest memory, so now create the vm in vmm(4). */
400 	ret = vmm_create_vm(vm);
401 	if (ret) {
402 		/* Let the vmm process know we failed by sending a 0 vm id. */
403 		vcp->vcp_id = 0;
404 		atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id));
405 		return (ret);
406 	}
407 
408 	/*
409 	 * Parts of vmd currently rely on global state (current_vm, con_fd).
410 	 */
411 	current_vm = vm;
412 	con_fd = vm->vm_tty;
413 	if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1) {
414 		log_warn("failed to set nonblocking mode on console");
415 		return (1);
416 	}
417 
418 	/*
419 	 * We now let the vmm process know we were successful by sending it our
420 	 * vmm(4) assigned vm id.
421 	 */
422 	if (atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
423 	    sizeof(vcp->vcp_id)) {
424 		log_warn("failed to send created vm id to vmm process");
425 		return (1);
426 	}
427 
428 	/* Either prepare our boot image or receive an existing vm to launch. */
429 	if (vm->vm_state & VM_STATE_RECEIVED) {
430 		ret = atomicio(read, vm->vm_receive_fd, &vrp, sizeof(vrp));
431 		if (ret != sizeof(vrp))
432 			fatal("received incomplete vrp - exiting");
433 		vrs = vrp.vrwp_regs;
434 	} else {
435 		/*
436 		 * Set up default "flat 64 bit" register state - RIP,
437 		 * RSP, and GDT info will be set in bootloader
438 		 */
439 		memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs));
440 
441 		/* Find and open kernel image */
442 		if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL)
443 			fatalx("failed to open kernel - exiting");
444 
445 		/* Load kernel image */
446 		ret = loadfile_elf(fp, vm, &vrs, vmc->vmc_bootdevice);
447 
448 		/*
449 		 * Try BIOS as a fallback (only if it was provided as an image
450 		 * with vm->vm_kernel and the file is not compressed)
451 		 */
452 		if (ret && errno == ENOEXEC && vm->vm_kernel != -1 &&
453 		    gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0)
454 			ret = loadfile_bios(fp, sb.st_size, &vrs);
455 
456 		if (ret)
457 			fatal("failed to load kernel or BIOS - exiting");
458 
459 		gzclose(fp);
460 	}
461 
462 	if (vm->vm_kernel != -1)
463 		close_fd(vm->vm_kernel);
464 
465 	/* Initialize our mutexes. */
466 	ret = pthread_mutex_init(&threadmutex, NULL);
467 	if (ret) {
468 		log_warn("%s: could not initialize thread state mutex",
469 		    __func__);
470 		return (ret);
471 	}
472 	ret = pthread_cond_init(&threadcond, NULL);
473 	if (ret) {
474 		log_warn("%s: could not initialize thread state "
475 		    "condition variable", __func__);
476 		return (ret);
477 	}
478 	mutex_lock(&threadmutex);
479 
480 
481 	/*
482 	 * Finalize our communication socket with the vmm process. From here
483 	 * onwards, communication with the vmm process is event-based.
484 	 */
485 	event_init();
486 	if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
487 		fatal("setup vm pipe");
488 
489 	/*
490 	 * Initialize or restore our emulated hardware.
491 	 */
492 	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
493 		nicfds[i] = vm->vm_ifs[i].vif_fd;
494 
495 	if (vm->vm_state & VM_STATE_RECEIVED) {
496 		restore_mem(vm->vm_receive_fd, vcp);
497 		restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
498 		    vm->vm_disks, vm->vm_cdrom);
499 		if (restore_vm_params(vm->vm_receive_fd, vcp))
500 			fatal("restore vm params failed");
501 		unpause_vm(vm);
502 	} else
503 		init_emulated_hw(vmc, vm->vm_cdrom, vm->vm_disks, nicfds);
504 
505 	/* Drop privileges further before starting the vcpu run loop(s). */
506 	if (pledge("stdio vmm recvfd", NULL) == -1)
507 		fatal("pledge");
508 
509 	/*
510 	 * Execute the vcpu run loop(s) for this VM.
511 	 */
512 	ret = run_vm(&vm->vm_params, &vrs);
513 
514 	/* Ensure that any in-flight data is written back */
515 	virtio_shutdown(vm);
516 
517 	return (ret);
518 }
519 
520 /*
521  * vm_dispatch_vmm
522  *
523  * imsg callback for messages that are received from the vmm parent process.
524  */
525 void
526 vm_dispatch_vmm(int fd, short event, void *arg)
527 {
528 	struct vmd_vm		*vm = arg;
529 	struct vmop_result	 vmr;
530 	struct vmop_addr_result	 var;
531 	struct imsgev		*iev = &vm->vm_iev;
532 	struct imsgbuf		*ibuf = &iev->ibuf;
533 	struct imsg		 imsg;
534 	ssize_t			 n;
535 	int			 verbose;
536 
537 	if (event & EV_READ) {
538 		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
539 			fatal("%s: imsg_read", __func__);
540 		if (n == 0)
541 			_exit(0);
542 	}
543 
544 	if (event & EV_WRITE) {
545 		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
546 			fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
547 		if (n == 0)
548 			_exit(0);
549 	}
550 
551 	for (;;) {
552 		if ((n = imsg_get(ibuf, &imsg)) == -1)
553 			fatal("%s: imsg_get", __func__);
554 		if (n == 0)
555 			break;
556 
557 #if DEBUG > 1
558 		log_debug("%s: got imsg %d from %s",
559 		    __func__, imsg.hdr.type,
560 		    vm->vm_params.vmc_params.vcp_name);
561 #endif
562 
563 		switch (imsg.hdr.type) {
564 		case IMSG_CTL_VERBOSE:
565 			IMSG_SIZE_CHECK(&imsg, &verbose);
566 			memcpy(&verbose, imsg.data, sizeof(verbose));
567 			log_setverbose(verbose);
568 			virtio_broadcast_imsg(vm, IMSG_CTL_VERBOSE, &verbose,
569 			    sizeof(verbose));
570 			break;
571 		case IMSG_VMDOP_VM_SHUTDOWN:
572 			if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
573 				_exit(0);
574 			break;
575 		case IMSG_VMDOP_VM_REBOOT:
576 			if (vmmci_ctl(VMMCI_REBOOT) == -1)
577 				_exit(0);
578 			break;
579 		case IMSG_VMDOP_PAUSE_VM:
580 			vmr.vmr_result = 0;
581 			vmr.vmr_id = vm->vm_vmid;
582 			pause_vm(vm);
583 			imsg_compose_event(&vm->vm_iev,
584 			    IMSG_VMDOP_PAUSE_VM_RESPONSE,
585 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
586 			    sizeof(vmr));
587 			break;
588 		case IMSG_VMDOP_UNPAUSE_VM:
589 			vmr.vmr_result = 0;
590 			vmr.vmr_id = vm->vm_vmid;
591 			unpause_vm(vm);
592 			imsg_compose_event(&vm->vm_iev,
593 			    IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
594 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
595 			    sizeof(vmr));
596 			break;
597 		case IMSG_VMDOP_SEND_VM_REQUEST:
598 			vmr.vmr_id = vm->vm_vmid;
599 			vmr.vmr_result = send_vm(imsg_get_fd(&imsg), vm);
600 			imsg_compose_event(&vm->vm_iev,
601 			    IMSG_VMDOP_SEND_VM_RESPONSE,
602 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
603 			    sizeof(vmr));
604 			if (!vmr.vmr_result) {
605 				imsg_flush(&current_vm->vm_iev.ibuf);
606 				_exit(0);
607 			}
608 			break;
609 		case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
610 			IMSG_SIZE_CHECK(&imsg, &var);
611 			memcpy(&var, imsg.data, sizeof(var));
612 
613 			log_debug("%s: received tap addr %s for nic %d",
614 			    vm->vm_params.vmc_params.vcp_name,
615 			    ether_ntoa((void *)var.var_addr), var.var_nic_idx);
616 
617 			vionet_set_hostmac(vm, var.var_nic_idx, var.var_addr);
618 			break;
619 		default:
620 			fatalx("%s: got invalid imsg %d from %s",
621 			    __func__, imsg.hdr.type,
622 			    vm->vm_params.vmc_params.vcp_name);
623 		}
624 		imsg_free(&imsg);
625 	}
626 	imsg_event_add(iev);
627 }
628 
629 /*
630  * vm_shutdown
631  *
632  * Tell the vmm parent process to shut down or reboot the VM, then exit.
633  */
634 __dead void
635 vm_shutdown(unsigned int cmd)
636 {
637 	switch (cmd) {
638 	case VMMCI_NONE:
639 	case VMMCI_SHUTDOWN:
640 		(void)imsg_compose_event(&current_vm->vm_iev,
641 		    IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0);
642 		break;
643 	case VMMCI_REBOOT:
644 		(void)imsg_compose_event(&current_vm->vm_iev,
645 		    IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0);
646 		break;
647 	default:
648 		fatalx("invalid vm ctl command: %d", cmd);
649 	}
650 	imsg_flush(&current_vm->vm_iev.ibuf);
651 
652 	_exit(0);
653 }
654 
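/*
 * send_vm
 *
 * Serializes a paused VM to fd for sending/migration. The stream written
 * is, in order: the dump header, the vmop_create_params, per-vcpu register
 * state, guest memory, device state (i8253, i8259, ns8250, mc146818,
 * fw_cfg, pci, virtio) and per-vcpu vm params. On success the local VM is
 * terminated; on failure it is unpaused and an error is returned.
 */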
655 int
656 send_vm(int fd, struct vmd_vm *vm)
657 {
658 	struct vm_rwregs_params	   vrp;
659 	struct vm_rwvmparams_params vpp;
660 	struct vmop_create_params *vmc;
661 	struct vm_terminate_params vtp;
662 	unsigned int		   flags = 0;
663 	unsigned int		   i;
664 	int			   ret = 0;
665 	size_t			   sz;
666 
667 	if (dump_send_header(fd)) {
668 		log_warnx("%s: failed to send vm dump header", __func__);
669 		goto err;
670 	}
671 
672 	pause_vm(vm);
673 
674 	vmc = calloc(1, sizeof(struct vmop_create_params));
675 	if (vmc == NULL) {
676 		log_warn("%s: calloc error getting vmc", __func__);
677 		ret = -1;
678 		goto err;
679 	}
680 
681 	flags |= VMOP_CREATE_MEMORY;
682 	memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct
683 	    vmop_create_params));
684 	vmc->vmc_flags = flags;
685 	vrp.vrwp_vm_id = vm->vm_params.vmc_params.vcp_id;
686 	vrp.vrwp_mask = VM_RWREGS_ALL;
687 	vpp.vpp_mask = VM_RWVMPARAMS_ALL;
688 	vpp.vpp_vm_id = vm->vm_params.vmc_params.vcp_id;
689 
690 	sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params));
691 	if (sz != sizeof(struct vmop_create_params)) {
692 		ret = -1;
693 		goto err;
694 	}
695 
696 	for (i = 0; i < vm->vm_params.vmc_params.vcp_ncpus; i++) {
697 		vrp.vrwp_vcpu_id = i;
698 		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
699 			log_warn("%s: readregs failed", __func__);
700 			goto err;
701 		}
702 
703 		sz = atomicio(vwrite, fd, &vrp,
704 		    sizeof(struct vm_rwregs_params));
705 		if (sz != sizeof(struct vm_rwregs_params)) {
706 			log_warn("%s: dumping registers failed", __func__);
707 			ret = -1;
708 			goto err;
709 		}
710 	}
711 
712 	/* Dump memory before devices to aid in restoration. */
713 	if ((ret = dump_mem(fd, vm)))
714 		goto err;
715 	if ((ret = i8253_dump(fd)))
716 		goto err;
717 	if ((ret = i8259_dump(fd)))
718 		goto err;
719 	if ((ret = ns8250_dump(fd)))
720 		goto err;
721 	if ((ret = mc146818_dump(fd)))
722 		goto err;
723 	if ((ret = fw_cfg_dump(fd)))
724 		goto err;
725 	if ((ret = pci_dump(fd)))
726 		goto err;
727 	if ((ret = virtio_dump(fd)))
728 		goto err;
729 
730 	for (i = 0; i < vm->vm_params.vmc_params.vcp_ncpus; i++) {
731 		vpp.vpp_vcpu_id = i;
732 		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) {
733 			log_warn("%s: readvmparams failed", __func__);
734 			goto err;
735 		}
736 
737 		sz = atomicio(vwrite, fd, &vpp,
738 		    sizeof(struct vm_rwvmparams_params));
739 		if (sz != sizeof(struct vm_rwvmparams_params)) {
740 			log_warn("%s: dumping vm params failed", __func__);
741 			ret = -1;
742 			goto err;
743 		}
744 	}
745 
746 	vtp.vtp_vm_id = vm->vm_params.vmc_params.vcp_id;
747 	if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) {
748 		log_warnx("%s: term IOC error: %d, %d", __func__,
749 		    errno, ENOENT);
750 	}
751 err:
752 	close(fd);
753 	if (ret)
754 		unpause_vm(vm);
755 	return ret;
756 }
757 
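/*
 * dump_send_header
 *
 * Writes the vm dump header: the dump signature, a snapshot of selected
 * host CPUID leaves and the dump version, so the receiving side can check
 * compatibility before restoring.
 */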
758 int
759 dump_send_header(int fd) {
760 	struct vm_dump_header	   vmh;
761 	int			   i;
762 
763 	memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE,
764 	    sizeof(vmh.vmh_signature));
765 
766 	vmh.vmh_cpuids[0].code = 0x00;
767 	vmh.vmh_cpuids[0].leaf = 0x00;
768 
769 	vmh.vmh_cpuids[1].code = 0x01;
770 	vmh.vmh_cpuids[1].leaf = 0x00;
771 
772 	vmh.vmh_cpuids[2].code = 0x07;
773 	vmh.vmh_cpuids[2].leaf = 0x00;
774 
775 	vmh.vmh_cpuids[3].code = 0x0d;
776 	vmh.vmh_cpuids[3].leaf = 0x00;
777 
778 	vmh.vmh_cpuids[4].code = 0x80000001;
779 	vmh.vmh_cpuids[4].leaf = 0x00;
780 
781 	vmh.vmh_version = VM_DUMP_VERSION;
782 
783 	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
784 		CPUID_LEAF(vmh.vmh_cpuids[i].code,
785 		    vmh.vmh_cpuids[i].leaf,
786 		    vmh.vmh_cpuids[i].a,
787 		    vmh.vmh_cpuids[i].b,
788 		    vmh.vmh_cpuids[i].c,
789 		    vmh.vmh_cpuids[i].d);
790 	}
791 
792 	if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
793 		return (-1);
794 
795 	return (0);
796 }
797 
798 int
799 dump_mem(int fd, struct vmd_vm *vm)
800 {
801 	unsigned int	i;
802 	int		ret;
803 	struct		vm_mem_range *vmr;
804 
805 	for (i = 0; i < vm->vm_params.vmc_params.vcp_nmemranges; i++) {
806 		vmr = &vm->vm_params.vmc_params.vcp_memranges[i];
807 		ret = dump_vmr(fd, vmr);
808 		if (ret)
809 			return ret;
810 	}
811 	return (0);
812 }
813 
814 int
815 restore_vm_params(int fd, struct vm_create_params *vcp) {
816 	unsigned int			i;
817 	struct vm_rwvmparams_params    vpp;
818 
819 	for (i = 0; i < vcp->vcp_ncpus; i++) {
820 		if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) {
821 			log_warn("%s: error restoring vm params", __func__);
822 			return (-1);
823 		}
824 		vpp.vpp_vm_id = vcp->vcp_id;
825 		vpp.vpp_vcpu_id = i;
826 		if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) {
827 			log_debug("%s: writing vm params failed", __func__);
828 			return (-1);
829 		}
830 	}
831 	return (0);
832 }
833 
834 void
835 restore_mem(int fd, struct vm_create_params *vcp)
836 {
837 	unsigned int	     i;
838 	struct vm_mem_range *vmr;
839 
840 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
841 		vmr = &vcp->vcp_memranges[i];
842 		restore_vmr(fd, vmr);
843 	}
844 }
845 
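/*
 * dump_vmr
 *
 * Writes the guest memory backing vmr to fd one page at a time. Assumes
 * vmr_size is a multiple of PAGE_SIZE.
 */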
846 int
847 dump_vmr(int fd, struct vm_mem_range *vmr)
848 {
849 	size_t	rem = vmr->vmr_size, read = 0;
850 	char	buf[PAGE_SIZE];
851 
852 	while (rem > 0) {
853 		if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) {
854 			log_warn("failed to read vmr");
855 			return (-1);
856 		}
857 		if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) {
858 			log_warn("failed to dump vmr");
859 			return (-1);
860 		}
861 		rem = rem - PAGE_SIZE;
862 		read = read + PAGE_SIZE;
863 	}
864 	return (0);
865 }
866 
867 void
868 restore_vmr(int fd, struct vm_mem_range *vmr)
869 {
870 	size_t	rem = vmr->vmr_size, wrote = 0;
871 	char	buf[PAGE_SIZE];
872 
873 	while (rem > 0) {
874 		if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf))
875 			fatal("failed to restore vmr");
876 		if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE))
877 			fatal("failed to write vmr");
878 		rem = rem - PAGE_SIZE;
879 		wrote = wrote + PAGE_SIZE;
880 	}
881 }
882 
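/*
 * pause_vm
 *
 * Marks the VM paused, wakes any halted vcpu threads and waits on a
 * barrier (sized vcp_ncpus + 1) until every vcpu thread has parked itself,
 * then stops the emulated timers and devices.
 */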
883 static void
884 pause_vm(struct vmd_vm *vm)
885 {
886 	unsigned int n;
887 	int ret;
888 	if (vm->vm_state & VM_STATE_PAUSED)
889 		return;
890 
891 	current_vm->vm_state |= VM_STATE_PAUSED;
892 
893 	ret = pthread_barrier_init(&vm_pause_barrier, NULL,
894 	    vm->vm_params.vmc_params.vcp_ncpus + 1);
895 	if (ret) {
896 		log_warnx("%s: cannot initialize pause barrier (%d)",
897 		    __progname, ret);
898 		return;
899 	}
900 
901 	for (n = 0; n < vm->vm_params.vmc_params.vcp_ncpus; n++) {
902 		ret = pthread_cond_broadcast(&vcpu_run_cond[n]);
903 		if (ret) {
904 			log_warnx("%s: can't broadcast vcpu run cond (%d)",
905 			    __func__, (int)ret);
906 			return;
907 		}
908 	}
909 	ret = pthread_barrier_wait(&vm_pause_barrier);
910 	if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
911 		log_warnx("%s: could not wait on pause barrier (%d)",
912 		    __func__, (int)ret);
913 		return;
914 	}
915 
916 	ret = pthread_barrier_destroy(&vm_pause_barrier);
917 	if (ret) {
918 		log_warnx("%s: could not destroy pause barrier (%d)",
919 		    __progname, ret);
920 		return;
921 	}
922 
923 	i8253_stop();
924 	mc146818_stop();
925 	ns8250_stop();
926 	virtio_stop(vm);
927 }
928 
929 static void
930 unpause_vm(struct vmd_vm *vm)
931 {
932 	unsigned int n;
933 	int ret;
934 	if (!(vm->vm_state & VM_STATE_PAUSED))
935 		return;
936 
937 	current_vm->vm_state &= ~VM_STATE_PAUSED;
938 	for (n = 0; n < vm->vm_params.vmc_params.vcp_ncpus; n++) {
939 		ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]);
940 		if (ret) {
941 			log_warnx("%s: can't broadcast vcpu unpause cond (%d)",
942 			    __func__, (int)ret);
943 			return;
944 		}
945 	}
946 
947 	i8253_start();
948 	mc146818_start();
949 	ns8250_start();
950 	virtio_start(vm);
951 }
952 
953 /*
954  * vcpu_reset
955  *
956  * Requests vmm(4) to reset the VCPUs in the indicated VM to
957  * the register state provided
958  *
959  * Parameters
960  *  vmid: VM ID to reset
961  *  vcpu_id: VCPU ID to reset
962  *  vrs: the register state to initialize
963  *
964  * Return values:
965  *  0: success
966  *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
967  *      valid)
968  */
969 int
970 vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
971 {
972 	struct vm_resetcpu_params vrp;
973 
974 	memset(&vrp, 0, sizeof(vrp));
975 	vrp.vrp_vm_id = vmid;
976 	vrp.vrp_vcpu_id = vcpu_id;
977 	memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));
978 
979 	log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);
980 
981 	if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1)
982 		return (errno);
983 
984 	return (0);
985 }
986 
987 /*
988  * create_memory_map
989  *
990  * Sets up the guest physical memory ranges that the VM can access.
991  *
992  * Parameters:
993  *  vcp: VM create parameters describing the VM whose memory map
994  *       is being created
995  *
996  * Return values:
997  *  nothing
998  */
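/*
 * A sketch of the resulting layout (using the symbolic constants referenced
 * below), assuming enough memory is configured to populate every range:
 *
 *  [0] 0 .. LOWMEM_KB*1024		RAM (DOS low memory)
 *  [1] LOWMEM_KB*1024 .. 1MB		reserved (VGA/ROM hole)
 *  [2] 1MB .. VMM_PCI_MMIO_BAR_BASE	RAM
 *  [3] VMM_PCI_MMIO_BAR_BASE .. _END	MMIO (PCI BARs)
 *  [4] VMM_PCI_MMIO_BAR_END+1 .. 4GB	reserved (2nd BIOS copy)
 *  [5] 4GB .. 4GB + remainder		RAM (if any)
 */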
999 void
1000 create_memory_map(struct vm_create_params *vcp)
1001 {
1002 	size_t len, mem_bytes;
1003 	size_t above_1m = 0, above_4g = 0;
1004 
1005 	mem_bytes = vcp->vcp_memranges[0].vmr_size;
1006 	vcp->vcp_nmemranges = 0;
1007 	if (mem_bytes == 0 || mem_bytes > VMM_MAX_VM_MEM_SIZE)
1008 		return;
1009 
1010 	/* First memory region: 0 - LOWMEM_KB (DOS low mem) */
1011 	len = LOWMEM_KB * 1024;
1012 	vcp->vcp_memranges[0].vmr_gpa = 0x0;
1013 	vcp->vcp_memranges[0].vmr_size = len;
1014 	vcp->vcp_memranges[0].vmr_type = VM_MEM_RAM;
1015 	mem_bytes -= len;
1016 
1017 	/*
1018 	 * Second memory region: LOWMEM_KB - 1MB.
1019 	 *
1020 	 * N.B. - Normally ROMs or parts of video RAM are mapped here.
1021 	 * We have to add this region, because some systems
1022 	 * unconditionally write to 0xb8000 (VGA RAM), and
1023 	 * we need to make sure that vmm(4) permits accesses
1024 	 * to it. So allocate guest memory for it.
1025 	 */
1026 	len = MB(1) - (LOWMEM_KB * 1024);
1027 	vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
1028 	vcp->vcp_memranges[1].vmr_size = len;
1029 	vcp->vcp_memranges[1].vmr_type = VM_MEM_RESERVED;
1030 	mem_bytes -= len;
1031 
1032 	/* If we have less than 2MB remaining, still create a 2nd BIOS area. */
1033 	if (mem_bytes <= MB(2)) {
1034 		vcp->vcp_memranges[2].vmr_gpa = VMM_PCI_MMIO_BAR_END;
1035 		vcp->vcp_memranges[2].vmr_size = MB(2);
1036 		vcp->vcp_memranges[2].vmr_type = VM_MEM_RESERVED;
1037 		vcp->vcp_nmemranges = 3;
1038 		return;
1039 	}
1040 
1041 	/*
1042 	 * Calculate how to split any remaining memory across the 4GB
1043 	 * boundary while making sure we do not place physical memory into
1044 	 * MMIO ranges.
1045 	 */
1046 	if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - MB(1)) {
1047 		above_1m = VMM_PCI_MMIO_BAR_BASE - MB(1);
1048 		above_4g = mem_bytes - above_1m;
1049 	} else {
1050 		above_1m = mem_bytes;
1051 		above_4g = 0;
1052 	}
1053 
1054 	/* Third memory region: area above 1MB to MMIO region */
1055 	vcp->vcp_memranges[2].vmr_gpa = MB(1);
1056 	vcp->vcp_memranges[2].vmr_size = above_1m;
1057 	vcp->vcp_memranges[2].vmr_type = VM_MEM_RAM;
1058 
1059 	/* Fourth region: PCI MMIO range */
1060 	vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_BASE;
1061 	vcp->vcp_memranges[3].vmr_size = VMM_PCI_MMIO_BAR_END -
1062 	    VMM_PCI_MMIO_BAR_BASE + 1;
1063 	vcp->vcp_memranges[3].vmr_type = VM_MEM_MMIO;
1064 
1065 	/* Fifth region: 2nd copy of BIOS above MMIO ending at 4GB */
1066 	vcp->vcp_memranges[4].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
1067 	vcp->vcp_memranges[4].vmr_size = MB(2);
1068 	vcp->vcp_memranges[4].vmr_type = VM_MEM_RESERVED;
1069 
1070 	/* Sixth region: any remainder above 4GB */
1071 	if (above_4g > 0) {
1072 		vcp->vcp_memranges[5].vmr_gpa = GB(4);
1073 		vcp->vcp_memranges[5].vmr_size = above_4g;
1074 		vcp->vcp_memranges[5].vmr_type = VM_MEM_RAM;
1075 		vcp->vcp_nmemranges = 6;
1076 	} else
1077 		vcp->vcp_nmemranges = 5;
1078 }
1079 
1080 /*
1081  * alloc_guest_mem
1082  *
1083  * Allocates memory for the guest.
1084  * Instead of doing a single allocation with one mmap(), we allocate memory
1085  * separately for every range for the following reasons:
1086  * - ASLR for the individual ranges
1087  * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
1088  *   map the single mmap'd userspace memory to the individual guest physical
1089  *   memory ranges, the underlying amap of the single mmap'd range would have
1090  *   to allocate per-page reference counters. The reason is that the
1091  *   individual guest physical ranges would reference the single mmap'd region
1092  *   only partially. However, if every guest physical range has its own
1093  *   corresponding mmap'd userspace allocation, there are no partial
1094  *   references: every guest physical range fully references an mmap'd
1095  *   range => no per-page reference counters have to be allocated.
1096  *
1097  * Return values:
1098  *  0: success
1099  *  !0: failure - errno indicating the source of the failure
1100  */
1101 int
1102 alloc_guest_mem(struct vmd_vm *vm)
1103 {
1104 	void *p;
1105 	int ret = 0;
1106 	size_t i, j;
1107 	struct vm_create_params *vcp = &vm->vm_params.vmc_params;
1108 	struct vm_mem_range *vmr;
1109 
1110 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
1111 		vmr = &vcp->vcp_memranges[i];
1112 
1113 		/*
1114 		 * We only need R/W from userland. vmm(4) will use R/W/X in its
1115 		 * mapping.
1116 		 *
1117 		 * We must use MAP_SHARED so emulated devices will be able
1118 		 * to generate shared mappings.
1119 		 */
1120 		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
1121 		    MAP_ANON | MAP_CONCEAL | MAP_SHARED, -1, 0);
1122 		if (p == MAP_FAILED) {
1123 			ret = errno;
1124 			for (j = 0; j < i; j++) {
1125 				vmr = &vcp->vcp_memranges[j];
1126 				munmap((void *)vmr->vmr_va, vmr->vmr_size);
1127 			}
1128 			return (ret);
1129 		}
1130 		vmr->vmr_va = (vaddr_t)p;
1131 	}
1132 
1133 	return (ret);
1134 }
1135 
1136 /*
1137  * vmm_create_vm
1138  *
1139  * Requests vmm(4) to create a new VM using the supplied creation
1140  * parameters. This operation results in the creation of the in-kernel
1141  * structures for the VM, but does not start the VM's vcpu(s).
1142  *
1143  * Parameters:
1144  *  vm: pointer to the vm object
1145  *
1146  * Return values:
1147  *  0: success
1148  *  !0 : ioctl to vmm(4) failed
1149  */
1150 static int
1151 vmm_create_vm(struct vmd_vm *vm)
1152 {
1153 	struct vm_create_params *vcp = &vm->vm_params.vmc_params;
1154 
1155 	/* Sanity check arguments */
1156 	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
1157 		return (EINVAL);
1158 
1159 	if (vcp->vcp_nmemranges == 0 ||
1160 	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
1161 		return (EINVAL);
1162 
1163 	if (vm->vm_params.vmc_ndisks > VM_MAX_DISKS_PER_VM)
1164 		return (EINVAL);
1165 
1166 	if (vm->vm_params.vmc_nnics > VM_MAX_NICS_PER_VM)
1167 		return (EINVAL);
1168 
1169 	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1)
1170 		return (errno);
1171 
1172 	return (0);
1173 }
1174 
1175 /*
1176  * init_emulated_hw
1177  *
1178  * Initializes the userspace hardware emulation
1179  */
1180 void
1181 init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
1182     int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
1183 {
1184 	struct vm_create_params *vcp = &vmc->vmc_params;
1185 	size_t i;
1186 	uint64_t memlo, memhi;
1187 
1188 	/* Calculate memory size for NVRAM registers */
1189 	memlo = memhi = 0;
1190 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
1191 		if (vcp->vcp_memranges[i].vmr_gpa == MB(1) &&
1192 		    vcp->vcp_memranges[i].vmr_size > (15 * MB(1)))
1193 			memlo = vcp->vcp_memranges[i].vmr_size - (15 * MB(1));
1194 		else if (vcp->vcp_memranges[i].vmr_gpa == GB(4))
1195 			memhi = vcp->vcp_memranges[i].vmr_size;
1196 	}
1197 
1198 	/* Reset the IO port map */
1199 	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
1200 
1201 	/* Init i8253 PIT */
1202 	i8253_init(vcp->vcp_id);
1203 	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
1204 	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
1205 	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
1206 	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
1207 	ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc;
1208 
1209 	/* Init mc146818 RTC */
1210 	mc146818_init(vcp->vcp_id, memlo, memhi);
1211 	ioports_map[IO_RTC] = vcpu_exit_mc146818;
1212 	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
1213 
1214 	/* Init master and slave PICs */
1215 	i8259_init();
1216 	ioports_map[IO_ICU1] = vcpu_exit_i8259;
1217 	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
1218 	ioports_map[IO_ICU2] = vcpu_exit_i8259;
1219 	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
1220 	ioports_map[ELCR0] = vcpu_exit_elcr;
1221 	ioports_map[ELCR1] = vcpu_exit_elcr;
1222 
1223 	/* Init ns8250 UART */
1224 	ns8250_init(con_fd, vcp->vcp_id);
1225 	for (i = COM1_DATA; i <= COM1_SCR; i++)
1226 		ioports_map[i] = vcpu_exit_com;
1227 
1228 	/* Initialize PCI */
1229 	for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++)
1230 		ioports_map[i] = vcpu_exit_pci;
1231 
1232 	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
1233 	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
1234 	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
1235 	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
1236 	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
1237 	pci_init();
1238 
1239 	/* Initialize virtio devices */
1240 	virtio_init(current_vm, child_cdrom, child_disks, child_taps);
1241 
1242 	/*
1243 	 * Init QEMU fw_cfg interface. Must be done last for pci hardware
1244 	 * detection.
1245 	 */
1246 	fw_cfg_init(vmc);
1247 	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
1248 	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
1249 	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
1250 	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
1251 }
1252 
1253 /*
1254  * restore_emulated_hw
1255  *
1256  * Restores the userspace hardware emulation from fd
1257  */
1258 void
1259 restore_emulated_hw(struct vm_create_params *vcp, int fd,
1260     int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
1261 {
1262 	/* struct vm_create_params *vcp = &vmc->vmc_params; */
1263 	int i;
1264 	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
1265 
1266 	/* Init i8253 PIT */
1267 	i8253_restore(fd, vcp->vcp_id);
1268 	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
1269 	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
1270 	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
1271 	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
1272 
1273 	/* Init master and slave PICs */
1274 	i8259_restore(fd);
1275 	ioports_map[IO_ICU1] = vcpu_exit_i8259;
1276 	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
1277 	ioports_map[IO_ICU2] = vcpu_exit_i8259;
1278 	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
1279 
1280 	/* Init ns8250 UART */
1281 	ns8250_restore(fd, con_fd, vcp->vcp_id);
1282 	for (i = COM1_DATA; i <= COM1_SCR; i++)
1283 		ioports_map[i] = vcpu_exit_com;
1284 
1285 	/* Init mc146818 RTC */
1286 	mc146818_restore(fd, vcp->vcp_id);
1287 	ioports_map[IO_RTC] = vcpu_exit_mc146818;
1288 	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
1289 
1290 	/* Init QEMU fw_cfg interface */
1291 	fw_cfg_restore(fd);
1292 	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
1293 	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
1294 	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
1295 	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
1296 
1297 	/* Initialize PCI */
1298 	for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++)
1299 		ioports_map[i] = vcpu_exit_pci;
1300 
1301 	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
1302 	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
1303 	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
1304 	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
1305 	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
1306 	pci_restore(fd);
1307 	virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
1308 }
1309 
1310 /*
1311  * run_vm
1312  *
1313  * Runs the VM whose creation parameters are specified in vmc
1314  *
1315  * Parameters:
1319  *  vmc: vmop_create_params struct containing the VM's desired creation
1320  *      configuration
1321  *  vrs: VCPU register state to initialize
1322  *
1323  * Return values:
1324  *  0: the VM exited normally
1325  *  !0 : the VM exited abnormally or failed to start
1326  */
1327 static int
1328 run_vm(struct vmop_create_params *vmc, struct vcpu_reg_state *vrs)
1329 {
1330 	struct vm_create_params *vcp = &vmc->vmc_params;
1331 	struct vm_rwregs_params vregsp;
1332 	uint8_t evdone = 0;
1333 	size_t i;
1334 	int ret;
1335 	pthread_t *tid, evtid;
1336 	char tname[MAXCOMLEN + 1];
1337 	struct vm_run_params **vrp;
1338 	void *exit_status;
1339 
1340 	if (vcp == NULL)
1341 		return (EINVAL);
1342 
1343 	if (vcp->vcp_nmemranges == 0 ||
1344 	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
1345 		return (EINVAL);
1346 
1347 	tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
1348 	vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
1349 	if (tid == NULL || vrp == NULL) {
1350 		log_warn("%s: memory allocation error - exiting.",
1351 		    __progname);
1352 		return (ENOMEM);
1353 	}
1354 
1355 	log_debug("%s: starting %zu vcpu thread(s) for vm %s", __func__,
1356 	    vcp->vcp_ncpus, vcp->vcp_name);
1357 
1358 	/*
1359 	 * Create and launch one thread for each VCPU. These threads may
1360 	 * migrate between PCPUs over time; the need to reload CPU state
1361 	 * in such situations is detected and performed by vmm(4) in the
1362 	 * kernel.
1363 	 */
1364 	for (i = 0 ; i < vcp->vcp_ncpus; i++) {
1365 		vrp[i] = malloc(sizeof(struct vm_run_params));
1366 		if (vrp[i] == NULL) {
1367 			log_warn("%s: memory allocation error - "
1368 			    "exiting.", __progname);
1369 			/* caller will exit, so skip freeing */
1370 			return (ENOMEM);
1371 		}
1372 		vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit));
1373 		if (vrp[i]->vrp_exit == NULL) {
1374 			log_warn("%s: memory allocation error - "
1375 			    "exiting.", __progname);
1376 			/* caller will exit, so skip freeing */
1377 			return (ENOMEM);
1378 		}
1379 		vrp[i]->vrp_vm_id = vcp->vcp_id;
1380 		vrp[i]->vrp_vcpu_id = i;
1381 
1382 		if (vcpu_reset(vcp->vcp_id, i, vrs)) {
1383 			log_warnx("%s: cannot reset VCPU %zu - exiting.",
1384 			    __progname, i);
1385 			return (EIO);
1386 		}
1387 
1388 		/* Write the registers again because vcpu_reset() clobbers them. */
1389 		if (current_vm->vm_state & VM_STATE_RECEIVED) {
1390 			vregsp.vrwp_vm_id = vcp->vcp_id;
1391 			vregsp.vrwp_vcpu_id = i;
1392 			vregsp.vrwp_regs = *vrs;
1393 			vregsp.vrwp_mask = VM_RWREGS_ALL;
1394 			if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS,
1395 			    &vregsp)) == -1) {
1396 				log_warn("%s: writeregs failed", __func__);
1397 				return (ret);
1398 			}
1399 		}
1400 
1401 		ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
1402 		if (ret) {
1403 			log_warnx("%s: cannot initialize cond var (%d)",
1404 			    __progname, ret);
1405 			return (ret);
1406 		}
1407 
1408 		ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
1409 		if (ret) {
1410 			log_warnx("%s: cannot initialize mtx (%d)",
1411 			    __progname, ret);
1412 			return (ret);
1413 		}
1414 
1415 		ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL);
1416 		if (ret) {
1417 			log_warnx("%s: cannot initialize unpause var (%d)",
1418 			    __progname, ret);
1419 			return (ret);
1420 		}
1421 
1422 		ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL);
1423 		if (ret) {
1424 			log_warnx("%s: cannot initialize unpause mtx (%d)",
1425 			    __progname, ret);
1426 			return (ret);
1427 		}
1428 
1429 		vcpu_hlt[i] = 0;
1430 
1431 		/* Start each VCPU run thread at vcpu_run_loop */
1432 		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
1433 		if (ret) {
1434 			/* caller will _exit after this return */
1435 			ret = errno;
1436 			log_warn("%s: could not create vcpu thread %zu",
1437 			    __func__, i);
1438 			return (ret);
1439 		}
1440 
1441 		snprintf(tname, sizeof(tname), "vcpu-%zu", i);
1442 		pthread_set_name_np(tid[i], tname);
1443 	}
1444 
1445 	log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
1446 	ret = pthread_create(&evtid, NULL, event_thread, &evdone);
1447 	if (ret) {
1448 		errno = ret;
1449 		log_warn("%s: could not create event thread", __func__);
1450 		return (ret);
1451 	}
1452 	pthread_set_name_np(evtid, "event");
1453 
1454 	for (;;) {
1455 		ret = pthread_cond_wait(&threadcond, &threadmutex);
1456 		if (ret) {
1457 			log_warn("%s: waiting on thread state condition "
1458 			    "variable failed", __func__);
1459 			return (ret);
1460 		}
1461 
1462 		/*
1463 		 * Did a VCPU thread exit with an error? => return the first one
1464 		 */
1465 		for (i = 0; i < vcp->vcp_ncpus; i++) {
1466 			if (vcpu_done[i] == 0)
1467 				continue;
1468 
1469 			if (pthread_join(tid[i], &exit_status)) {
1470 				log_warn("%s: failed to join thread %zd - "
1471 				    "exiting", __progname, i);
1472 				return (EIO);
1473 			}
1474 
1475 			ret = (intptr_t)exit_status;
1476 		}
1477 
1478 		/* Did the event thread exit? => return with an error */
1479 		if (evdone) {
1480 			if (pthread_join(evtid, &exit_status)) {
1481 				log_warn("%s: failed to join event thread - "
1482 				    "exiting", __progname);
1483 				return (EIO);
1484 			}
1485 
1486 			log_warnx("%s: vm %d event thread exited "
1487 			    "unexpectedly", __progname, vcp->vcp_id);
1488 			return (EIO);
1489 		}
1490 
1491 		/* Did all VCPU threads exit successfully? => return */
1492 		for (i = 0; i < vcp->vcp_ncpus; i++) {
1493 			if (vcpu_done[i] == 0)
1494 				break;
1495 		}
1496 		if (i == vcp->vcp_ncpus)
1497 			return (ret);
1498 
1499 		/* Some more threads to wait for, start over */
1500 	}
1501 
1502 	return (ret);
1503 }
1504 
1505 void *
1506 event_thread(void *arg)
1507 {
1508 	uint8_t *donep = arg;
1509 	intptr_t ret;
1510 
1511 	ret = event_dispatch();
1512 
1513 	mutex_lock(&threadmutex);
1514 	*donep = 1;
1515 	pthread_cond_signal(&threadcond);
1516 	mutex_unlock(&threadmutex);
1517 
1518 	return (void *)ret;
1519 }
1520 
1521 /*
1522  * vcpu_run_loop
1523  *
1524  * Runs a single VCPU until vmm(4) requires help handling an exit,
1525  * or the VM terminates.
1526  *
1527  * Parameters:
1528  *  arg: vcpu_run_params for the VCPU being run by this thread
1529  *
1530  * Return values:
1531  *  NULL: the VCPU shut down properly
1532  *  !NULL: error processing VCPU run, or the VCPU shut down abnormally
1533  */
1534 void *
1535 vcpu_run_loop(void *arg)
1536 {
1537 	struct vm_run_params *vrp = (struct vm_run_params *)arg;
1538 	intptr_t ret = 0;
1539 	int irq;
1540 	uint32_t n;
1541 
1542 	vrp->vrp_continue = 0;
1543 	n = vrp->vrp_vcpu_id;
1544 
1545 	for (;;) {
1546 		ret = pthread_mutex_lock(&vcpu_run_mtx[n]);
1547 
1548 		if (ret) {
1549 			log_warnx("%s: can't lock vcpu run mtx (%d)",
1550 			    __func__, (int)ret);
1551 			return ((void *)ret);
1552 		}
1553 
1554 		/* If we are halted and need to pause, pause */
1555 		if (vcpu_hlt[n] && (current_vm->vm_state & VM_STATE_PAUSED)) {
1556 			ret = pthread_barrier_wait(&vm_pause_barrier);
1557 			if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
1558 				log_warnx("%s: could not wait on pause barrier (%d)",
1559 				    __func__, (int)ret);
1560 				return ((void *)ret);
1561 			}
1562 
1563 			ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]);
1564 			if (ret) {
1565 				log_warnx("%s: can't lock vcpu unpause mtx (%d)",
1566 				    __func__, (int)ret);
1567 				return ((void *)ret);
1568 			}
1569 
1570 			/* i8259 may be firing as we pause, release run mtx. */
1571 			mutex_unlock(&vcpu_run_mtx[n]);
1572 			ret = pthread_cond_wait(&vcpu_unpause_cond[n],
1573 			    &vcpu_unpause_mtx[n]);
1574 			if (ret) {
1575 				log_warnx(
1576 				    "%s: can't wait on unpause cond (%d)",
1577 				    __func__, (int)ret);
1578 				break;
1579 			}
1580 			mutex_lock(&vcpu_run_mtx[n]);
1581 
1582 			ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]);
1583 			if (ret) {
1584 				log_warnx("%s: can't unlock unpause mtx (%d)",
1585 				    __func__, (int)ret);
1586 				break;
1587 			}
1588 		}
1589 
1590 		/* If we are halted and not paused, wait */
1591 		if (vcpu_hlt[n]) {
1592 			ret = pthread_cond_wait(&vcpu_run_cond[n],
1593 			    &vcpu_run_mtx[n]);
1594 
1595 			if (ret) {
1596 				log_warnx(
1597 				    "%s: can't wait on cond (%d)",
1598 				    __func__, (int)ret);
1599 				(void)pthread_mutex_unlock(
1600 				    &vcpu_run_mtx[n]);
1601 				break;
1602 			}
1603 		}
1604 
1605 		ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);
1606 
1607 		if (ret) {
1608 			log_warnx("%s: can't unlock mutex on cond (%d)",
1609 			    __func__, (int)ret);
1610 			break;
1611 		}
1612 
1613 		if (vrp->vrp_irqready && i8259_is_pending()) {
1614 			irq = i8259_ack();
1615 			vrp->vrp_irq = irq;
1616 		} else
1617 			vrp->vrp_irq = 0xFFFF;
1618 
1619 		/* Still more interrupts pending? */
1620 		vrp->vrp_intr_pending = i8259_is_pending();
1621 
1622 		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) {
1623 			/* If run ioctl failed, exit */
1624 			ret = errno;
1625 			log_warn("%s: vm %d / vcpu %d run ioctl failed",
1626 			    __func__, current_vm->vm_vmid, n);
1627 			break;
1628 		}
1629 
1630 		/* If the VM is terminating, exit normally */
1631 		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
1632 			ret = (intptr_t)NULL;
1633 			break;
1634 		}
1635 
1636 		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
1637 			/*
1638 			 * vmm(4) needs help handling an exit, handle in
1639 			 * vcpu_exit.
1640 			 */
1641 			ret = vcpu_exit(vrp);
1642 			if (ret)
1643 				break;
1644 		}
1645 	}
1646 
1647 	mutex_lock(&threadmutex);
1648 	vcpu_done[n] = 1;
1649 	pthread_cond_signal(&threadcond);
1650 	mutex_unlock(&threadmutex);
1651 
1652 	return ((void *)ret);
1653 }
1654 
1655 int
1656 vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
1657 {
1658 	struct vm_intr_params vip;
1659 
1660 	memset(&vip, 0, sizeof(vip));
1661 
1662 	vip.vip_vm_id = vm_id;
1663 	vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
1664 	vip.vip_intr = intr;
1665 
1666 	if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1)
1667 		return (errno);
1668 
1669 	return (0);
1670 }
1671 
1672 /*
1673  * vcpu_exit_pci
1674  *
1675  * Handle all I/O to the emulated PCI subsystem.
1676  *
1677  * Parameters:
1678  *  vrp: vcpu run parameters containing guest state for this exit
1679  *
1680  * Return value:
1681  *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
1682  *      be injected.
1683  */
1684 uint8_t
1685 vcpu_exit_pci(struct vm_run_params *vrp)
1686 {
1687 	struct vm_exit *vei = vrp->vrp_exit;
1688 	uint8_t intr;
1689 
1690 	intr = 0xFF;
1691 
1692 	switch (vei->vei.vei_port) {
1693 	case PCI_MODE1_ADDRESS_REG:
1694 		pci_handle_address_reg(vrp);
1695 		break;
1696 	case PCI_MODE1_DATA_REG:
1697 	case PCI_MODE1_DATA_REG + 1:
1698 	case PCI_MODE1_DATA_REG + 2:
1699 	case PCI_MODE1_DATA_REG + 3:
1700 		pci_handle_data_reg(vrp);
1701 		break;
1702 	case VM_PCI_IO_BAR_BASE ... VM_PCI_IO_BAR_END:
1703 		intr = pci_handle_io(vrp);
1704 		break;
1705 	default:
1706 		log_warnx("%s: unknown PCI register 0x%llx",
1707 		    __progname, (uint64_t)vei->vei.vei_port);
1708 		break;
1709 	}
1710 
1711 	return (intr);
1712 }
1713 
1714 /*
1715  * vcpu_exit_inout
1716  *
1717  * Handle all I/O exits that need to be emulated in vmd. This includes the
1718  * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
1719  *
1720  * Parameters:
1721  *  vrp: vcpu run parameters containing guest state for this exit
1722  */
1723 void
1724 vcpu_exit_inout(struct vm_run_params *vrp)
1725 {
1726 	struct vm_exit *vei = vrp->vrp_exit;
1727 	uint8_t intr = 0xFF;
1728 
1729 	if (vei->vei.vei_rep || vei->vei.vei_string) {
1730 #ifdef MMIO_DEBUG
1731 		log_info("%s: %s%s%s %d-byte, enc=%d, data=0x%08x, port=0x%04x",
1732 		    __func__,
1733 		    vei->vei.vei_rep == 0 ? "" : "REP ",
1734 		    vei->vei.vei_dir == VEI_DIR_IN ? "IN" : "OUT",
1735 		    vei->vei.vei_string == 0 ? "" : "S",
1736 		    vei->vei.vei_size, vei->vei.vei_encoding,
1737 		    vei->vei.vei_data, vei->vei.vei_port);
1738 		log_info("%s: ECX = 0x%llx, RDX = 0x%llx, RSI = 0x%llx",
1739 		    __func__,
1740 		    vei->vrs.vrs_gprs[VCPU_REGS_RCX],
1741 		    vei->vrs.vrs_gprs[VCPU_REGS_RDX],
1742 		    vei->vrs.vrs_gprs[VCPU_REGS_RSI]);
1743 #endif /* MMIO_DEBUG */
1744 		fatalx("%s: can't emulate REP prefixed IN(S)/OUT(S)",
1745 		    __func__);
1746 	}
1747 
1748 	if (ioports_map[vei->vei.vei_port] != NULL)
1749 		intr = ioports_map[vei->vei.vei_port](vrp);
1750 	else if (vei->vei.vei_dir == VEI_DIR_IN)
1751 		set_return_data(vei, 0xFFFFFFFF);
1752 
1753 	vei->vrs.vrs_gprs[VCPU_REGS_RIP] += vei->vei.vei_insn_len;
1754 
1755 	if (intr != 0xFF)
1756 		vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
1757 }
1758 
1759 /*
1760  * vcpu_exit_eptviolation
1761  *
1762  * handle an EPT Violation
1763  *
1764  * Parameters:
1765  *  vrp: vcpu run parameters containing guest state for this exit
1766  *
1767  * Return values:
1768  *  0: no action required
1769  *  EFAULT: a protection fault occurred, kill the vm.
1770  */
1771 int
1772 vcpu_exit_eptviolation(struct vm_run_params *vrp)
1773 {
1774 	struct vm_exit *ve = vrp->vrp_exit;
1775 	int ret = 0;
1776 #if MMIO_NOTYET
1777 	struct x86_insn insn;
1778 	uint64_t va, pa;
1779 	size_t len = 15;		/* Max instruction length in x86. */
1780 #endif /* MMIO_NOTYET */
1781 	switch (ve->vee.vee_fault_type) {
1782 	case VEE_FAULT_HANDLED:
1783 		log_debug("%s: fault already handled", __func__);
1784 		break;
1785 
1786 #if MMIO_NOTYET
1787 	case VEE_FAULT_MMIO_ASSIST:
1788 		/* Intel VMX might give us the length of the instruction. */
1789 		if (ve->vee.vee_insn_info & VEE_LEN_VALID)
1790 			len = ve->vee.vee_insn_len;
1791 
1792 		if (len > 15)
1793 			fatalx("%s: invalid instruction length %lu", __func__,
1794 			    len);
1795 
1796 		/* If we weren't given instruction bytes, we need to fetch. */
1797 		if (!(ve->vee.vee_insn_info & VEE_BYTES_VALID)) {
1798 			memset(ve->vee.vee_insn_bytes, 0,
1799 			    sizeof(ve->vee.vee_insn_bytes));
1800 			va = ve->vrs.vrs_gprs[VCPU_REGS_RIP];
1801 
1802 			/* XXX Only support instructions that fit on 1 page. */
1803 			if ((va & PAGE_MASK) + len > PAGE_SIZE) {
1804 				log_warnx("%s: instruction might cross page "
1805 				    "boundary", __func__);
1806 				ret = EINVAL;
1807 				break;
1808 			}
1809 
1810 			ret = translate_gva(ve, va, &pa, PROT_EXEC);
1811 			if (ret != 0) {
1812 				log_warnx("%s: failed gva translation",
1813 				    __func__);
1814 				break;
1815 			}
1816 
1817 			ret = read_mem(pa, ve->vee.vee_insn_bytes, len);
1818 			if (ret != 0) {
1819 				log_warnx("%s: failed to fetch instruction "
1820 				    "bytes from 0x%llx", __func__, pa);
1821 				break;
1822 			}
1823 		}
1824 
1825 		ret = insn_decode(ve, &insn);
1826 		if (ret == 0)
1827 			ret = insn_emulate(ve, &insn);
1828 		break;
1829 #endif /* MMIO_NOTYET */
1830 
1831 	case VEE_FAULT_PROTECT:
1832 		log_debug("%s: EPT Violation: rip=0x%llx", __progname,
1833 		    ve->vrs.vrs_gprs[VCPU_REGS_RIP]);
1834 		ret = EFAULT;
1835 		break;
1836 
1837 	default:
1838 		fatalx("%s: invalid fault_type %d", __progname,
1839 		    ve->vee.vee_fault_type);
1840 		/* UNREACHED */
1841 	}
1842 
1843 	return (ret);
1844 }
1845 
1846 /*
1847  * vcpu_exit
1848  *
1849  * Handle a vcpu exit. This function is called when it is determined that
1850  * vmm(4) requires the assistance of vmd to support a particular guest
1851  * exit type (eg, accessing an I/O port or device). Guest state is contained
1852  * in 'vrp', and will be resent to vmm(4) on exit completion.
1853  *
1854  * Upon conclusion of handling the exit, the function determines if any
1855  * interrupts should be injected into the guest, and asserts the proper
1856  * IRQ line whose interrupt should be vectored.
1857  *
1858  * Parameters:
1859  *  vrp: vcpu run parameters containing guest state for this exit
1860  *
1861  * Return values:
1862  *  0: the exit was handled successfully
1863  *  1: an error occurred (eg, unknown exit reason passed in 'vrp')
1864  */
1865 int
1866 vcpu_exit(struct vm_run_params *vrp)
1867 {
1868 	int ret;
1869 
1870 	switch (vrp->vrp_exit_reason) {
1871 	case VMX_EXIT_INT_WINDOW:
1872 	case SVM_VMEXIT_VINTR:
1873 	case VMX_EXIT_CPUID:
1874 	case VMX_EXIT_EXTINT:
1875 	case SVM_VMEXIT_INTR:
1876 	case SVM_VMEXIT_MSR:
1877 	case SVM_VMEXIT_CPUID:
1878 		/*
1879 		 * We may be exiting to vmd to handle a pending interrupt but
1880 		 * at the same time the last exit type may have been one of
1881 		 * these. In this case, there's nothing extra to be done
1882 		 * here (and falling through to the default case below results
1883 		 * in more vmd log spam).
1884 		 */
1885 		break;
1886 	case SVM_VMEXIT_NPF:
1887 	case VMX_EXIT_EPT_VIOLATION:
1888 		ret = vcpu_exit_eptviolation(vrp);
1889 		if (ret)
1890 			return (ret);
1891 		break;
1892 	case VMX_EXIT_IO:
1893 	case SVM_VMEXIT_IOIO:
1894 		vcpu_exit_inout(vrp);
1895 		break;
1896 	case VMX_EXIT_HLT:
1897 	case SVM_VMEXIT_HLT:
1898 		ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1899 		if (ret) {
1900 			log_warnx("%s: can't lock vcpu mutex (%d)",
1901 			    __func__, ret);
1902 			return (ret);
1903 		}
1904 		vcpu_hlt[vrp->vrp_vcpu_id] = 1;
1905 		ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1906 		if (ret) {
1907 			log_warnx("%s: can't unlock vcpu mutex (%d)",
1908 			    __func__, ret);
1909 			return (ret);
1910 		}
1911 		break;
1912 	case VMX_EXIT_TRIPLE_FAULT:
1913 	case SVM_VMEXIT_SHUTDOWN:
1914 		/* reset VM */
1915 		return (EAGAIN);
1916 	default:
1917 		log_debug("%s: unknown exit reason 0x%x",
1918 		    __progname, vrp->vrp_exit_reason);
1919 	}
1920 
1921 	vrp->vrp_continue = 1;
1922 
1923 	return (0);
1924 }
1925 
1926 /*
1927  * find_gpa_range
1928  *
1929  * Search for a contiguous guest physical mem range.
1930  *
1931  * Parameters:
1932  *  vcp: VM create parameters that contain the memory map to search in
1933  *  gpa: the starting guest physical address
1934  *  len: the length of the memory range
1935  *
1936  * Return values:
1937  *  NULL: on failure if there is no memory range as described by the parameters
1938  *  Pointer to vm_mem_range that contains the start of the range otherwise.
1939  */
1940 static struct vm_mem_range *
1941 find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
1942 {
1943 	size_t i, n;
1944 	struct vm_mem_range *vmr;
1945 
1946 	/* Find the first vm_mem_range that contains gpa */
1947 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
1948 		vmr = &vcp->vcp_memranges[i];
1949 		if (gpa < vmr->vmr_gpa + vmr->vmr_size)
1950 			break;
1951 	}
1952 
1953 	/* No range found. */
1954 	if (i == vcp->vcp_nmemranges)
1955 		return (NULL);
1956 
1957 	/*
1958 	 * vmr may cover the range [gpa, gpa + len) only partly. Make
1959 	 * sure that the following vm_mem_ranges are contiguous and
1960 	 * cover the rest.
1961 	 */
1962 	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
1963 	if (len < n)
1964 		len = 0;
1965 	else
1966 		len -= n;
1967 	gpa = vmr->vmr_gpa + vmr->vmr_size;
1968 	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
1969 		vmr = &vcp->vcp_memranges[i];
1970 		if (gpa != vmr->vmr_gpa)
1971 			return (NULL);
1972 		if (len <= vmr->vmr_size)
1973 			len = 0;
1974 		else
1975 			len -= vmr->vmr_size;
1976 
1977 		gpa = vmr->vmr_gpa + vmr->vmr_size;
1978 	}
1979 
1980 	if (len != 0)
1981 		return (NULL);
1982 
1983 	return (vmr);
1984 }
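
/*
 * Worked example with a hypothetical layout: given memranges
 * [0x0, 0xc0000) and [0x100000, 0x40000000), a request for gpa=0xb0000
 * and len=0x20000 matches the first range but leaves 0x10000 bytes
 * uncovered; since the next range starts at 0x100000 rather than 0xc0000,
 * the ranges are not contiguous and find_gpa_range() returns NULL.
 */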
1985 
1986 /*
1987  * write_mem
1988  *
1989  * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
1990  *
1991  * Parameters:
1992  *  dst: the destination paddr_t in the guest VM
1993  *  buf: data to copy (or NULL to zero the data)
1994  *  len: number of bytes to copy
1995  *
1996  * Return values:
1997  *  0: success
1998  *  EINVAL: if the guest physical memory range [dst, dst + len) does not
1999  *      exist in the guest.
2000  */
2001 int
2002 write_mem(paddr_t dst, const void *buf, size_t len)
2003 {
2004 	const char *from = buf;
2005 	char *to;
2006 	size_t n, off;
2007 	struct vm_mem_range *vmr;
2008 
2009 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
2010 	if (vmr == NULL) {
2011 		errno = EINVAL;
2012 		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
2013 		    "len = 0x%zx", __func__, dst, len);
2014 		return (EINVAL);
2015 	}
2016 
2017 	off = dst - vmr->vmr_gpa;
2018 	while (len != 0) {
2019 		n = vmr->vmr_size - off;
2020 		if (len < n)
2021 			n = len;
2022 
2023 		to = (char *)vmr->vmr_va + off;
2024 		if (buf == NULL)
2025 			memset(to, 0, n);
2026 		else {
2027 			memcpy(to, from, n);
2028 			from += n;
2029 		}
2030 		len -= n;
2031 		off = 0;
2032 		vmr++;
2033 	}
2034 
2035 	return (0);
2036 }
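
/*
 * Usage sketch: a NULL 'buf' zeroes guest memory, e.g. clearing one page
 * at a hypothetical guest physical address:
 *
 *	if (write_mem(0x2000, NULL, PAGE_SIZE) != 0)
 *		log_warnx("failed to zero guest page");
 */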
2037 
2038 /*
2039  * read_mem
2040  *
2041  * Reads memory at guest paddr 'src' into 'buf'.
2042  *
2043  * Parameters:
2044  *  src: the source paddr_t in the guest VM to read from.
2045  *  buf: destination (local) buffer
2046  *  len: number of bytes to read
2047  *
2048  * Return values:
2049  *  0: success
2050  *  EINVAL: if the guest physical memory range [dst, dst + len) does not
2051  *      exist in the guest.
2052  */
2053 int
2054 read_mem(paddr_t src, void *buf, size_t len)
2055 {
2056 	char *from, *to = buf;
2057 	size_t n, off;
2058 	struct vm_mem_range *vmr;
2059 
2060 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
2061 	if (vmr == NULL) {
2062 		errno = EINVAL;
2063 		log_warn("%s: failed - invalid memory range src = 0x%lx, "
2064 		    "len = 0x%zx", __func__, src, len);
2065 		return (EINVAL);
2066 	}
2067 
2068 	off = src - vmr->vmr_gpa;
2069 	while (len != 0) {
2070 		n = vmr->vmr_size - off;
2071 		if (len < n)
2072 			n = len;
2073 
2074 		from = (char *)vmr->vmr_va + off;
2075 		memcpy(to, from, n);
2076 
2077 		to += n;
2078 		len -= n;
2079 		off = 0;
2080 		vmr++;
2081 	}
2082 
2083 	return (0);
2084 }
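
/*
 * Usage sketch: copying a guest-resident value into a local variable
 * (the guest physical address is hypothetical):
 *
 *	uint64_t val;
 *
 *	if (read_mem(0x7000, &val, sizeof(val)) != 0)
 *		log_warnx("address not backed by guest memory");
 */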
2085 
2086 /*
2087  * hvaddr_mem
2088  *
2089  * Translate a guest physical address to a host virtual address, checking the
2090  * provided memory range length to confirm it's contiguous within the same
2091  * guest memory range (vm_mem_range).
2092  *
2093  * Parameters:
2094  *  gpa: guest physical address to translate
2095  *  len: number of bytes in the intended range
2096  *
2097  * Return values:
2098  *  void* to host virtual memory on success
2099  *  NULL on error, setting errno to:
2100  *    EFAULT: gpa falls outside guest memory ranges
2101  *    EINVAL: requested len extends beyond memory range
2102  */
2103 void *
2104 hvaddr_mem(paddr_t gpa, size_t len)
2105 {
2106 	struct vm_mem_range *vmr;
2107 	size_t off;
2108 
2109 	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, gpa, len);
2110 	if (vmr == NULL) {
2111 		log_warnx("%s: failed - invalid gpa: 0x%lx\n", __func__, gpa);
2112 		errno = EFAULT;
2113 		return (NULL);
2114 	}
2115 
2116 	off = gpa - vmr->vmr_gpa;
2117 	if (len > (vmr->vmr_size - off)) {
2118 		log_warnx("%s: failed - invalid memory range: gpa=0x%lx, "
2119 		    "len=%zu", __func__, gpa, len);
2120 		errno = EINVAL;
2121 		return (NULL);
2122 	}
2123 
2124 	return ((char *)vmr->vmr_va + off);
2125 }
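
/*
 * Usage sketch: hvaddr_mem() allows zero-copy access to guest memory when
 * the range is contiguous, avoiding a read_mem()/write_mem() round trip.
 * Here 'gpa' and 'len' are assumed to have been validated by the caller.
 *
 *	char *p = hvaddr_mem(gpa, len);
 *
 *	if (p == NULL)
 *		return (errno);
 *	memset(p, 0, len);
 */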
2126 
2127 /*
2128  * vcpu_assert_pic_irq
2129  *
2130  * Injects the specified IRQ on the supplied vcpu/vm
2131  *
2132  * Parameters:
2133  *  vm_id: VM ID to inject to
2134  *  vcpu_id: VCPU ID to inject to
2135  *  irq: IRQ to inject
2136  */
2137 void
2138 vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
2139 {
2140 	int ret;
2141 
2142 	i8259_assert_irq(irq);
2143 
2144 	if (i8259_is_pending()) {
2145 		if (vcpu_pic_intr(vm_id, vcpu_id, 1))
2146 			fatalx("%s: can't assert INTR", __func__);
2147 		mutex_lock(&vcpu_run_mtx[vcpu_id]);
2148 		vcpu_hlt[vcpu_id] = 0;
2149 		ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
2150 		if (ret)
2151 			fatalx("%s: can't signal (%d)", __func__, ret);
2152 		mutex_unlock(&vcpu_run_mtx[vcpu_id]);
2153 	}
2154 }
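
/*
 * Illustrative only: a device model that has completed work raises its
 * interrupt line the same way the in/out exit path above does (the IRQ
 * number here is hypothetical):
 *
 *	vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, 9);
 *
 * If the target vcpu was halted, vcpu_assert_pic_irq() clears vcpu_hlt and
 * signals vcpu_run_cond so the run loop resumes and injects the interrupt.
 */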
2155 
2156 /*
2157  * vcpu_deassert_pic_irq
2158  *
2159  * Clears the specified IRQ on the supplied vcpu/vm
2160  *
2161  * Parameters:
2162  *  vm_id: VM ID to clear in
2163  *  vcpu_id: VCPU ID to clear in
2164  *  irq: IRQ to clear
2165  */
2166 void
2167 vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
2168 {
2169 	i8259_deassert_irq(irq);
2170 
2171 	if (!i8259_is_pending()) {
2172 		if (vcpu_pic_intr(vm_id, vcpu_id, 0))
2173 			fatalx("%s: can't deassert INTR for vm_id %d, "
2174 			    "vcpu_id %d", __func__, vm_id, vcpu_id);
2175 	}
2176 }
2177 
2178 /*
2179  * fd_hasdata
2180  *
2181  * Determines if data can be read from a file descriptor.
2182  *
2183  * Parameters:
2184  *  fd: the fd to check
2185  *
2186  * Return values:
2187  *  1 if data can be read from an fd, or 0 otherwise.
2188  */
2189 int
2190 fd_hasdata(int fd)
2191 {
2192 	struct pollfd pfd[1];
2193 	int nready, hasdata = 0;
2194 
2195 	pfd[0].fd = fd;
2196 	pfd[0].events = POLLIN;
2197 	nready = poll(pfd, 1, 0);
2198 	if (nready == -1)
2199 		log_warn("checking file descriptor for data failed");
2200 	else if (nready == 1 && pfd[0].revents & POLLIN)
2201 		hasdata = 1;
2202 	return (hasdata);
2203 }
2204 
2205 /*
2206  * mutex_lock
2207  *
2208  * Wrapper function for pthread_mutex_lock that does error checking and
2209  * exits on failure
2210  */
2211 void
2212 mutex_lock(pthread_mutex_t *m)
2213 {
2214 	int ret;
2215 
2216 	ret = pthread_mutex_lock(m);
2217 	if (ret) {
2218 		errno = ret;
2219 		fatal("could not acquire mutex");
2220 	}
2221 }
2222 
2223 /*
2224  * mutex_unlock
2225  *
2226  * Wrapper function for pthread_mutex_unlock that does error checking and
2227  * exits on failure
2228  */
2229 void
2230 mutex_unlock(pthread_mutex_t *m)
2231 {
2232 	int ret;
2233 
2234 	ret = pthread_mutex_unlock(m);
2235 	if (ret) {
2236 		errno = ret;
2237 		fatal("could not release mutex");
2238 	}
2239 }
2240 
2241 /*
2242  * set_return_data
2243  *
2244  * Utility function for manipulating register data in vm exit info structs. This
2245  * function ensures that the data is copied to the vei->vei.vei_data field with
2246  * the proper size for the operation being performed.
2247  *
2248  * Parameters:
2249  *  vei: exit information
2250  *  data: return data
2251  */
2252 void
2253 set_return_data(struct vm_exit *vei, uint32_t data)
2254 {
2255 	switch (vei->vei.vei_size) {
2256 	case 1:
2257 		vei->vei.vei_data &= ~0xFF;
2258 		vei->vei.vei_data |= (uint8_t)data;
2259 		break;
2260 	case 2:
2261 		vei->vei.vei_data &= ~0xFFFF;
2262 		vei->vei.vei_data |= (uint16_t)data;
2263 		break;
2264 	case 4:
2265 		vei->vei.vei_data = data;
2266 		break;
2267 	}
2268 }
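
/*
 * Example: for a 1-byte IN (vei_size == 1), set_return_data(vei, 0xA5)
 * replaces only the low byte of vei_data, leaving the upper 24 bits of
 * the guest's register image untouched.
 */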
2269 
2270 /*
2271  * get_input_data
2272  *
2273  * Utility function for manipulating register data in vm exit info
2274  * structs. This function ensures that the data is copied from the
2275  * vei->vei.vei_data field with the proper size for the operation being
2276  * performed.
2277  *
2278  * Parameters:
2279  *  vei: exit information
2280  *  data: location to store the result
2281  */
2282 void
2283 get_input_data(struct vm_exit *vei, uint32_t *data)
2284 {
2285 	switch (vei->vei.vei_size) {
2286 	case 1:
2287 		*data &= 0xFFFFFF00;
2288 		*data |= (uint8_t)vei->vei.vei_data;
2289 		break;
2290 	case 2:
2291 		*data &= 0xFFFF0000;
2292 		*data |= (uint16_t)vei->vei.vei_data;
2293 		break;
2294 	case 4:
2295 		*data = vei->vei.vei_data;
2296 		break;
2297 	default:
2298 		log_warnx("%s: invalid i/o size %d", __func__,
2299 		    vei->vei.vei_size);
2300 	}
2301 
2302 }
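
/*
 * Example: an OUT handler retrieves the value the guest wrote, truncated
 * to the access size:
 *
 *	uint32_t data = 0;
 *
 *	get_input_data(vei, &data);
 */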
2303 
2304 /*
2305  * translate_gva
2306  *
2307  * Translates a guest virtual address to a guest physical address by walking
2308  * the currently active page table (if needed).
2309  *
2310  * XXX ensure translate_gva updates the A bit in the PTE
2311  * XXX ensure translate_gva respects segment base and limits in i386 mode
2312  * XXX ensure translate_gva respects segment wraparound in i8086 mode
2313  * XXX ensure translate_gva updates the A bit in the segment selector
2314  * XXX ensure translate_gva respects CR4.LMSLE if available
2315  *
2316  * Parameters:
2317  *  exit: The VCPU this translation should be performed for (guest MMU settings
2318  *   are gathered from this VCPU)
2319  *  va: virtual address to translate
2320  *  pa: pointer to paddr_t variable that will receive the translated physical
2321  *   address. 'pa' is unchanged on error.
2322  *  mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
2323  *   the address should be translated
2324  *
2325  * Return values:
2326  *  0: the address was successfully translated - 'pa' contains the physical
2327  *     address currently mapped by 'va'.
2328  *  EFAULT: the PTE for 'VA' is unmapped. A #PF will be injected in this case
2329  *     and %cr2 set in the vcpu structure.
2330  *  EINVAL: an error occurred reading paging table structures
2331  */
2332 int
2333 translate_gva(struct vm_exit* exit, uint64_t va, uint64_t* pa, int mode)
2334 {
2335 	int level, shift, pdidx;
2336 	uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
2337 	uint64_t shift_width, pte_size;
2338 	struct vcpu_reg_state *vrs;
2339 
2340 	vrs = &exit->vrs;
2341 
2342 	if (!pa)
2343 		return (EINVAL);
2344 
2345 	if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
2346 		log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
2347 		*pa = va;
2348 		return (0);
2349 	}
2350 
2351 	pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];
2352 
2353 	log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
2354 	    vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);
2355 
2356 	if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
2357 		if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
2358 			pte_size = sizeof(uint64_t);
2359 			shift_width = 9;
2360 
2361 			if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
2362 				/* 4 level paging */
2363 				level = 4;
2364 				mask = L4_MASK;
2365 				shift = L4_SHIFT;
2366 			} else {
2367 				/* 32 bit with PAE paging */
2368 				level = 3;
2369 				mask = L3_MASK;
2370 				shift = L3_SHIFT;
2371 			}
2372 		} else {
2373 			/* 32 bit paging */
2374 			level = 2;
2375 			shift_width = 10;
2376 			mask = 0xFFC00000;
2377 			shift = 22;
2378 			pte_size = sizeof(uint32_t);
2379 		}
2380 	} else
2381 		return (EINVAL);
2382 
2383 	/* XXX: Check for R bit in segment selector and set A bit */
2384 
2385 	for (;level > 0; level--) {
2386 		pdidx = (va & mask) >> shift;
2387 		pte_paddr = (pt_paddr) + (pdidx * pte_size);
2388 
2389 		log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
2390 		    level, pte_paddr);
2391 		if (read_mem(pte_paddr, &pte, pte_size)) {
2392 			log_warn("%s: failed to read pte", __func__);
2393 			return (EFAULT);
2394 		}
2395 
2396 		log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
2397 		    pte);
2398 
2399 		/* XXX: Set CR2  */
2400 		if (!(pte & PG_V))
2401 			return (EFAULT);
2402 
2403 		/* XXX: Check for SMAP */
2404 		if ((mode == PROT_WRITE) && !(pte & PG_RW))
2405 			return (EPERM);
2406 
2407 		if ((exit->cpl > 0) && !(pte & PG_u))
2408 			return (EPERM);
2409 
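		/* Mark the PTE accessed; on writes, also mark it dirty. */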
2410 		pte = pte | PG_U;
2411 		if (mode == PROT_WRITE)
2412 			pte = pte | PG_M;
2413 		if (write_mem(pte_paddr, &pte, pte_size)) {
2414 			log_warn("%s: failed to write back flags to pte",
2415 			    __func__);
2416 			return (EIO);
2417 		}
2418 
2419 		/* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */
2420 		if (pte & PG_PS)
2421 			break;
2422 
2423 		if (level > 1) {
2424 			pt_paddr = pte & PG_FRAME;
2425 			shift -= shift_width;
2426 			mask = mask >> shift_width;
2427 		}
2428 	}
2429 
2430 	low_mask = (1 << shift) - 1;
2431 	high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
2432 	*pa = (pte & high_mask) | (va & low_mask);
2433 
2434 	log_debug("%s: final GPA for GVA 0x%llx = 0x%llx\n", __func__, va, *pa);
2435 
2436 	return (0);
2437 }
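
/*
 * Worked example: in long mode the walk starts at shift = L4_SHIFT (39)
 * and decreases by 9 per level. A 4 KiB mapping is resolved at level 1
 * with shift = 12, so low_mask = 0xfff and the result is
 * (pte & high_mask) | (va & 0xfff). If a 2 MiB superpage (PG_PS) is found
 * at level 2, shift is still 21, so the low 21 bits of 'va' carry through
 * to the physical address instead.
 */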
2438 
2439 /*
2440  * vm_pipe_init
2441  *
2442  * Initialize a vm_dev_pipe, setting up its file descriptors and its
2443  * event structure with the given callback.
2444  *
2445  * Parameters:
2446  *  p: pointer to vm_dev_pipe struct to initialize
2447  *  cb: callback to use for READ events on the read end of the pipe
2448  */
2449 void
2450 vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *))
2451 {
2452 	int ret;
2453 	int fds[2];
2454 
2455 	memset(p, 0, sizeof(struct vm_dev_pipe));
2456 
2457 	ret = pipe(fds);
2458 	if (ret)
2459 		fatal("failed to create vm_dev_pipe pipe");
2460 
2461 	p->read = fds[0];
2462 	p->write = fds[1];
2463 
2464 	event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, NULL);
2465 }
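
/*
 * Usage sketch: a device initializes its pipe and registers the read end
 * with libevent (the pipe variable and callback name are hypothetical):
 *
 *	vm_pipe_init(&dev_pipe, example_pipe_dispatch);
 *	event_add(&dev_pipe.read_ev, NULL);
 */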
2466 
2467 /*
2468  * vm_pipe_send
2469  *
2470  * Send a message to an emulated device via the provided vm_dev_pipe.
2471  *
2472  * Parameters:
2473  *  p: pointer to initialized vm_dev_pipe
2474  *  msg: message to send in the channel
2475  */
2476 void
2477 vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg)
2478 {
2479 	size_t n;
2480 	n = write(p->write, &msg, sizeof(msg));
2481 	if (n != sizeof(msg))
2482 		fatal("failed to write to device pipe");
2483 }
2484 
2485 /*
2486  * vm_pipe_recv
2487  *
2488  * Receive a message for an emulated device via the provided vm_dev_pipe.
2489  * Returns the message value, or exits fatally on read failure.
2490  *
2491  * Parameters:
2492  *  p: pointer to initialized vm_dev_pipe
2493  *
2494  * Return values:
2495  *  a value of enum pipe_msg_type or fatal exit on read(2) error
2496  */
2497 enum pipe_msg_type
2498 vm_pipe_recv(struct vm_dev_pipe *p)
2499 {
2500 	size_t n;
2501 	enum pipe_msg_type msg;
2502 	n = read(p->read, &msg, sizeof(msg));
2503 	if (n != sizeof(msg))
2504 		fatal("failed to read from device pipe");
2505 
2506 	return msg;
2507 }
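
/*
 * Usage sketch: vm_pipe_send() and vm_pipe_recv() form the two ends of
 * the channel. A worker thread posts a message and the read-end libevent
 * callback consumes it (pipe variable and message value are hypothetical):
 *
 *	vm_pipe_send(&dev_pipe, EXAMPLE_MSG);
 *
 * and, in the read-end callback:
 *
 *	enum pipe_msg_type msg = vm_pipe_recv(&dev_pipe);
 */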
2508 
2509 /*
2510  * Re-map the guest address space using the shared memory file descriptor.
2511  *
2512  * Returns 0 on success, non-zero in the event of failure.
2513  */
2514 int
2515 remap_guest_mem(struct vmd_vm *vm, int vmm_fd)
2516 {
2517 	struct vm_create_params	*vcp;
2518 	struct vm_mem_range	*vmr;
2519 	struct vm_sharemem_params vsp;
2520 	size_t			 i, j;
2521 	void			*p = NULL;
2522 	int			 ret;
2523 
2524 	if (vm == NULL)
2525 		return (1);
2526 
2527 	vcp = &vm->vm_params.vmc_params;
2528 
2529 	/*
2530 	 * Initialize our VM shared memory request using our original
2531 	 * creation parameters. We'll overwrite the va's after mmap(2).
2532 	 */
2533 	memset(&vsp, 0, sizeof(vsp));
2534 	vsp.vsp_nmemranges = vcp->vcp_nmemranges;
2535 	vsp.vsp_vm_id = vcp->vcp_id;
2536 	memcpy(&vsp.vsp_memranges, &vcp->vcp_memranges,
2537 	    sizeof(vsp.vsp_memranges));
2538 
2539 	/*
2540 	 * Use mmap(2) to identify virtual address space for our mappings.
2541 	 */
2542 	for (i = 0; i < VMM_MAX_MEM_RANGES; i++) {
2543 		if (i < vsp.vsp_nmemranges) {
2544 			vmr = &vsp.vsp_memranges[i];
2545 
2546 			/* Ignore any MMIO ranges. */
2547 			if (vmr->vmr_type == VM_MEM_MMIO) {
2548 				vmr->vmr_va = 0;
2549 				vcp->vcp_memranges[i].vmr_va = 0;
2550 				continue;
2551 			}
2552 
2553 			/* Make initial mappings for the memrange. */
2554 			p = mmap(NULL, vmr->vmr_size, PROT_READ, MAP_ANON, -1,
2555 			    0);
2556 			if (p == MAP_FAILED) {
2557 				ret = errno;
2558 				log_warn("%s: mmap", __func__);
2559 				for (j = 0; j < i; j++) {
2560 					vmr = &vcp->vcp_memranges[j];
2561 					munmap((void *)vmr->vmr_va,
2562 					    vmr->vmr_size);
2563 				}
2564 				return (ret);
2565 			}
2566 			vmr->vmr_va = (vaddr_t)p;
2567 			vcp->vcp_memranges[i].vmr_va = vmr->vmr_va;
2568 		}
2569 	}
2570 
2571 	/*
2572 	 * munmap(2) now that we have va's and ranges that don't overlap. vmm
2573 	 * will use the va's and sizes to recreate the mappings for us.
2574 	 */
2575 	for (i = 0; i < vsp.vsp_nmemranges; i++) {
2576 		vmr = &vsp.vsp_memranges[i];
2577 		if (vmr->vmr_type == VM_MEM_MMIO)
2578 			continue;
2579 		if (munmap((void*)vmr->vmr_va, vmr->vmr_size) == -1)
2580 			fatal("%s: munmap", __func__);
2581 	}
2582 
2583 	/*
2584 	 * Ask vmm to enter the shared mappings for us. They'll point
2585 	 * to the same host physical memory, but will have a randomized
2586 	 * virtual address for the calling process.
2587 	 */
2588 	if (ioctl(vmm_fd, VMM_IOC_SHAREMEM, &vsp) == -1)
2589 		return (errno);
2590 
2591 	return (0);
2592 }
2593