/*	$OpenBSD: vm.c,v 1.100 2024/04/29 14:47:06 dv Exp $	*/

/*
 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>	/* PAGE_SIZE, MAXCOMLEN */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#include <sys/wait.h>
#include <sys/uio.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/resource.h>

#include <dev/ic/i8253reg.h>
#include <dev/isa/isareg.h>
#include <dev/pci/pcireg.h>

#include <machine/psl.h>
#include <machine/pte.h>
#include <machine/specialreg.h>
#include <machine/vmmvar.h>

#include <net/if.h>

#include <errno.h>
#include <event.h>
#include <fcntl.h>
#include <imsg.h>
#include <limits.h>
#include <poll.h>
#include <pthread.h>
#include <pthread_np.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <util.h>

#include "atomicio.h"
#include "fw_cfg.h"
#include "i8253.h"
#include "i8259.h"
#include "loadfile.h"
#include "mc146818.h"
#include "mmio.h"
#include "ns8250.h"
#include "pci.h"
#include "virtio.h"
#include "vmd.h"
#include "vmm.h"
#define MB(x)	((x) * 1024UL * 1024UL)
#define GB(x)	((x) * 1024UL * 1024UL * 1024UL)

#define MMIO_NOTYET 0

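/*
 * Dispatch table mapping emulated x86 I/O ports to their handler
 * functions. It is populated by init_emulated_hw() (or, on vm receive,
 * restore_emulated_hw()) and consulted on every in/out exit in
 * vcpu_exit_inout() below.
 */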
io_fn_t ioports_map[MAX_PORTS];

static int run_vm(struct vmop_create_params *, struct vcpu_reg_state *);
void vm_dispatch_vmm(int, short, void *);
void *event_thread(void *);
void *vcpu_run_loop(void *);
int vcpu_exit(struct vm_run_params *);
int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
void create_memory_map(struct vm_create_params *);
static int vmm_create_vm(struct vmd_vm *);
int alloc_guest_mem(struct vmd_vm *);
void init_emulated_hw(struct vmop_create_params *, int,
    int[][VM_MAX_BASE_PER_DISK], int *);
void restore_emulated_hw(struct vm_create_params *, int, int *,
    int[][VM_MAX_BASE_PER_DISK], int);
void vcpu_exit_inout(struct vm_run_params *);
int vcpu_exit_eptviolation(struct vm_run_params *);
uint8_t vcpu_exit_pci(struct vm_run_params *);
int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
int loadfile_bios(gzFile, off_t, struct vcpu_reg_state *);
static int send_vm(int, struct vmd_vm *);
int dump_send_header(int);
static int dump_vmr(int, struct vm_mem_range *);
static int dump_mem(int, struct vmd_vm *);
void restore_vmr(int, struct vm_mem_range *);
void restore_mem(int, struct vm_create_params *);
int restore_vm_params(int, struct vm_create_params *);
static void pause_vm(struct vmd_vm *);
static void unpause_vm(struct vmd_vm *);

int translate_gva(struct vm_exit *, uint64_t, uint64_t *, int);

static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
    size_t);

int con_fd;
struct vmd_vm *current_vm;

extern struct vmd *env;

extern char *__progname;

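/*
 * Thread synchronization state. threadmutex/threadcond let the vcpu and
 * event threads signal completion (vcpu_done) back to run_vm(). The
 * per-vcpu cond/mutex pairs park halted vcpus, and the pause barrier
 * together with the unpause conds implements pause_vm()/unpause_vm().
 */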
pthread_mutex_t threadmutex;
pthread_cond_t threadcond;

pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
pthread_barrier_t vm_pause_barrier;
pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM];
pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM];
uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];

/*
 * Represents a standard register set for an OS to be booted
 * as a flat 64-bit address space.
 *
 * NOT set here are:
 *  RIP
 *  RSP
 *  GDTR BASE
 *
 * Specific bootloaders should clone this structure and override
 * those fields as needed.
 *
 * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
 *        features of the CPU in use.
 */
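/*
 * The segment register initializers below are of the form
 * { selector, limit, access rights, base }, matching the field order
 * of struct vcpu_segment_info.
 */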
static const struct vcpu_reg_state vcpu_init_flat64 = {
	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
	.vrs_gprs[VCPU_REGS_RIP] = 0x0,
	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
	.vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG,
	.vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
	.vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
	.vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
	.vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
	.vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
	.vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
	.vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
	.vrs_drs[VCPU_REGS_DR0] = 0x0,
	.vrs_drs[VCPU_REGS_DR1] = 0x0,
	.vrs_drs[VCPU_REGS_DR2] = 0x0,
	.vrs_drs[VCPU_REGS_DR3] = 0x0,
	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
	.vrs_drs[VCPU_REGS_DR7] = 0x400,
	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
	.vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
	.vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87
};

/*
 * Represents a standard register set for a BIOS to be booted
 * as a flat 16-bit address space.
 */
static const struct vcpu_reg_state vcpu_init_flat16 = {
	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
	.vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
	.vrs_crs[VCPU_REGS_CR0] = 0x60000010,
	.vrs_crs[VCPU_REGS_CR3] = 0,
	.vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
	.vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
	.vrs_msrs[VCPU_REGS_EFER] = 0ULL,
	.vrs_drs[VCPU_REGS_DR0] = 0x0,
	.vrs_drs[VCPU_REGS_DR1] = 0x0,
	.vrs_drs[VCPU_REGS_DR2] = 0x0,
	.vrs_drs[VCPU_REGS_DR3] = 0x0,
	.vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
	.vrs_drs[VCPU_REGS_DR7] = 0x400,
	.vrs_msrs[VCPU_REGS_STAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
	.vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
	.vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
	.vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87
};

/*
 * vm_main
 *
 * Primary entrypoint for launching a vm. Does not return.
 *
 * fd: file descriptor for communicating with the vmm process.
 * fd_vmm: file descriptor for communicating with the vmm(4) device.
 */
void
vm_main(int fd, int fd_vmm)
{
	struct vm_create_params	*vcp = NULL;
	struct vmd_vm		 vm;
	size_t			 sz = 0;
	int			 ret = 0;

	/*
	 * The vm process relies on global state. Set the fd for /dev/vmm.
	 */
	env->vmd_fd = fd_vmm;

	/*
	 * We aren't root, so we can't chroot(2). Use unveil(2) instead.
	 */
	if (unveil(env->argv0, "x") == -1)
		fatal("unveil %s", env->argv0);
	if (unveil(NULL, NULL) == -1)
		fatal("unveil lock");

	/*
	 * pledge in the vm processes:
	 * stdio - for malloc and basic I/O including events.
	 * vmm - for the vmm ioctls and operations.
	 * proc exec - fork/exec for launching devices.
	 * recvfd - for vm send/recv and sending fd to devices.
	 */
	if (pledge("stdio vmm proc exec recvfd", NULL) == -1)
		fatal("pledge");

	/* Receive our vm configuration. */
	memset(&vm, 0, sizeof(vm));
	sz = atomicio(read, fd, &vm, sizeof(vm));
	if (sz != sizeof(vm)) {
		log_warnx("failed to receive start message");
		_exit(EIO);
	}

	/* Update the process with the vm name. */
	vcp = &vm.vm_params.vmc_params;
	setproctitle("%s", vcp->vcp_name);
	log_procinit("vm/%s", vcp->vcp_name);

	/* Receive the local prefix settings. */
	sz = atomicio(read, fd, &env->vmd_cfg.cfg_localprefix,
	    sizeof(env->vmd_cfg.cfg_localprefix));
	if (sz != sizeof(env->vmd_cfg.cfg_localprefix)) {
		log_warnx("failed to receive local prefix");
		_exit(EIO);
	}

	/*
	 * We need, at minimum, a vm_kernel fd to boot a vm. This is either a
	 * kernel or a BIOS image.
	 */
	if (!(vm.vm_state & VM_STATE_RECEIVED)) {
		if (vm.vm_kernel == -1) {
			log_warnx("%s: failed to receive boot fd",
			    vcp->vcp_name);
			_exit(EINVAL);
		}
	}

	ret = start_vm(&vm, fd);
	_exit(ret);
}

/*
 * loadfile_bios
 *
 * As an alternative to loadfile_elf, this function loads a non-ELF BIOS
 * image directly into memory.
 *
 * Parameters:
 *  fp: gzFile handle of the BIOS image to load
 *  size: uncompressed size of the image
 *  (out) vrs: register state to set on init for this kernel
 *
 * Return values:
 *  0 if successful
 *  -1 on failure, with errno set (e.g. EIO on a short read)
 */
int
loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs)
{
	off_t	 off;

	/* Set up a "flat 16 bit" register state for BIOS */
	memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));

	/* Seek to the beginning of the BIOS image */
	if (gzseek(fp, 0, SEEK_SET) == -1)
		return (-1);

	/* The BIOS image must end at 1MB */
	if ((off = MB(1) - size) < 0)
		return (-1);

	/* Read BIOS image into memory */
	if (mread(fp, off, size) != (size_t)size) {
		errno = EIO;
		return (-1);
	}

	if (gzseek(fp, 0, SEEK_SET) == -1)
		return (-1);

	/* Read a second BIOS copy into memory ending at 4GB */
	off = GB(4) - size;
	if (mread(fp, off, size) != (size_t)size) {
		errno = EIO;
		return (-1);
	}

	log_debug("%s: loaded BIOS image", __func__);

	return (0);
}

/*
 * start_vm
 *
 * After forking a new VM process, starts the new VM with the creation
 * parameters supplied (in the incoming vm->vm_params field). This
 * function performs a basic sanity check on the incoming parameters
 * and then performs the following steps to complete the creation of the VM:
 *
 * 1. validates and creates the new VM
 * 2. opens the imsg control channel to the parent and drops more privilege
 * 3. drops additional privileges by calling pledge(2)
 * 4. loads the kernel from the disk image or file descriptor
 * 5. runs the VM's VCPU loops
 *
 * Parameters:
 *  vm: the VM data structure, including the VM create parameters
 *  fd: the imsg socket that is connected to the parent process
 *
 * Return values:
 *  0: success
 *  !0: failure - typically an errno indicating the source of the failure
 */
int
start_vm(struct vmd_vm *vm, int fd)
{
	struct vmop_create_params *vmc = &vm->vm_params;
	struct vm_create_params	*vcp = &vmc->vmc_params;
	struct vcpu_reg_state	 vrs;
	int			 nicfds[VM_MAX_NICS_PER_VM];
	int			 ret;
	gzFile			 fp;
	size_t			 i;
	struct vm_rwregs_params  vrp;
	struct stat		 sb;

	/*
	 * We first try to initialize and allocate memory before bothering
	 * vmm(4) with a request to create a new vm.
	 */
	if (!(vm->vm_state & VM_STATE_RECEIVED))
		create_memory_map(vcp);

	ret = alloc_guest_mem(vm);
	if (ret) {
		struct rlimit lim;
		char buf[FMT_SCALED_STRSIZE];
		if (ret == ENOMEM && getrlimit(RLIMIT_DATA, &lim) == 0) {
			if (fmt_scaled(lim.rlim_cur, buf) == 0)
				fatalx("could not allocate guest memory (data "
				    "limit is %s)", buf);
		}
		errno = ret;
		log_warn("could not allocate guest memory");
		return (ret);
	}

	/* We've allocated guest memory, so now create the vm in vmm(4). */
	ret = vmm_create_vm(vm);
	if (ret) {
		/* Let the vmm process know we failed by sending a 0 vm id. */
		vcp->vcp_id = 0;
		atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id));
		return (ret);
	}

	/*
	 * Some of vmd currently relies on global state (current_vm, con_fd).
	 */
	current_vm = vm;
	con_fd = vm->vm_tty;
	if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1) {
		log_warn("failed to set nonblocking mode on console");
		return (1);
	}

	/*
	 * We now let the vmm process know we were successful by sending it our
	 * vmm(4) assigned vm id.
	 */
	if (atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
	    sizeof(vcp->vcp_id)) {
		log_warn("failed to send created vm id to vmm process");
		return (1);
	}

	/* Either prepare our boot image or receive an existing vm to launch. */
	if (vm->vm_state & VM_STATE_RECEIVED) {
		ret = atomicio(read, vm->vm_receive_fd, &vrp, sizeof(vrp));
		if (ret != sizeof(vrp))
			fatal("received incomplete vrp - exiting");
		vrs = vrp.vrwp_regs;
	} else {
		/*
		 * Set up default "flat 64 bit" register state - RIP,
		 * RSP, and GDT info will be set in bootloader
		 */
		memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs));

		/* Find and open kernel image */
		if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL)
			fatalx("failed to open kernel - exiting");

		/* Load kernel image */
		ret = loadfile_elf(fp, vm, &vrs, vmc->vmc_bootdevice);

		/*
		 * Try BIOS as a fallback (only if it was provided as an image
		 * with vm->vm_kernel and the file is not compressed)
		 */
		if (ret && errno == ENOEXEC && vm->vm_kernel != -1 &&
		    gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0)
			ret = loadfile_bios(fp, sb.st_size, &vrs);

		if (ret)
			fatal("failed to load kernel or BIOS - exiting");

		gzclose(fp);
	}

	if (vm->vm_kernel != -1)
		close_fd(vm->vm_kernel);

	/* Initialize our mutexes. */
	ret = pthread_mutex_init(&threadmutex, NULL);
	if (ret) {
		log_warn("%s: could not initialize thread state mutex",
		    __func__);
		return (ret);
	}
	ret = pthread_cond_init(&threadcond, NULL);
	if (ret) {
		log_warn("%s: could not initialize thread state "
		    "condition variable", __func__);
		return (ret);
	}
	mutex_lock(&threadmutex);

	/*
	 * Finalize our communication socket with the vmm process. From here
	 * onwards, communication with the vmm process is event-based.
	 */
	event_init();
	if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
		fatal("setup vm pipe");

	/*
	 * Initialize or restore our emulated hardware.
	 */
	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
		nicfds[i] = vm->vm_ifs[i].vif_fd;

	if (vm->vm_state & VM_STATE_RECEIVED) {
		restore_mem(vm->vm_receive_fd, vcp);
		restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
		    vm->vm_disks, vm->vm_cdrom);
		if (restore_vm_params(vm->vm_receive_fd, vcp))
			fatal("restore vm params failed");
		unpause_vm(vm);
	} else
		init_emulated_hw(vmc, vm->vm_cdrom, vm->vm_disks, nicfds);

	/* Drop privileges further before starting the vcpu run loop(s). */
	if (pledge("stdio vmm recvfd", NULL) == -1)
		fatal("pledge");

	/*
	 * Execute the vcpu run loop(s) for this VM.
	 */
	ret = run_vm(&vm->vm_params, &vrs);

	/* Ensure that any in-flight data is written back */
	virtio_shutdown(vm);

	return (ret);
}

/*
 * vm_dispatch_vmm
 *
 * imsg callback for messages that are received from the vmm parent process.
 */
void
vm_dispatch_vmm(int fd, short event, void *arg)
{
	struct vmd_vm		*vm = arg;
	struct vmop_result	 vmr;
	struct vmop_addr_result	 var;
	struct imsgev		*iev = &vm->vm_iev;
	struct imsgbuf		*ibuf = &iev->ibuf;
	struct imsg		 imsg;
	ssize_t			 n;
	int			 verbose;

	if (event & EV_READ) {
		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
			fatal("%s: imsg_read", __func__);
		if (n == 0)
			_exit(0);
	}

	if (event & EV_WRITE) {
		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
			fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
		if (n == 0)
			_exit(0);
	}

	for (;;) {
		if ((n = imsg_get(ibuf, &imsg)) == -1)
			fatal("%s: imsg_get", __func__);
		if (n == 0)
			break;

#if DEBUG > 1
		log_debug("%s: got imsg %d from %s",
		    __func__, imsg.hdr.type,
		    vm->vm_params.vmc_params.vcp_name);
#endif

		switch (imsg.hdr.type) {
		case IMSG_CTL_VERBOSE:
			IMSG_SIZE_CHECK(&imsg, &verbose);
			memcpy(&verbose, imsg.data, sizeof(verbose));
			log_setverbose(verbose);
			virtio_broadcast_imsg(vm, IMSG_CTL_VERBOSE, &verbose,
			    sizeof(verbose));
			break;
		case IMSG_VMDOP_VM_SHUTDOWN:
			if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
				_exit(0);
			break;
		case IMSG_VMDOP_VM_REBOOT:
			if (vmmci_ctl(VMMCI_REBOOT) == -1)
				_exit(0);
			break;
		case IMSG_VMDOP_PAUSE_VM:
			vmr.vmr_result = 0;
			vmr.vmr_id = vm->vm_vmid;
			pause_vm(vm);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_PAUSE_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			break;
		case IMSG_VMDOP_UNPAUSE_VM:
			vmr.vmr_result = 0;
			vmr.vmr_id = vm->vm_vmid;
			unpause_vm(vm);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			break;
		case IMSG_VMDOP_SEND_VM_REQUEST:
			vmr.vmr_id = vm->vm_vmid;
			vmr.vmr_result = send_vm(imsg_get_fd(&imsg), vm);
			imsg_compose_event(&vm->vm_iev,
			    IMSG_VMDOP_SEND_VM_RESPONSE,
			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
			    sizeof(vmr));
			if (!vmr.vmr_result) {
				imsg_flush(&current_vm->vm_iev.ibuf);
				_exit(0);
			}
			break;
		case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
			IMSG_SIZE_CHECK(&imsg, &var);
			memcpy(&var, imsg.data, sizeof(var));

			log_debug("%s: received tap addr %s for nic %d",
			    vm->vm_params.vmc_params.vcp_name,
			    ether_ntoa((void *)var.var_addr), var.var_nic_idx);

			vionet_set_hostmac(vm, var.var_nic_idx, var.var_addr);
			break;
		default:
			fatalx("%s: got invalid imsg %d from %s",
			    __func__, imsg.hdr.type,
			    vm->vm_params.vmc_params.vcp_name);
		}
		imsg_free(&imsg);
	}
	imsg_event_add(iev);
}

/*
 * vm_shutdown
 *
 * Tell the vmm parent process to shut down or reboot the VM and exit.
 */
__dead void
vm_shutdown(unsigned int cmd)
{
	switch (cmd) {
	case VMMCI_NONE:
	case VMMCI_SHUTDOWN:
		(void)imsg_compose_event(&current_vm->vm_iev,
		    IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0);
		break;
	case VMMCI_REBOOT:
		(void)imsg_compose_event(&current_vm->vm_iev,
		    IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0);
		break;
	default:
		fatalx("invalid vm ctl command: %d", cmd);
	}
	imsg_flush(&current_vm->vm_iev.ibuf);

	_exit(0);
}

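/*
 * send_vm
 *
 * Serializes a paused vm to fd for save/migration, in this order: dump
 * header, create params, per-vcpu register state, guest memory, emulated
 * device state, and per-vcpu vm params. On success the vm is terminated
 * in vmm(4); on failure it is unpaused and resumes running.
 */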
int
send_vm(int fd, struct vmd_vm *vm)
{
	struct vm_rwregs_params	   vrp;
	struct vm_rwvmparams_params vpp;
	struct vmop_create_params *vmc;
	struct vm_terminate_params vtp;
	unsigned int		   flags = 0;
	unsigned int		   i;
	int			   ret = 0;
	size_t			   sz;

	if (dump_send_header(fd)) {
		log_warnx("%s: failed to send vm dump header", __func__);
		goto err;
	}

	pause_vm(vm);

	vmc = calloc(1, sizeof(struct vmop_create_params));
	if (vmc == NULL) {
		log_warn("%s: calloc error getting vmc", __func__);
		ret = -1;
		goto err;
	}

	flags |= VMOP_CREATE_MEMORY;
	memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct
	    vmop_create_params));
	vmc->vmc_flags = flags;
	vrp.vrwp_vm_id = vm->vm_params.vmc_params.vcp_id;
	vrp.vrwp_mask = VM_RWREGS_ALL;
	vpp.vpp_mask = VM_RWVMPARAMS_ALL;
	vpp.vpp_vm_id = vm->vm_params.vmc_params.vcp_id;

	sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params));
	if (sz != sizeof(struct vmop_create_params)) {
		ret = -1;
		goto err;
	}

	for (i = 0; i < vm->vm_params.vmc_params.vcp_ncpus; i++) {
		vrp.vrwp_vcpu_id = i;
		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
			log_warn("%s: readregs failed", __func__);
			goto err;
		}

		sz = atomicio(vwrite, fd, &vrp,
		    sizeof(struct vm_rwregs_params));
		if (sz != sizeof(struct vm_rwregs_params)) {
			log_warn("%s: dumping registers failed", __func__);
			ret = -1;
			goto err;
		}
	}

	/* Dump memory before devices to aid in restoration. */
	if ((ret = dump_mem(fd, vm)))
		goto err;
	if ((ret = i8253_dump(fd)))
		goto err;
	if ((ret = i8259_dump(fd)))
		goto err;
	if ((ret = ns8250_dump(fd)))
		goto err;
	if ((ret = mc146818_dump(fd)))
		goto err;
	if ((ret = fw_cfg_dump(fd)))
		goto err;
	if ((ret = pci_dump(fd)))
		goto err;
	if ((ret = virtio_dump(fd)))
		goto err;

	for (i = 0; i < vm->vm_params.vmc_params.vcp_ncpus; i++) {
		vpp.vpp_vcpu_id = i;
		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) {
			log_warn("%s: readvmparams failed", __func__);
			goto err;
		}

		sz = atomicio(vwrite, fd, &vpp,
		    sizeof(struct vm_rwvmparams_params));
		if (sz != sizeof(struct vm_rwvmparams_params)) {
			log_warn("%s: dumping vm params failed", __func__);
			ret = -1;
			goto err;
		}
	}

	vtp.vtp_vm_id = vm->vm_params.vmc_params.vcp_id;
	if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) {
		log_warnx("%s: term IOC error: %d, %d", __func__,
		    errno, ENOENT);
	}
err:
	close(fd);
	if (ret)
		unpause_vm(vm);
	return ret;
}

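/*
 * dump_send_header
 *
 * Writes the vm dump header to fd, capturing the host's values for a
 * fixed set of cpuid leaves so the receiving side can check that the
 * dump is compatible with its own CPU.
 */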
int
dump_send_header(int fd)
{
	struct vm_dump_header	   vmh;
	int			   i;

	memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE,
	    sizeof(vmh.vmh_signature));

	vmh.vmh_cpuids[0].code = 0x00;
	vmh.vmh_cpuids[0].leaf = 0x00;

	vmh.vmh_cpuids[1].code = 0x01;
	vmh.vmh_cpuids[1].leaf = 0x00;

	vmh.vmh_cpuids[2].code = 0x07;
	vmh.vmh_cpuids[2].leaf = 0x00;

	vmh.vmh_cpuids[3].code = 0x0d;
	vmh.vmh_cpuids[3].leaf = 0x00;

	vmh.vmh_cpuids[4].code = 0x80000001;
	vmh.vmh_cpuids[4].leaf = 0x00;

	vmh.vmh_version = VM_DUMP_VERSION;

	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
		CPUID_LEAF(vmh.vmh_cpuids[i].code,
		    vmh.vmh_cpuids[i].leaf,
		    vmh.vmh_cpuids[i].a,
		    vmh.vmh_cpuids[i].b,
		    vmh.vmh_cpuids[i].c,
		    vmh.vmh_cpuids[i].d);
	}

	if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
		return (-1);

	return (0);
}

int
dump_mem(int fd, struct vmd_vm *vm)
{
	unsigned int	i;
	int		ret;
	struct		vm_mem_range *vmr;

	for (i = 0; i < vm->vm_params.vmc_params.vcp_nmemranges; i++) {
		vmr = &vm->vm_params.vmc_params.vcp_memranges[i];
		ret = dump_vmr(fd, vmr);
		if (ret)
			return ret;
	}
	return (0);
}

int
restore_vm_params(int fd, struct vm_create_params *vcp)
{
	unsigned int			i;
	struct vm_rwvmparams_params    vpp;

	for (i = 0; i < vcp->vcp_ncpus; i++) {
		if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) {
			log_warn("%s: error restoring vm params", __func__);
			return (-1);
		}
		vpp.vpp_vm_id = vcp->vcp_id;
		vpp.vpp_vcpu_id = i;
		if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) {
			log_debug("%s: writing vm params failed", __func__);
			return (-1);
		}
	}
	return (0);
}

void
restore_mem(int fd, struct vm_create_params *vcp)
{
	unsigned int	     i;
	struct vm_mem_range *vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		restore_vmr(fd, vmr);
	}
}

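/*
 * dump_vmr
 *
 * Writes one guest memory range to fd in PAGE_SIZE chunks. Note: the
 * loop assumes vmr_size is a multiple of PAGE_SIZE, which holds for the
 * ranges built by create_memory_map(). restore_vmr() below makes the
 * same assumption in the other direction.
 */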
int
dump_vmr(int fd, struct vm_mem_range *vmr)
{
	size_t	rem = vmr->vmr_size, read = 0;
	char	buf[PAGE_SIZE];

	while (rem > 0) {
		if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) {
			log_warn("failed to read vmr");
			return (-1);
		}
		if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) {
			log_warn("failed to dump vmr");
			return (-1);
		}
		rem -= PAGE_SIZE;
		read += PAGE_SIZE;
	}
	return (0);
}

void
restore_vmr(int fd, struct vm_mem_range *vmr)
{
	size_t	rem = vmr->vmr_size, wrote = 0;
	char	buf[PAGE_SIZE];

	while (rem > 0) {
		if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf))
			fatal("failed to restore vmr");
		if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE))
			fatal("failed to write vmr");
		rem -= PAGE_SIZE;
		wrote += PAGE_SIZE;
	}
}

static void
pause_vm(struct vmd_vm *vm)
{
	unsigned int n;
	int ret;

	if (vm->vm_state & VM_STATE_PAUSED)
		return;

	current_vm->vm_state |= VM_STATE_PAUSED;

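	/*
	 * The barrier is sized for every vcpu thread plus this thread, so
	 * pause_vm() returns only after every vcpu has parked itself in
	 * vcpu_run_loop().
	 */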
	ret = pthread_barrier_init(&vm_pause_barrier, NULL,
	    vm->vm_params.vmc_params.vcp_ncpus + 1);
	if (ret) {
		log_warnx("%s: cannot initialize pause barrier (%d)",
		    __progname, ret);
		return;
	}

	for (n = 0; n < vm->vm_params.vmc_params.vcp_ncpus; n++) {
		ret = pthread_cond_broadcast(&vcpu_run_cond[n]);
		if (ret) {
			log_warnx("%s: can't broadcast vcpu run cond (%d)",
			    __func__, (int)ret);
			return;
		}
	}
	ret = pthread_barrier_wait(&vm_pause_barrier);
	if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
		log_warnx("%s: could not wait on pause barrier (%d)",
		    __func__, (int)ret);
		return;
	}

	ret = pthread_barrier_destroy(&vm_pause_barrier);
	if (ret) {
		log_warnx("%s: could not destroy pause barrier (%d)",
		    __progname, ret);
		return;
	}

	i8253_stop();
	mc146818_stop();
	ns8250_stop();
	virtio_stop(vm);
}

static void
unpause_vm(struct vmd_vm *vm)
{
	unsigned int n;
	int ret;

	if (!(vm->vm_state & VM_STATE_PAUSED))
		return;

	current_vm->vm_state &= ~VM_STATE_PAUSED;
	for (n = 0; n < vm->vm_params.vmc_params.vcp_ncpus; n++) {
		ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]);
		if (ret) {
			log_warnx("%s: can't broadcast vcpu unpause cond (%d)",
			    __func__, (int)ret);
			return;
		}
	}

	i8253_start();
	mc146818_start();
	ns8250_start();
	virtio_start(vm);
}

/*
 * vcpu_reset
 *
 * Requests vmm(4) to reset the indicated VCPU in the indicated VM to
 * the register state provided.
 *
 * Parameters
 *  vmid: VM ID to reset
 *  vcpu_id: VCPU ID to reset
 *  vrs: the register state to initialize
 *
 * Return values:
 *  0: success
 *  !0: ioctl to vmm(4) failed (e.g., ENOENT if the supplied VM ID is not
 *      valid)
 */
int
vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
{
	struct vm_resetcpu_params vrp;

	memset(&vrp, 0, sizeof(vrp));
	vrp.vrp_vm_id = vmid;
	vrp.vrp_vcpu_id = vcpu_id;
	memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));

	log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);

	if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1)
		return (errno);

	return (0);
}

/*
 * create_memory_map
 *
 * Sets up the guest physical memory ranges that the VM can access.
 *
 * Parameters:
 *  vcp: VM create parameters describing the VM whose memory map
 *       is being created
 *
 * Return values:
 *  nothing
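 *
 * The layout produced below, assuming the guest has enough memory to
 * populate every region, is:
 *
 *  [0, LOWMEM_KB * 1024)		RAM (DOS low memory)
 *  [LOWMEM_KB * 1024, 1MB)		reserved (VGA/ROM hole)
 *  [1MB, VMM_PCI_MMIO_BAR_BASE)	RAM
 *  [BAR_BASE, VMM_PCI_MMIO_BAR_END]	PCI MMIO
 *  (VMM_PCI_MMIO_BAR_END, 4GB)		reserved (2nd BIOS copy)
 *  [4GB, ...)				RAM (any remainder)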
 */
void
create_memory_map(struct vm_create_params *vcp)
{
	size_t len, mem_bytes;
	size_t above_1m = 0, above_4g = 0;

	mem_bytes = vcp->vcp_memranges[0].vmr_size;
	vcp->vcp_nmemranges = 0;
	if (mem_bytes == 0 || mem_bytes > VMM_MAX_VM_MEM_SIZE)
		return;

	/* First memory region: 0 - LOWMEM_KB (DOS low mem) */
	len = LOWMEM_KB * 1024;
	vcp->vcp_memranges[0].vmr_gpa = 0x0;
	vcp->vcp_memranges[0].vmr_size = len;
	vcp->vcp_memranges[0].vmr_type = VM_MEM_RAM;
	mem_bytes -= len;

	/*
	 * Second memory region: LOWMEM_KB - 1MB.
	 *
	 * N.B. - Normally ROMs or parts of video RAM are mapped here.
	 * We have to add this region, because some systems
	 * unconditionally write to 0xb8000 (VGA RAM), and
	 * we need to make sure that vmm(4) permits accesses
	 * to it. So allocate guest memory for it.
	 */
	len = MB(1) - (LOWMEM_KB * 1024);
	vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
	vcp->vcp_memranges[1].vmr_size = len;
	vcp->vcp_memranges[1].vmr_type = VM_MEM_RESERVED;
	mem_bytes -= len;

	/* If we have less than 2MB remaining, still create a 2nd BIOS area. */
	if (mem_bytes <= MB(2)) {
		vcp->vcp_memranges[2].vmr_gpa = VMM_PCI_MMIO_BAR_END;
		vcp->vcp_memranges[2].vmr_size = MB(2);
		vcp->vcp_memranges[2].vmr_type = VM_MEM_RESERVED;
		vcp->vcp_nmemranges = 3;
		return;
	}

	/*
	 * Calculate how to split the remaining memory across the 4GB
	 * boundary while making sure we do not place physical memory into
	 * MMIO ranges.
	 */
	if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - MB(1)) {
		above_1m = VMM_PCI_MMIO_BAR_BASE - MB(1);
		above_4g = mem_bytes - above_1m;
	} else {
		above_1m = mem_bytes;
		above_4g = 0;
	}

	/* Third memory region: area above 1MB to MMIO region */
	vcp->vcp_memranges[2].vmr_gpa = MB(1);
	vcp->vcp_memranges[2].vmr_size = above_1m;
	vcp->vcp_memranges[2].vmr_type = VM_MEM_RAM;

	/* Fourth region: PCI MMIO range */
	vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_BASE;
	vcp->vcp_memranges[3].vmr_size = VMM_PCI_MMIO_BAR_END -
	    VMM_PCI_MMIO_BAR_BASE + 1;
	vcp->vcp_memranges[3].vmr_type = VM_MEM_MMIO;

	/* Fifth region: 2nd copy of BIOS above MMIO ending at 4GB */
	vcp->vcp_memranges[4].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
	vcp->vcp_memranges[4].vmr_size = MB(2);
	vcp->vcp_memranges[4].vmr_type = VM_MEM_RESERVED;

	/* Sixth region: any remainder above 4GB */
	if (above_4g > 0) {
		vcp->vcp_memranges[5].vmr_gpa = GB(4);
		vcp->vcp_memranges[5].vmr_size = above_4g;
		vcp->vcp_memranges[5].vmr_type = VM_MEM_RAM;
		vcp->vcp_nmemranges = 6;
	} else
		vcp->vcp_nmemranges = 5;
}

/*
 * alloc_guest_mem
 *
 * Allocates memory for the guest.
 * Instead of doing a single allocation with one mmap(), we allocate memory
 * separately for every range for the following reasons:
 * - ASLR for the individual ranges
 * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
 *   map the single mmap'd userspace memory to the individual guest physical
 *   memory ranges, the underlying amap of the single mmap'd range would have
 *   to allocate per-page reference counters. The reason is that the
 *   individual guest physical ranges would reference the single mmap'd region
 *   only partially. However, if every guest physical range has its own
 *   corresponding mmap'd userspace allocation, there are no partial
 *   references: every guest physical range fully references an mmap'd
 *   range => no per-page reference counters have to be allocated.
 *
 * Return values:
 *  0: success
 *  !0: failure - errno indicating the source of the failure
 */
int
alloc_guest_mem(struct vmd_vm *vm)
{
	void *p;
	int ret = 0;
	size_t i, j;
	struct vm_create_params *vcp = &vm->vm_params.vmc_params;
	struct vm_mem_range *vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];

		/*
		 * We only need R/W as userland. vmm(4) will use R/W/X in its
		 * mapping.
		 *
		 * We must use MAP_SHARED so emulated devices will be able
		 * to generate shared mappings.
		 */
		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
		    MAP_ANON | MAP_CONCEAL | MAP_SHARED, -1, 0);
		if (p == MAP_FAILED) {
			ret = errno;
			for (j = 0; j < i; j++) {
				vmr = &vcp->vcp_memranges[j];
				munmap((void *)vmr->vmr_va, vmr->vmr_size);
			}
			return (ret);
		}
		vmr->vmr_va = (vaddr_t)p;
	}

	return (ret);
}

/*
 * vmm_create_vm
 *
 * Requests vmm(4) to create a new VM using the supplied creation
 * parameters. This operation results in the creation of the in-kernel
 * structures for the VM, but does not start the VM's vcpu(s).
 *
 * Parameters:
 *  vm: pointer to the vm object
 *
 * Return values:
 *  0: success
 *  !0: ioctl to vmm(4) failed
 */
static int
vmm_create_vm(struct vmd_vm *vm)
{
	struct vm_create_params *vcp = &vm->vm_params.vmc_params;

	/* Sanity check arguments */
	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nmemranges == 0 ||
	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
		return (EINVAL);

	if (vm->vm_params.vmc_ndisks > VM_MAX_DISKS_PER_VM)
		return (EINVAL);

	if (vm->vm_params.vmc_nnics > VM_MAX_NICS_PER_VM)
		return (EINVAL);

	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1)
		return (errno);

	return (0);
}

/*
 * init_emulated_hw
 *
 * Initializes the userspace hardware emulation
 */
void
init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
    int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
{
	struct vm_create_params *vcp = &vmc->vmc_params;
	size_t i;
	uint64_t memlo, memhi;

	/* Calculate memory size for NVRAM registers */
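	/*
	 * memlo is the RAM above 16MB (the range based at 1MB, less 15MB)
	 * and memhi the RAM above 4GB; both sizes are exposed to the guest
	 * through the mc146818 NVRAM extended-memory registers.
	 */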
	memlo = memhi = 0;
	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		if (vcp->vcp_memranges[i].vmr_gpa == MB(1) &&
		    vcp->vcp_memranges[i].vmr_size > (15 * MB(1)))
			memlo = vcp->vcp_memranges[i].vmr_size - (15 * MB(1));
		else if (vcp->vcp_memranges[i].vmr_gpa == GB(4))
			memhi = vcp->vcp_memranges[i].vmr_size;
	}

	/* Reset the IO port map */
	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);

	/* Init i8253 PIT */
	i8253_init(vcp->vcp_id);
	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
	ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc;

	/* Init mc146818 RTC */
	mc146818_init(vcp->vcp_id, memlo, memhi);
	ioports_map[IO_RTC] = vcpu_exit_mc146818;
	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;

	/* Init master and slave PICs */
	i8259_init();
	ioports_map[IO_ICU1] = vcpu_exit_i8259;
	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
	ioports_map[IO_ICU2] = vcpu_exit_i8259;
	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
	ioports_map[ELCR0] = vcpu_exit_elcr;
	ioports_map[ELCR1] = vcpu_exit_elcr;

	/* Init ns8250 UART */
	ns8250_init(con_fd, vcp->vcp_id);
	for (i = COM1_DATA; i <= COM1_SCR; i++)
		ioports_map[i] = vcpu_exit_com;

	/* Initialize PCI */
	for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++)
		ioports_map[i] = vcpu_exit_pci;

	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
	pci_init();

	/* Initialize virtio devices */
	virtio_init(current_vm, child_cdrom, child_disks, child_taps);

	/*
	 * Init QEMU fw_cfg interface. Must be done last for PCI hardware
	 * detection.
	 */
	fw_cfg_init(vmc);
	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
}

/*
 * restore_emulated_hw
 *
 * Restores the userspace hardware emulation from fd
 */
void
restore_emulated_hw(struct vm_create_params *vcp, int fd,
    int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
{
	int i;

	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);

	/* Init i8253 PIT */
	i8253_restore(fd, vcp->vcp_id);
	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;

	/* Init master and slave PICs */
	i8259_restore(fd);
	ioports_map[IO_ICU1] = vcpu_exit_i8259;
	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
	ioports_map[IO_ICU2] = vcpu_exit_i8259;
	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;

	/* Init ns8250 UART */
	ns8250_restore(fd, con_fd, vcp->vcp_id);
	for (i = COM1_DATA; i <= COM1_SCR; i++)
		ioports_map[i] = vcpu_exit_com;

	/* Init mc146818 RTC */
	mc146818_restore(fd, vcp->vcp_id);
	ioports_map[IO_RTC] = vcpu_exit_mc146818;
	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;

	/* Init QEMU fw_cfg interface */
	fw_cfg_restore(fd);
	ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
	ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
	ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;

	/* Initialize PCI */
	for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++)
		ioports_map[i] = vcpu_exit_pci;

	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
	pci_restore(fd);
	virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
}

/*
 * run_vm
 *
 * Runs the VM whose creation parameters are specified in vmc
 *
 * Parameters:
 *  vmc: vmop_create_params struct containing the VM's desired creation
 *      configuration
 *  vrs: VCPU register state to initialize
 *
 * Return values:
 *  0: the VM exited normally
 *  !0: the VM exited abnormally or failed to start
 */
static int
run_vm(struct vmop_create_params *vmc, struct vcpu_reg_state *vrs)
{
	struct vm_create_params *vcp = &vmc->vmc_params;
	struct vm_rwregs_params vregsp;
	uint8_t evdone = 0;
	size_t i;
	int ret;
	pthread_t *tid, evtid;
	char tname[MAXCOMLEN + 1];
	struct vm_run_params **vrp;
	void *exit_status;

	if (vcp == NULL)
		return (EINVAL);

	if (vcp->vcp_nmemranges == 0 ||
	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
		return (EINVAL);

	tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
	vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
	if (tid == NULL || vrp == NULL) {
		log_warn("%s: memory allocation error - exiting.",
		    __progname);
		return (ENOMEM);
	}

	log_debug("%s: starting %zu vcpu thread(s) for vm %s", __func__,
	    vcp->vcp_ncpus, vcp->vcp_name);

	/*
	 * Create and launch one thread for each VCPU. These threads may
	 * migrate between PCPUs over time; any necessary reloading of CPU
	 * state in such situations is detected and performed by vmm(4) in
	 * the kernel.
	 */
	for (i = 0; i < vcp->vcp_ncpus; i++) {
		vrp[i] = malloc(sizeof(struct vm_run_params));
		if (vrp[i] == NULL) {
			log_warn("%s: memory allocation error - "
			    "exiting.", __progname);
			/* caller will exit, so skip freeing */
			return (ENOMEM);
		}
		vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit));
		if (vrp[i]->vrp_exit == NULL) {
			log_warn("%s: memory allocation error - "
			    "exiting.", __progname);
			/* caller will exit, so skip freeing */
			return (ENOMEM);
		}
		vrp[i]->vrp_vm_id = vcp->vcp_id;
		vrp[i]->vrp_vcpu_id = i;

		if (vcpu_reset(vcp->vcp_id, i, vrs)) {
			log_warnx("%s: cannot reset VCPU %zu - exiting.",
			    __progname, i);
			return (EIO);
		}

		/* Write the regs once more: vcpu_reset() above changed them. */
		if (current_vm->vm_state & VM_STATE_RECEIVED) {
			vregsp.vrwp_vm_id = vcp->vcp_id;
			vregsp.vrwp_vcpu_id = i;
			vregsp.vrwp_regs = *vrs;
			vregsp.vrwp_mask = VM_RWREGS_ALL;
			if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS,
			    &vregsp)) == -1) {
				log_warn("%s: writeregs failed", __func__);
				return (ret);
			}
		}

		ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize cond var (%d)",
			    __progname, ret);
			return (ret);
		}

		ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize mtx (%d)",
			    __progname, ret);
			return (ret);
		}

		ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize unpause var (%d)",
			    __progname, ret);
			return (ret);
		}

		ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize unpause mtx (%d)",
			    __progname, ret);
			return (ret);
		}

		vcpu_hlt[i] = 0;

		/* Start each VCPU run thread at vcpu_run_loop */
		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
		if (ret) {
			/* caller will _exit after this return */
			errno = ret;
			log_warn("%s: could not create vcpu thread %zu",
			    __func__, i);
			return (ret);
		}

		snprintf(tname, sizeof(tname), "vcpu-%zu", i);
		pthread_set_name_np(tid[i], tname);
	}

	log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
	ret = pthread_create(&evtid, NULL, event_thread, &evdone);
	if (ret) {
		errno = ret;
		log_warn("%s: could not create event thread", __func__);
		return (ret);
	}
	pthread_set_name_np(evtid, "event");

	for (;;) {
		ret = pthread_cond_wait(&threadcond, &threadmutex);
		if (ret) {
			log_warn("%s: waiting on thread state condition "
			    "variable failed", __func__);
			return (ret);
		}

		/*
		 * Did a VCPU thread exit with an error? => return the first one
		 */
		for (i = 0; i < vcp->vcp_ncpus; i++) {
			if (vcpu_done[i] == 0)
				continue;

			if (pthread_join(tid[i], &exit_status)) {
				log_warn("%s: failed to join thread %zu - "
				    "exiting", __progname, i);
				return (EIO);
			}

			ret = (intptr_t)exit_status;
		}

		/* Did the event thread exit? => return with an error */
		if (evdone) {
			if (pthread_join(evtid, &exit_status)) {
				log_warn("%s: failed to join event thread - "
				    "exiting", __progname);
				return (EIO);
			}

			log_warnx("%s: vm %d event thread exited "
			    "unexpectedly", __progname, vcp->vcp_id);
			return (EIO);
		}

		/* Did all VCPU threads exit successfully? => return */
		for (i = 0; i < vcp->vcp_ncpus; i++) {
			if (vcpu_done[i] == 0)
				break;
		}
		if (i == vcp->vcp_ncpus)
			return (ret);

		/* Some more threads to wait for, start over */
	}

	return (ret);
}

void *
event_thread(void *arg)
{
	uint8_t *donep = arg;
	intptr_t ret;

	ret = event_dispatch();

	mutex_lock(&threadmutex);
	*donep = 1;
	pthread_cond_signal(&threadcond);
	mutex_unlock(&threadmutex);

	return (void *)ret;
}

/*
 * vcpu_run_loop
 *
 * Runs a single VCPU until vmm(4) requires help handling an exit,
 * or the VM terminates.
 *
 * Parameters:
 *  arg: vcpu_run_params for the VCPU being run by this thread
 *
 * Return values:
 *  NULL: the VCPU shut down properly
 *  !NULL: error processing VCPU run, or the VCPU shut down abnormally
 */
void *
vcpu_run_loop(void *arg)
{
	struct vm_run_params *vrp = (struct vm_run_params *)arg;
	intptr_t ret = 0;
	uint32_t n;

	n = vrp->vrp_vcpu_id;

	for (;;) {
		ret = pthread_mutex_lock(&vcpu_run_mtx[n]);

		if (ret) {
			log_warnx("%s: can't lock vcpu run mtx (%d)",
			    __func__, (int)ret);
			return ((void *)ret);
		}

		/* If we are halted and need to pause, pause */
		if (vcpu_hlt[n] && (current_vm->vm_state & VM_STATE_PAUSED)) {
			ret = pthread_barrier_wait(&vm_pause_barrier);
			if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
				log_warnx("%s: could not wait on pause barrier (%d)",
				    __func__, (int)ret);
				return ((void *)ret);
			}

			ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]);
			if (ret) {
				log_warnx("%s: can't lock vcpu unpause mtx (%d)",
				    __func__, (int)ret);
				return ((void *)ret);
			}

			/* i8259 may be firing as we pause, release run mtx. */
			mutex_unlock(&vcpu_run_mtx[n]);
			ret = pthread_cond_wait(&vcpu_unpause_cond[n],
			    &vcpu_unpause_mtx[n]);
			if (ret) {
				log_warnx(
				    "%s: can't wait on unpause cond (%d)",
				    __func__, (int)ret);
				break;
			}
			mutex_lock(&vcpu_run_mtx[n]);

			ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]);
			if (ret) {
				log_warnx("%s: can't unlock unpause mtx (%d)",
				    __func__, (int)ret);
				break;
			}
		}

		/* If we are halted and not paused, wait */
		if (vcpu_hlt[n]) {
			ret = pthread_cond_wait(&vcpu_run_cond[n],
			    &vcpu_run_mtx[n]);

			if (ret) {
				log_warnx(
				    "%s: can't wait on cond (%d)",
				    __func__, (int)ret);
				(void)pthread_mutex_unlock(
				    &vcpu_run_mtx[n]);
				break;
			}
		}

		ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);

		if (ret) {
			log_warnx("%s: can't unlock mutex on cond (%d)",
			    __func__, (int)ret);
			break;
		}

		if (vrp->vrp_irqready && i8259_is_pending()) {
			vrp->vrp_inject.vie_vector = i8259_ack();
			vrp->vrp_inject.vie_type = VCPU_INJECT_INTR;
		} else
			vrp->vrp_inject.vie_type = VCPU_INJECT_NONE;

		/* Still more interrupts pending? */
		vrp->vrp_intr_pending = i8259_is_pending();
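
		/*
		 * vrp_intr_pending lets vmm(4) know another interrupt is
		 * queued, so it can arrange an exit (e.g. an interrupt
		 * window) that returns here promptly for the next
		 * injection.
		 */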

		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) {
			/* If run ioctl failed, exit */
			ret = errno;
			log_warn("%s: vm %d / vcpu %d run ioctl failed",
			    __func__, current_vm->vm_vmid, n);
			break;
		}

		/* If the VM is terminating, exit normally */
		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
			ret = (intptr_t)NULL;
			break;
		}

		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
			/*
			 * vmm(4) needs help handling an exit, handle in
			 * vcpu_exit.
			 */
			ret = vcpu_exit(vrp);
			if (ret)
				break;
		}
	}

	mutex_lock(&threadmutex);
	vcpu_done[n] = 1;
	pthread_cond_signal(&threadcond);
	mutex_unlock(&threadmutex);

	return ((void *)ret);
}

int
vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
{
	struct vm_intr_params vip;

	memset(&vip, 0, sizeof(vip));

	vip.vip_vm_id = vm_id;
	vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
	vip.vip_intr = intr;

	if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1)
		return (errno);

	return (0);
}

/*
 * vcpu_exit_pci
 *
 * Handle all I/O to the emulated PCI subsystem.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return value:
 *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
 *      be injected.
 */
uint8_t
vcpu_exit_pci(struct vm_run_params *vrp)
{
	struct vm_exit *vei = vrp->vrp_exit;
	uint8_t intr;

	intr = 0xFF;

	switch (vei->vei.vei_port) {
	case PCI_MODE1_ADDRESS_REG:
		pci_handle_address_reg(vrp);
		break;
	case PCI_MODE1_DATA_REG:
	case PCI_MODE1_DATA_REG + 1:
	case PCI_MODE1_DATA_REG + 2:
	case PCI_MODE1_DATA_REG + 3:
		pci_handle_data_reg(vrp);
		break;
	case VM_PCI_IO_BAR_BASE ... VM_PCI_IO_BAR_END:
		intr = pci_handle_io(vrp);
		break;
	default:
		log_warnx("%s: unknown PCI register 0x%llx",
		    __progname, (uint64_t)vei->vei.vei_port);
		break;
	}

	return (intr);
}

/*
 * vcpu_exit_inout
 *
 * Handle all I/O exits that need to be emulated in vmd. This includes the
 * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 */
void
vcpu_exit_inout(struct vm_run_params *vrp)
{
	struct vm_exit *vei = vrp->vrp_exit;
	uint8_t intr = 0xFF;

	if (vei->vei.vei_rep || vei->vei.vei_string) {
#ifdef MMIO_DEBUG
		log_info("%s: %s%s%s %d-byte, enc=%d, data=0x%08x, port=0x%04x",
		    __func__,
		    vei->vei.vei_rep == 0 ? "" : "REP ",
		    vei->vei.vei_dir == VEI_DIR_IN ? "IN" : "OUT",
		    vei->vei.vei_string == 0 ? "" : "S",
		    vei->vei.vei_size, vei->vei.vei_encoding,
		    vei->vei.vei_data, vei->vei.vei_port);
		log_info("%s: ECX = 0x%llx, RDX = 0x%llx, RSI = 0x%llx",
		    __func__,
		    vei->vrs.vrs_gprs[VCPU_REGS_RCX],
		    vei->vrs.vrs_gprs[VCPU_REGS_RDX],
		    vei->vrs.vrs_gprs[VCPU_REGS_RSI]);
#endif /* MMIO_DEBUG */
		fatalx("%s: can't emulate REP prefixed IN(S)/OUT(S)",
		    __func__);
	}

	if (ioports_map[vei->vei.vei_port] != NULL)
		intr = ioports_map[vei->vei.vei_port](vrp);
	else if (vei->vei.vei_dir == VEI_DIR_IN)
		set_return_data(vei, 0xFFFFFFFF);

	vei->vrs.vrs_gprs[VCPU_REGS_RIP] += vei->vei.vei_insn_len;

	if (intr != 0xFF)
		vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
}

/*
 * vcpu_exit_eptviolation
 *
 * Handle an EPT violation.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return values:
 *  0: no action required
 *  EFAULT: a protection fault occurred; kill the vm.
 */
int
vcpu_exit_eptviolation(struct vm_run_params *vrp)
{
	struct vm_exit *ve = vrp->vrp_exit;
	int ret = 0;
#if MMIO_NOTYET
	struct x86_insn insn;
	uint64_t va, pa;
	size_t len = 15;		/* Max instruction length in x86. */
#endif /* MMIO_NOTYET */

	switch (ve->vee.vee_fault_type) {
	case VEE_FAULT_HANDLED:
		log_debug("%s: fault already handled", __func__);
		break;

#if MMIO_NOTYET
	case VEE_FAULT_MMIO_ASSIST:
		/* Intel VMX might give us the length of the instruction. */
		if (ve->vee.vee_insn_info & VEE_LEN_VALID)
			len = ve->vee.vee_insn_len;

		if (len > 15)
			fatalx("%s: invalid instruction length %lu", __func__,
			    len);

		/* If we weren't given instruction bytes, we need to fetch. */
		if (!(ve->vee.vee_insn_info & VEE_BYTES_VALID)) {
			memset(ve->vee.vee_insn_bytes, 0,
			    sizeof(ve->vee.vee_insn_bytes));
			va = ve->vrs.vrs_gprs[VCPU_REGS_RIP];

			/* XXX Only support instructions that fit on 1 page. */
			if ((va & PAGE_MASK) + len > PAGE_SIZE) {
				log_warnx("%s: instruction might cross page "
				    "boundary", __func__);
				ret = EINVAL;
				break;
			}

			ret = translate_gva(ve, va, &pa, PROT_EXEC);
			if (ret != 0) {
				log_warnx("%s: failed gva translation",
				    __func__);
				break;
			}

			ret = read_mem(pa, ve->vee.vee_insn_bytes, len);
			if (ret != 0) {
				log_warnx("%s: failed to fetch instruction "
				    "bytes from 0x%llx", __func__, pa);
				break;
			}
		}

		ret = insn_decode(ve, &insn);
		if (ret == 0)
			ret = insn_emulate(ve, &insn);
		break;
#endif /* MMIO_NOTYET */

	case VEE_FAULT_PROTECT:
		log_debug("%s: EPT Violation: rip=0x%llx", __progname,
		    ve->vrs.vrs_gprs[VCPU_REGS_RIP]);
		ret = EFAULT;
		break;

	default:
		fatalx("%s: invalid fault_type %d", __progname,
		    ve->vee.vee_fault_type);
		/* UNREACHED */
	}

	return (ret);
}

/*
 * vcpu_exit
 *
 * Handle a vcpu exit. This function is called when it is determined that
 * vmm(4) requires the assistance of vmd to support a particular guest
 * exit type (e.g., accessing an I/O port or device). Guest state is
 * contained in 'vrp', and will be resent to vmm(4) on exit completion.
 *
 * Upon conclusion of handling the exit, the function determines if any
 * interrupts should be injected into the guest, and asserts the proper
 * IRQ line whose interrupt should be vectored.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return values:
 *  0: the exit was handled successfully
 *  1: an error occurred (e.g., unknown exit reason passed in 'vrp')
 */
1863 int
1864 vcpu_exit(struct vm_run_params *vrp)
1865 {
1866 	int ret;
1867 
1868 	switch (vrp->vrp_exit_reason) {
1869 	case VMX_EXIT_INT_WINDOW:
1870 	case SVM_VMEXIT_VINTR:
1871 	case VMX_EXIT_CPUID:
1872 	case VMX_EXIT_EXTINT:
1873 	case SVM_VMEXIT_INTR:
1874 	case SVM_VMEXIT_MSR:
1875 	case SVM_VMEXIT_CPUID:
1876 		/*
1877 		 * We may be exiting to vmd to handle a pending interrupt but
1878 		 * at the same time the last exit type may have been one of
1879 		 * these. In this case, there's nothing extra to be done
1880 		 * here (and falling through to the default case below results
1881 		 * in more vmd log spam).
1882 		 */
1883 		break;
1884 	case SVM_VMEXIT_NPF:
1885 	case VMX_EXIT_EPT_VIOLATION:
1886 		ret = vcpu_exit_eptviolation(vrp);
1887 		if (ret)
1888 			return (ret);
1889 		break;
1890 	case VMX_EXIT_IO:
1891 	case SVM_VMEXIT_IOIO:
1892 		vcpu_exit_inout(vrp);
1893 		break;
1894 	case VMX_EXIT_HLT:
1895 	case SVM_VMEXIT_HLT:
1896 		ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1897 		if (ret) {
1898 			log_warnx("%s: can't lock vcpu mutex (%d)",
1899 			    __func__, ret);
1900 			return (ret);
1901 		}
1902 		vcpu_hlt[vrp->vrp_vcpu_id] = 1;
1903 		ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
1904 		if (ret) {
1905 			log_warnx("%s: can't unlock vcpu mutex (%d)",
1906 			    __func__, ret);
1907 			return (ret);
1908 		}
1909 		break;
1910 	case VMX_EXIT_TRIPLE_FAULT:
1911 	case SVM_VMEXIT_SHUTDOWN:
1912 		/* reset VM */
1913 		return (EAGAIN);
1914 	default:
1915 		log_debug("%s: unknown exit reason 0x%x",
1916 		    __progname, vrp->vrp_exit_reason);
1917 	}
1918 
1919 	return (0);
1920 }

/*
 * find_gpa_range
 *
 * Search for a contiguous guest physical mem range.
 *
 * Parameters:
 *  vcp: VM create parameters that contain the memory map to search in
 *  gpa: the starting guest physical address
 *  len: the length of the memory range
 *
 * Return values:
 *  NULL: if there is no contiguous guest physical memory range covering
 *      [gpa, gpa + len)
 *  Pointer to the vm_mem_range containing the start of the range otherwise.
 */
static struct vm_mem_range *
find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
{
	size_t i, n;
	struct vm_mem_range *vmr, *first;

	/* Find the first vm_mem_range that contains gpa */
	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (gpa < vmr->vmr_gpa + vmr->vmr_size)
			break;
	}

	/* No range found. */
	if (i == vcp->vcp_nmemranges)
		return (NULL);

	/*
	 * vmr may cover the range [gpa, gpa + len) only partly. Make
	 * sure that the following vm_mem_ranges are contiguous and
	 * cover the rest. Remember the first range: callers expect the
	 * range containing the start of the region, not the last one
	 * examined.
	 */
	first = vmr;
	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
	if (len < n)
		len = 0;
	else
		len -= n;
	gpa = vmr->vmr_gpa + vmr->vmr_size;
	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (gpa != vmr->vmr_gpa)
			return (NULL);
		if (len <= vmr->vmr_size)
			len = 0;
		else
			len -= vmr->vmr_size;

		gpa = vmr->vmr_gpa + vmr->vmr_size;
	}

	if (len != 0)
		return (NULL);

	return (first);
}
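
/*
 * Example (illustrative, hypothetical addresses): given two contiguous
 * ranges A = [0x0, 0x10000) and B = [0x10000, 0x20000),
 * find_gpa_range(vcp, 0xf000, 0x2000) succeeds and returns the
 * vm_mem_range for A, because B picks up exactly where A ends. If B
 * instead started at 0x11000, the walk above would detect the hole and
 * return NULL.
 */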

/*
 * write_mem
 *
 * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
 *
 * Parameters:
 *  dst: the destination paddr_t in the guest VM
 *  buf: data to copy (or NULL to zero the data)
 *  len: number of bytes to copy
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [dst, dst + len) does not
 *      exist in the guest.
 */
int
write_mem(paddr_t dst, const void *buf, size_t len)
{
	const char *from = buf;
	char *to;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
		    "len = 0x%zx", __func__, dst, len);
		return (EINVAL);
	}

	off = dst - vmr->vmr_gpa;
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		to = (char *)vmr->vmr_va + off;
		if (buf == NULL)
			memset(to, 0, n);
		else {
			memcpy(to, from, n);
			from += n;
		}
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}
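
/*
 * Usage sketch (not part of the original source, addresses made up):
 * the NULL-buffer convention above zeroes guest memory, and read_mem()
 * below is the mirror image, so a round trip looks like:
 *
 *	uint32_t tmp = 0xdeadbeef;
 *
 *	write_mem(0x1000, NULL, PAGE_SIZE);
 *	write_mem(0x1000, &tmp, sizeof(tmp));
 *	read_mem(0x1000, &tmp, sizeof(tmp));
 *
 * Each call returns 0 on success or EINVAL if the guest physical range
 * is not backed by guest memory.
 */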

/*
 * read_mem
 *
 * Reads memory at guest paddr 'src' into 'buf'.
 *
 * Parameters:
 *  src: the source paddr_t in the guest VM to read from.
 *  buf: destination (local) buffer
 *  len: number of bytes to read
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [src, src + len) does not
 *      exist in the guest.
 */
int
read_mem(paddr_t src, void *buf, size_t len)
{
	char *from, *to = buf;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range src = 0x%lx, "
		    "len = 0x%zx", __func__, src, len);
		return (EINVAL);
	}

	off = src - vmr->vmr_gpa;
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		from = (char *)vmr->vmr_va + off;
		memcpy(to, from, n);

		to += n;
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}

/*
 * hvaddr_mem
 *
 * Translate a guest physical address to a host virtual address, checking the
 * provided memory range length to confirm it's contiguous within the same
 * guest memory range (vm_mem_range).
 *
 * Parameters:
 *  gpa: guest physical address to translate
 *  len: number of bytes in the intended range
 *
 * Return values:
 *  void* to host virtual memory on success
 *  NULL on error, setting errno to:
 *    EFAULT: gpa falls outside guest memory ranges
 *    EINVAL: requested len extends beyond memory range
 */
void *
hvaddr_mem(paddr_t gpa, size_t len)
{
	struct vm_mem_range *vmr;
	size_t off;

	vmr = find_gpa_range(&current_vm->vm_params.vmc_params, gpa, len);
	if (vmr == NULL) {
		log_warnx("%s: failed - invalid gpa: 0x%lx", __func__, gpa);
		errno = EFAULT;
		return (NULL);
	}

	off = gpa - vmr->vmr_gpa;
	if (len > (vmr->vmr_size - off)) {
		log_warnx("%s: failed - invalid memory range: gpa=0x%lx, "
		    "len=%zu", __func__, gpa, len);
		errno = EINVAL;
		return (NULL);
	}

	return ((char *)vmr->vmr_va + off);
}

/*
 * vcpu_assert_pic_irq
 *
 * Injects the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to inject to
 *  vcpu_id: VCPU ID to inject to
 *  irq: IRQ to inject
 */
void
vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	int ret;

	i8259_assert_irq(irq);

	if (i8259_is_pending()) {
		if (vcpu_pic_intr(vm_id, vcpu_id, 1))
			fatalx("%s: can't assert INTR", __func__);
		mutex_lock(&vcpu_run_mtx[vcpu_id]);
		vcpu_hlt[vcpu_id] = 0;
		ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
		if (ret)
			fatalx("%s: can't signal (%d)", __func__, ret);
		mutex_unlock(&vcpu_run_mtx[vcpu_id]);
	}
}

/*
 * vcpu_deassert_pic_irq
 *
 * Clears the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to clear in
 *  vcpu_id: VCPU ID to clear in
 *  irq: IRQ to clear
 */
void
vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	i8259_deassert_irq(irq);

	if (!i8259_is_pending()) {
		if (vcpu_pic_intr(vm_id, vcpu_id, 0))
			fatalx("%s: can't deassert INTR for vm_id %d, "
			    "vcpu_id %d", __func__, vm_id, vcpu_id);
	}
}

/*
 * fd_hasdata
 *
 * Determines if data can be read from a file descriptor.
 *
 * Parameters:
 *  fd: the fd to check
 *
 * Return values:
 *  1 if data can be read from an fd, or 0 otherwise.
 */
int
fd_hasdata(int fd)
{
	struct pollfd pfd[1];
	int nready, hasdata = 0;

	pfd[0].fd = fd;
	pfd[0].events = POLLIN;
	nready = poll(pfd, 1, 0);
	if (nready == -1)
		log_warn("checking file descriptor for data failed");
	else if (nready == 1 && pfd[0].revents & POLLIN)
		hasdata = 1;
	return (hasdata);
}

/*
 * mutex_lock
 *
 * Wrapper function for pthread_mutex_lock that does error checking and
 * exits on failure
 */
void
mutex_lock(pthread_mutex_t *m)
{
	int ret;

	ret = pthread_mutex_lock(m);
	if (ret) {
		errno = ret;
		fatal("could not acquire mutex");
	}
}

/*
 * mutex_unlock
 *
 * Wrapper function for pthread_mutex_unlock that does error checking and
 * exits on failure
 */
void
mutex_unlock(pthread_mutex_t *m)
{
	int ret;

	ret = pthread_mutex_unlock(m);
	if (ret) {
		errno = ret;
		fatal("could not release mutex");
	}
}

/*
 * set_return_data
 *
 * Utility function for manipulating register data in vm exit info
 * structs. This function ensures that the data is copied to the
 * vei->vei.vei_data field with the proper size for the operation being
 * performed.
 *
 * Parameters:
 *  vei: exit information
 *  data: return data
 */
void
set_return_data(struct vm_exit *vei, uint32_t data)
{
	switch (vei->vei.vei_size) {
	case 1:
		vei->vei.vei_data &= ~0xFF;
		vei->vei.vei_data |= (uint8_t)data;
		break;
	case 2:
		vei->vei.vei_data &= ~0xFFFF;
		vei->vei.vei_data |= (uint16_t)data;
		break;
	case 4:
		vei->vei.vei_data = data;
		break;
	}
}

/*
 * get_input_data
 *
 * Utility function for manipulating register data in vm exit info
 * structs. This function ensures that the data is copied from the
 * vei->vei.vei_data field with the proper size for the operation being
 * performed.
 *
 * Parameters:
 *  vei: exit information
 *  data: location to store the result
 */
void
get_input_data(struct vm_exit *vei, uint32_t *data)
{
	switch (vei->vei.vei_size) {
	case 1:
		*data &= 0xFFFFFF00;
		*data |= (uint8_t)vei->vei.vei_data;
		break;
	case 2:
		*data &= 0xFFFF0000;
		*data |= (uint16_t)vei->vei.vei_data;
		break;
	case 4:
		*data = vei->vei.vei_data;
		break;
	default:
		log_warnx("%s: invalid i/o size %d", __func__,
		    vei->vei.vei_size);
	}
}
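
/*
 * Illustrative sketch (hypothetical device code, not part of the
 * original source): inside an I/O port handler, these two helpers keep
 * the operand size of the guest's IN/OUT instruction intact, so a
 * one-byte access only touches the low byte. 'reg_val' stands in for a
 * device register:
 *
 *	uint32_t data = 0;
 *
 *	if (vei->vei.vei_dir == VEI_DIR_OUT)
 *		get_input_data(vei, &data);
 *	else
 *		set_return_data(vei, reg_val);
 */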

/*
 * translate_gva
 *
 * Translates a guest virtual address to a guest physical address by walking
 * the currently active page table (if needed).
 *
 * XXX ensure translate_gva updates the A bit in the PTE
 * XXX ensure translate_gva respects segment base and limits in i386 mode
 * XXX ensure translate_gva respects segment wraparound in i8086 mode
 * XXX ensure translate_gva updates the A bit in the segment selector
 * XXX ensure translate_gva respects CR4.LMSLE if available
 *
 * Parameters:
 *  exit: The VCPU this translation should be performed for (guest MMU settings
 *   are gathered from this VCPU)
 *  va: virtual address to translate
 *  pa: pointer to paddr_t variable that will receive the translated physical
 *   address. 'pa' is unchanged on error.
 *  mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
 *   the address should be translated
 *
 * Return values:
 *  0: the address was successfully translated - 'pa' contains the physical
 *     address currently mapped by 'va'.
 *  EFAULT: the PTE for 'va' is unmapped. A #PF will be injected in this case
 *     and %cr2 set in the vcpu structure.
 *  EPERM: the requested access is not permitted by the PTE's protection
 *     bits at the current privilege level.
 *  EIO: the updated PTE could not be written back to guest memory.
 *  EINVAL: an error occurred reading paging table structures
 */
int
translate_gva(struct vm_exit *exit, uint64_t va, uint64_t *pa, int mode)
{
	int level, shift, pdidx;
	uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
	uint64_t shift_width, pte_size;
	struct vcpu_reg_state *vrs;

	vrs = &exit->vrs;

	if (!pa)
		return (EINVAL);

	if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
		log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
		*pa = va;
		return (0);
	}

	pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];

	log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
	    vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);

	if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
		if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
			pte_size = sizeof(uint64_t);
			shift_width = 9;

			if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
				/* 4 level paging */
				level = 4;
				mask = L4_MASK;
				shift = L4_SHIFT;
			} else {
				/* 32 bit with PAE paging */
				level = 3;
				mask = L3_MASK;
				shift = L3_SHIFT;
			}
		} else {
			/* 32 bit paging */
			level = 2;
			shift_width = 10;
			mask = 0xFFC00000;
			shift = 22;
			pte_size = sizeof(uint32_t);
		}
	} else
		return (EINVAL);

	/* XXX: Check for R bit in segment selector and set A bit */

	for (; level > 0; level--) {
		pdidx = (va & mask) >> shift;
		pte_paddr = pt_paddr + (pdidx * pte_size);

		log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
		    level, pte_paddr);
		if (read_mem(pte_paddr, &pte, pte_size)) {
			log_warn("%s: failed to read pte", __func__);
			return (EFAULT);
		}

		log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
		    pte);

		/* XXX: Set CR2 */
		if (!(pte & PG_V))
			return (EFAULT);

		/* XXX: Check for SMAP */
		if ((mode == PROT_WRITE) && !(pte & PG_RW))
			return (EPERM);

		if ((exit->cpl > 0) && !(pte & PG_u))
			return (EPERM);

		pte = pte | PG_U;
		if (mode == PROT_WRITE)
			pte = pte | PG_M;
		if (write_mem(pte_paddr, &pte, pte_size)) {
			log_warn("%s: failed to write back flags to pte",
			    __func__);
			return (EIO);
		}

		/* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */
		if (pte & PG_PS)
			break;

		if (level > 1) {
			pt_paddr = pte & PG_FRAME;
			shift -= shift_width;
			mask = mask >> shift_width;
		}
	}

	low_mask = ((uint64_t)1 << shift) - 1;
	high_mask = (((uint64_t)1 << ((pte_size * 8) - 1)) - 1) ^ low_mask;
	*pa = (pte & high_mask) | (va & low_mask);

	log_debug("%s: final GPA for GVA 0x%llx = 0x%llx", __func__, va, *pa);

	return (0);
}
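
/*
 * Usage sketch (illustrative, mirroring the MMIO_NOTYET fetch path in
 * vcpu_exit_eptviolation() above): translate the guest's instruction
 * pointer and read the instruction bytes behind it. 've' is the
 * struct vm_exit for the current exit:
 *
 *	uint64_t va, pa;
 *	uint8_t bytes[15];
 *
 *	va = ve->vrs.vrs_gprs[VCPU_REGS_RIP];
 *	if (translate_gva(ve, va, &pa, PROT_EXEC) == 0)
 *		read_mem(pa, bytes, sizeof(bytes));
 */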

void
vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *))
{
	vm_pipe_init2(p, cb, NULL);
}

/*
 * vm_pipe_init2
 *
 * Initialize a vm_dev_pipe, setting up its file descriptors and its
 * event structure with the given callback and argument.
 *
 * Parameters:
 *  p: pointer to vm_dev_pipe struct to initialize
 *  cb: callback to use for READ events on the read end of the pipe
 *  arg: pointer to pass to the callback on event trigger
 */
void
vm_pipe_init2(struct vm_dev_pipe *p, void (*cb)(int, short, void *), void *arg)
{
	int ret;
	int fds[2];

	memset(p, 0, sizeof(struct vm_dev_pipe));

	ret = pipe2(fds, O_CLOEXEC);
	if (ret)
		fatal("failed to create vm_dev_pipe pipe");

	p->read = fds[0];
	p->write = fds[1];

	event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, arg);
}

/*
 * vm_pipe_send
 *
 * Send a message to an emulated device via the provided vm_dev_pipe. This
 * relies on the fact that sizeof(msg) < PIPE_BUF to ensure atomic writes.
 *
 * Parameters:
 *  p: pointer to initialized vm_dev_pipe
 *  msg: message to send in the channel
 */
void
vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg)
{
	ssize_t n;

	n = write(p->write, &msg, sizeof(msg));
	if (n != (ssize_t)sizeof(msg))
		fatal("failed to write to device pipe");
}

/*
 * vm_pipe_recv
 *
 * Receive a message for an emulated device via the provided vm_dev_pipe.
 * Returns the message value, otherwise will exit on failure. This relies on
 * the fact that sizeof(enum pipe_msg_type) < PIPE_BUF for atomic reads.
 *
 * Parameters:
 *  p: pointer to initialized vm_dev_pipe
 *
 * Return values:
 *  a value of enum pipe_msg_type or fatal exit on read(2) error
 */
enum pipe_msg_type
vm_pipe_recv(struct vm_dev_pipe *p)
{
	ssize_t n;
	enum pipe_msg_type msg;

	n = read(p->read, &msg, sizeof(msg));
	if (n != (ssize_t)sizeof(msg))
		fatal("failed to read from device pipe");

	return (msg);
}
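
/*
 * Usage sketch (illustrative; the callback and message constant are
 * hypothetical): a device wires the read end of the pipe into its
 * libevent loop, and another thread posts messages to it:
 *
 *	struct vm_dev_pipe pipe;
 *
 *	vm_pipe_init2(&pipe, dev_dispatch_cb, &dev);
 *	event_add(&pipe.read_ev, NULL);
 *
 *	(from the vcpu or vm thread)
 *	vm_pipe_send(&pipe, MSG_TYPE_PAUSE);
 *
 *	(inside dev_dispatch_cb)
 *	enum pipe_msg_type msg = vm_pipe_recv(&pipe);
 */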

/*
 * Re-map the guest address space using vmm(4)'s VMM_IOC_SHAREMEM.
 *
 * Returns 0 on success, non-zero on failure.
 */
int
remap_guest_mem(struct vmd_vm *vm, int vmm_fd)
{
	struct vm_create_params	*vcp;
	struct vm_mem_range	*vmr;
	struct vm_sharemem_params vsp;
	size_t			 i, j;
	void			*p = NULL;
	int			 ret;

	if (vm == NULL)
		return (1);

	vcp = &vm->vm_params.vmc_params;

	/*
	 * Initialize our VM shared memory request using our original
	 * creation parameters. We'll overwrite the va's after mmap(2).
	 */
	memset(&vsp, 0, sizeof(vsp));
	vsp.vsp_nmemranges = vcp->vcp_nmemranges;
	vsp.vsp_vm_id = vcp->vcp_id;
	memcpy(&vsp.vsp_memranges, &vcp->vcp_memranges,
	    sizeof(vsp.vsp_memranges));

	/*
	 * Use mmap(2) to identify virtual address space for our mappings.
	 */
	for (i = 0; i < vsp.vsp_nmemranges; i++) {
		vmr = &vsp.vsp_memranges[i];

		/* Ignore any MMIO ranges. */
		if (vmr->vmr_type == VM_MEM_MMIO) {
			vmr->vmr_va = 0;
			vcp->vcp_memranges[i].vmr_va = 0;
			continue;
		}

		/* Make initial mappings for the memrange. */
		p = mmap(NULL, vmr->vmr_size, PROT_READ, MAP_ANON, -1, 0);
		if (p == MAP_FAILED) {
			ret = errno;
			log_warn("%s: mmap", __func__);
			for (j = 0; j < i; j++) {
				vmr = &vcp->vcp_memranges[j];
				munmap((void *)vmr->vmr_va, vmr->vmr_size);
			}
			return (ret);
		}
		vmr->vmr_va = (vaddr_t)p;
		vcp->vcp_memranges[i].vmr_va = vmr->vmr_va;
	}

	/*
	 * munmap(2) now that we have va's and ranges that don't overlap. vmm
	 * will use the va's and sizes to recreate the mappings for us.
	 */
	for (i = 0; i < vsp.vsp_nmemranges; i++) {
		vmr = &vsp.vsp_memranges[i];
		if (vmr->vmr_type == VM_MEM_MMIO)
			continue;
		if (munmap((void *)vmr->vmr_va, vmr->vmr_size) == -1)
			fatal("%s: munmap", __func__);
	}

	/*
	 * Ask vmm to enter the shared mappings for us. They'll point
	 * to the same host physical memory, but will have a randomized
	 * virtual address for the calling process.
	 */
	if (ioctl(vmm_fd, VMM_IOC_SHAREMEM, &vsp) == -1)
		return (errno);

	return (0);
}
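
/*
 * Usage sketch (illustrative): remap_guest_mem() is meant for a vm
 * process that must reattach to guest memory vmm(4) already holds, for
 * example after being handed a running VM over a socket. Assuming
 * env->vmd_fd is the open /dev/vmm handle:
 *
 *	if ((ret = remap_guest_mem(vm, env->vmd_fd)) != 0)
 *		fatalx("failed to remap guest memory");
 */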