/*	$OpenBSD: vmm.c,v 1.41 2016/09/01 17:09:33 mlarkin Exp $	*/

/*
 * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>	/* nitems */
#include <sys/ioctl.h>
#include <sys/queue.h>
#include <sys/wait.h>
#include <sys/uio.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/mman.h>

#include <dev/ic/i8253reg.h>
#include <dev/isa/isareg.h>
#include <dev/pci/pcireg.h>

#include <machine/param.h>
#include <machine/psl.h>
#include <machine/specialreg.h>
#include <machine/vmmvar.h>

#include <errno.h>
#include <event.h>
#include <fcntl.h>
#include <imsg.h>
#include <limits.h>
#include <poll.h>
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <util.h>

#include "vmd.h"
#include "vmm.h"
#include "loadfile.h"
#include "pci.h"
#include "virtio.h"
#include "proc.h"
#include "i8253.h"
#include "i8259.h"
#include "ns8250.h"
#include "mc146818.h"

io_fn_t ioports_map[MAX_PORTS];

void vmm_sighdlr(int, short, void *);
int start_client_vmd(void);
int opentap(void);
int start_vm(struct imsg *, uint32_t *);
int terminate_vm(struct vm_terminate_params *);
int get_info_vm(struct privsep *, struct imsg *, int);
int run_vm(int *, int *, struct vm_create_params *, struct vcpu_reg_state *);
void *vcpu_run_loop(void *);
int vcpu_exit(struct vm_run_params *);
int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
void create_memory_map(struct vm_create_params *);
int alloc_guest_mem(struct vm_create_params *);
int vmm_create_vm(struct vm_create_params *);
void init_emulated_hw(struct vm_create_params *, int *, int *);
void vcpu_exit_inout(struct vm_run_params *);
uint8_t vcpu_exit_pci(struct vm_run_params *);
int vmm_dispatch_parent(int, struct privsep_proc *, struct imsg *);
void vmm_run(struct privsep *, struct privsep_proc *, void *);
int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);

static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
    size_t);

int con_fd;
struct vmd_vm *current_vm;

extern struct vmd *env;

extern char *__progname;

pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];

static struct privsep_proc procs[] = {
	{ "parent",	PROC_PARENT,	vmm_dispatch_parent  },
};

/*
 * Represents a standard register set for an OS to be booted
 * in a flat 32-bit address space, before paging is enabled.
 *
 * NOT set here are:
 *  RIP
 *  RSP
 *  GDTR BASE
 *
 * Specific bootloaders should clone this structure and override
 * those fields as needed.
 *
 * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
 *        features of the CPU in use.
 */
static const struct vcpu_reg_state vcpu_init_flat32 = {
	.vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
	.vrs_gprs[VCPU_REGS_RIP] = 0x0,
	.vrs_gprs[VCPU_REGS_RSP] = 0x0,
	.vrs_crs[VCPU_REGS_CR0] = CR0_CD | CR0_NW | CR0_ET | CR0_PE | CR0_PG,
	.vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
	.vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
	.vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
	.vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
	.vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
	.vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
};
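
/*
 * The segment initializers above are { selector, limit, attributes, base }
 * tuples: 0xC09F and 0xC093 describe present, ring 0, 32-bit code and data
 * segments with 4KB granularity, while 0x0082 and 0x008B mark the LDT and
 * a busy TSS, respectively.
 */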

pid_t
vmm(struct privsep *ps, struct privsep_proc *p)
{
	return (proc_run(ps, p, procs, nitems(procs), vmm_run, NULL));
}

void
vmm_run(struct privsep *ps, struct privsep_proc *p, void *arg)
{
	if (config_init(ps->ps_env) == -1)
		fatal("failed to initialize configuration");

	signal_del(&ps->ps_evsigchld);
	signal_set(&ps->ps_evsigchld, SIGCHLD, vmm_sighdlr, ps);
	signal_add(&ps->ps_evsigchld, NULL);

#if 0
	/*
	 * pledge in the vmm process:
	 * stdio - for malloc and basic I/O including events.
	 * vmm - for the vmm ioctls and operations.
	 * proc - for forking and maintaining vms.
	 * recvfd - for disks, interfaces and other fds.
	 */
	/* XXX'ed pledge to hide it from grep as long as it's disabled */
	if (XXX("stdio vmm recvfd proc", NULL) == -1)
		fatal("pledge");
#endif

	/* Get and terminate all running VMs */
	get_info_vm(ps, NULL, 1);
}

int
vmm_dispatch_parent(int fd, struct privsep_proc *p, struct imsg *imsg)
{
	struct privsep		*ps = p->p_ps;
	int			 res = 0, cmd = 0;
	struct vm_create_params	 vcp;
	struct vm_terminate_params vtp;
	struct vmop_result	 vmr;
	uint32_t		 id = 0;
	struct vmd_vm		*vm;

	switch (imsg->hdr.type) {
	case IMSG_VMDOP_START_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vcp);
		memcpy(&vcp, imsg->data, sizeof(vcp));
		res = config_getvm(ps, &vcp, imsg->fd, imsg->hdr.peerid);
		if (res == -1) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		break;
	case IMSG_VMDOP_START_VM_DISK:
		res = config_getdisk(ps, imsg);
		if (res == -1) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		break;
	case IMSG_VMDOP_START_VM_IF:
		res = config_getif(ps, imsg);
		if (res == -1) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		break;
	case IMSG_VMDOP_START_VM_END:
		res = start_vm(imsg, &id);
		cmd = IMSG_VMDOP_START_VM_RESPONSE;
		break;
	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vtp);
		memcpy(&vtp, imsg->data, sizeof(vtp));
		id = vtp.vtp_vm_id;
		res = terminate_vm(&vtp);
		cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
		if (res == 0) {
			/* Remove local reference */
			vm = vm_getbyid(id);
			vm_remove(vm);
		}
		break;
	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
		res = get_info_vm(ps, imsg, 0);
		cmd = IMSG_VMDOP_GET_INFO_VM_END_DATA;
		break;
	case IMSG_CTL_RESET:
		config_getreset(env, imsg);
		break;
	default:
		return (-1);
	}

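	/* Send a response back to the parent if one was requested above */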
	switch (cmd) {
	case 0:
		break;
	case IMSG_VMDOP_START_VM_RESPONSE:
		if (res != 0) {
			vm = vm_getbyvmid(imsg->hdr.peerid);
			vm_remove(vm);
		}
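		/* FALLTHROUGH: both responses reply with a vmop_result */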
	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
		memset(&vmr, 0, sizeof(vmr));
		vmr.vmr_result = res;
		vmr.vmr_id = id;
		if (proc_compose_imsg(ps, PROC_PARENT, -1, cmd,
		    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
			return (-1);
		break;
	default:
		if (proc_compose_imsg(ps, PROC_PARENT, -1, cmd,
		    imsg->hdr.peerid, -1, &res, sizeof(res)) == -1)
			return (-1);
		break;
	}

	return (0);
}

void
vmm_sighdlr(int sig, short event, void *arg)
{
	struct privsep *ps = arg;
	int status;
	uint32_t vmid;
	pid_t pid;
	struct vmop_result vmr;
	struct vmd_vm *vm;
	struct vm_terminate_params vtp;

	switch (sig) {
	case SIGCHLD:
		do {
			pid = waitpid(-1, &status, WNOHANG);
			if (pid <= 0)
				continue;

			if (WIFEXITED(status) || WIFSIGNALED(status)) {
				vm = vm_getbypid(pid);
				if (vm == NULL) {
					/*
					 * If the VM is gone already, it
					 * got terminated via an
					 * IMSG_VMDOP_TERMINATE_VM_REQUEST.
					 */
					continue;
				}

				vmid = vm->vm_params.vcp_id;
				vtp.vtp_vm_id = vmid;
				if (terminate_vm(&vtp) == 0) {
					memset(&vmr, 0, sizeof(vmr));
					vmr.vmr_result = 0;
					vmr.vmr_id = vmid;
					vm_remove(vm);
					if (proc_compose_imsg(ps, PROC_PARENT,
					    -1, IMSG_VMDOP_TERMINATE_VM_EVENT,
					    0, -1, &vmr, sizeof(vmr)) == -1)
						log_warnx("could not signal "
						    "termination of VM %u to "
						    "parent", vmid);
				} else
					log_warnx("could not terminate VM %u",
					    vmid);
			} else
				fatalx("unexpected cause of SIGCHLD");
		} while (pid > 0 || (pid == -1 && errno == EINTR));
		break;
	default:
		fatalx("unexpected signal");
	}
}

/*
 * vcpu_reset
 *
 * Requests vmm(4) to reset the VCPUs in the indicated VM to
 * the register state provided
 *
 * Parameters
 *  vmid: VM ID to reset
 *  vcpu_id: VCPU ID to reset
 *  vrs: the register state to initialize
 *
 * Return values:
 *  0: success
 *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
 *      valid)
 */
int
vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
{
	struct vm_resetcpu_params vrp;

	memset(&vrp, 0, sizeof(vrp));
	vrp.vrp_vm_id = vmid;
	vrp.vrp_vcpu_id = vcpu_id;
	memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));

	log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);

	if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) < 0)
		return (errno);

	return (0);
}

/*
 * terminate_vm
 *
 * Requests vmm(4) to terminate the VM whose ID is provided in the
 * supplied vm_terminate_params structure (vtp->vtp_vm_id)
 *
 * Parameters
 *  vtp: vm_terminate_params struct containing the ID of the VM to terminate
 *
 * Return values:
 *  0: success
 *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM is not
 *      valid)
 */
int
terminate_vm(struct vm_terminate_params *vtp)
{
	if (ioctl(env->vmd_fd, VMM_IOC_TERM, vtp) < 0)
		return (errno);

	return (0);
}

/*
 * opentap
 *
 * Opens the next available tap device, up to MAX_TAP.
 *
 * Returns a file descriptor to the tap node opened, or -1 if no tap
 * devices were available.
 */
int
opentap(void)
{
	int i, fd;
	char path[PATH_MAX];

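	/*
	 * Opening a tap(4) node that is already in use fails (e.g.
	 * with EBUSY), so the first successful open yields a free device.
	 */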
	for (i = 0; i < MAX_TAP; i++) {
		snprintf(path, PATH_MAX, "/dev/tap%d", i);
		fd = open(path, O_RDWR | O_NONBLOCK);
		if (fd != -1)
			return (fd);
	}

	return (-1);
}

/*
 * start_vm
 *
 * Starts a new VM with the creation parameters received earlier (the
 * disk, interface and kernel file descriptors have already been attached
 * to the VM by the preceding IMSG_VMDOP_START_VM_* messages). This
 * function completes the creation of the VM as follows:
 *
 * 1. looks up the VM registered for the imsg's peer id
 * 2. takes the VM's console tty from the imsg
 * 3. forks a child vmd; the child creates the VM in vmm(4), allocates
 *     guest memory, loads the kernel and runs the VM's VCPU loops,
 *     while the parent closes its copies of the file descriptors and
 *     reads back the kernel-assigned VM id
 *
 * Parameters:
 *  imsg: The incoming imsg signalling the end of the VM's creation
 *      parameters, carrying the console fd.
 *  id: Returns the VM id as reported by the kernel.
 *
 * Return values:
 *  0: success
 *  !0 : failure - typically an errno indicating the source of the failure
 */
int
start_vm(struct imsg *imsg, uint32_t *id)
{
	struct vm_create_params	*vcp;
	struct vmd_vm		*vm;
	size_t			 i;
	int			 ret = EINVAL;
	int			 fds[2];
	struct vcpu_reg_state vrs;

	if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
		log_warnx("%s: can't find vm", __func__);
		ret = ENOENT;
		goto err;
	}
	vcp = &vm->vm_params;

	if ((vm->vm_tty = imsg->fd) == -1) {
		log_warnx("%s: can't get tty", __func__);
		goto err;
	}

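	/*
	 * The socketpair is only used by the child to report the
	 * kernel-assigned VM id back to the parent.
	 */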
	if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, fds) == -1)
		fatal("socketpair");

	/* Fork the child vmd for this VM */
	ret = start_client_vmd();

	/* Start child failed? - clean up and leave */
	if (ret == -1) {
		log_warnx("%s: start child failed", __func__);
		ret = EIO;
		goto err;
	}

	if (ret > 0) {
		/* Parent */
		vm->vm_pid = ret;

		for (i = 0; i < vcp->vcp_ndisks; i++) {
			close(vm->vm_disks[i]);
			vm->vm_disks[i] = -1;
		}

		for (i = 0; i < vcp->vcp_nnics; i++) {
			close(vm->vm_ifs[i]);
			vm->vm_ifs[i] = -1;
		}

		close(vm->vm_kernel);
		vm->vm_kernel = -1;

		close(vm->vm_tty);
		vm->vm_tty = -1;

		/* read back the kernel-generated vm id from the child */
		close(fds[1]);
		if (read(fds[0], &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
		    sizeof(vcp->vcp_id))
			fatal("read vcp id");
		close(fds[0]);

		if (vcp->vcp_id == 0)
			goto err;

		*id = vcp->vcp_id;

		return (0);
	} else {
		/* Child */
		setproctitle("%s", vcp->vcp_name);
		log_procinit(vcp->vcp_name);

		create_memory_map(vcp);
		ret = alloc_guest_mem(vcp);
		if (ret) {
			errno = ret;
			fatal("could not allocate guest memory - exiting");
		}

		ret = vmm_create_vm(vcp);
		current_vm = vm;

		/* send back the kernel-generated vm id (0 on error) */
		close(fds[0]);
		if (write(fds[1], &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
		    sizeof(vcp->vcp_id))
			fatal("write vcp id");
		close(fds[1]);

		if (ret) {
			errno = ret;
			fatal("create vmm ioctl failed - exiting");
		}

#if 0
		/*
		 * pledge in the vm processes:
		 * stdio - for malloc and basic I/O including events.
		 * vmm - for the vmm ioctls and operations.
		 */
		if (XXX("stdio vmm", NULL) == -1)
			fatal("pledge");
#endif

		/*
		 * Set up default "flat 32 bit" register state - RIP,
		 * RSP, and GDT info will be set in bootloader
		 */
		memcpy(&vrs, &vcpu_init_flat32, sizeof(struct vcpu_reg_state));

		/* Load kernel image */
		ret = loadelf_main(vm->vm_kernel, vcp, &vrs);
		if (ret) {
			errno = ret;
			fatal("failed to load kernel - exiting");
		}

		close(vm->vm_kernel);

		con_fd = vm->vm_tty;
		if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
			fatal("failed to set nonblocking mode on console");

		/* Execute the vcpu run loop(s) for this VM */
		ret = run_vm(vm->vm_disks, vm->vm_ifs, vcp, &vrs);

		_exit(ret != 0);
	}

	return (0);

 err:
	vm_remove(vm);

	return (ret);
}

/*
 * get_info_vm
 *
 * Returns a list of VMs known to vmm(4).
 *
 * Parameters:
 *  ps: the privsep context.
 *  imsg: the received imsg including the peer id, or NULL when
 *      terminating.
 *  terminate: terminate each listed VM instead of reporting it.
 *
 * Return values:
 *  0: success
 *  !0 : failure (eg, ENOMEM, EIO or another error code from vmm(4) ioctl)
 */
int
get_info_vm(struct privsep *ps, struct imsg *imsg, int terminate)
{
	int ret;
	size_t ct, i;
	struct vm_info_params vip;
	struct vm_info_result *info;
	struct vm_terminate_params vtp;
	struct vmop_info_result vir;

	/*
	 * We issue the VMM_IOC_INFO ioctl twice, once with an input
	 * buffer size of 0, which results in vmm(4) returning the
	 * number of bytes required back to us in vip.vip_size,
	 * and then we call it again after malloc'ing the required
	 * number of bytes.
	 *
	 * It is possible that we could fail a second time (eg, if
	 * another VM was created in the instant between the two
	 * ioctls), but in that case the caller can just try again
	 * as vmm(4) will return a zero-sized list.
	 */
	vip.vip_size = 0;
	info = NULL;
	ret = 0;
	memset(&vir, 0, sizeof(vir));

	/* First ioctl to see how many bytes needed (vip.vip_size) */
	if (ioctl(env->vmd_fd, VMM_IOC_INFO, &vip) < 0)
		return (errno);

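	/* With a zero-sized buffer vmm(4) must not have returned any info */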
	if (vip.vip_info_ct != 0)
		return (EIO);

	info = malloc(vip.vip_size);
	if (info == NULL)
		return (ENOMEM);

	/* Second ioctl to get the actual list */
	vip.vip_info = info;
	if (ioctl(env->vmd_fd, VMM_IOC_INFO, &vip) < 0) {
		ret = errno;
		free(info);
		return (ret);
	}

	/* Return info */
	ct = vip.vip_size / sizeof(struct vm_info_result);
	for (i = 0; i < ct; i++) {
		if (terminate) {
			vtp.vtp_vm_id = info[i].vir_id;
			if ((ret = terminate_vm(&vtp)) != 0) {
				free(info);
				return (ret);
			}
			log_debug("%s: terminated VM %s (id %d)", __func__,
			    info[i].vir_name, info[i].vir_id);
			continue;
		}
		memcpy(&vir.vir_info, &info[i], sizeof(vir.vir_info));
		if (proc_compose_imsg(ps, PROC_PARENT, -1,
		    IMSG_VMDOP_GET_INFO_VM_DATA, imsg->hdr.peerid, -1,
		    &vir, sizeof(vir)) == -1) {
			free(info);
			return (EIO);
		}
	}
	free(info);
	return (0);
}


/*
 * start_client_vmd
 *
 * Forks a child vmd for a single VM. The vmm process is already
 * chrooted and running without privileges (this is handled by
 * proc_run()), so the child inherits that environment and no further
 * privilege dropping happens here.
 *
 * Return values (returns to both child and parent on success):
 *  -1 : failure
 *  0: returned in the child vmd
 *  !0 : returned in the parent vmd - the child's pid
 */
int
start_client_vmd(void)
{
	int child_pid;

	child_pid = fork();
	if (child_pid < 0)
		return (-1);

	if (!child_pid) {
		/* child, already running without privileges */
		return (0);
	}

	/* Parent */
	return (child_pid);
}

/*
 * create_memory_map
 *
 * Sets up the guest physical memory ranges that the VM can access.
 *
 * Return values:
 *  nothing
 */
void
create_memory_map(struct vm_create_params *vcp)
{
	size_t len, mem_bytes, mem_mb;

	mem_mb = vcp->vcp_memranges[0].vmr_size;
	vcp->vcp_nmemranges = 0;
	if (mem_mb < 1 || mem_mb > VMM_MAX_VM_MEM_SIZE)
		return;

	mem_bytes = mem_mb * 1024 * 1024;

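	/*
	 * Guest physical memory layout created below:
	 *   range 0: [0, LOWMEM_KB * 1024)          low ("DOS") memory
	 *   range 1: [LOWMEM_KB * 1024, 1MB)        ROM/VGA area, backed
	 *   range 2: [1MB, at most the MMIO base)   main memory
	 *   range 3: above VMM_PCI_MMIO_BAR_END     remaining memory, if any
	 */
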
	/* First memory region: 0 - LOWMEM_KB (DOS low mem) */
	len = LOWMEM_KB * 1024;
	vcp->vcp_memranges[0].vmr_gpa = 0x0;
	vcp->vcp_memranges[0].vmr_size = len;
	mem_bytes -= len;

	/*
	 * Second memory region: LOWMEM_KB - 1MB.
	 *
	 * N.B. - Normally ROMs or parts of video RAM are mapped here.
	 * We have to add this region, because some systems
	 * unconditionally write to 0xb8000 (VGA RAM), and
	 * we need to make sure that vmm(4) permits accesses
	 * to it. So allocate guest memory for it.
	 */
	len = 0x100000 - LOWMEM_KB * 1024;
	vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
	vcp->vcp_memranges[1].vmr_size = len;
	mem_bytes -= len;

	/* Make sure that we do not place physical memory into MMIO ranges. */
	if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - 0x100000)
		len = VMM_PCI_MMIO_BAR_BASE - 0x100000;
	else
		len = mem_bytes;

	/* Third memory region: 1MB - (1MB + len) */
	vcp->vcp_memranges[2].vmr_gpa = 0x100000;
	vcp->vcp_memranges[2].vmr_size = len;
	mem_bytes -= len;

	if (mem_bytes > 0) {
		/* Fourth memory region for the remaining memory (if any) */
		vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
		vcp->vcp_memranges[3].vmr_size = mem_bytes;
		vcp->vcp_nmemranges = 4;
	} else
		vcp->vcp_nmemranges = 3;
}

/*
 * alloc_guest_mem
 *
 * Allocates memory for the guest.
 * Instead of doing a single allocation with one mmap(), we allocate memory
 * separately for every range for the following reasons:
 * - ASLR for the individual ranges
 * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
 *   map the single mmap'd userspace memory to the individual guest physical
 *   memory ranges, the underlying amap of the single mmap'd range would have
 *   to allocate per-page reference counters. The reason is that the
 *   individual guest physical ranges would reference the single mmap'd region
 *   only partially. However, if every guest physical range has its own
 *   corresponding mmap'd userspace allocation, there are no partial
 *   references: every guest physical range fully references an mmap'd
 *   range => no per-page reference counters have to be allocated.
 *
 * Return values:
 *  0: success
 *  !0: failure - errno indicating the source of the failure
 */
int
alloc_guest_mem(struct vm_create_params *vcp)
{
	void *p;
	int ret;
	size_t i, j;
	struct vm_mem_range *vmr;

	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANON, -1, 0);
		if (p == MAP_FAILED) {
			ret = errno;
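			/* Unwind the mappings made so far */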
			for (j = 0; j < i; j++) {
				vmr = &vcp->vcp_memranges[j];
				munmap((void *)vmr->vmr_va, vmr->vmr_size);
			}

			return (ret);
		}

		vmr->vmr_va = (vaddr_t)p;
	}

	return (0);
}

/*
 * vmm_create_vm
 *
 * Requests vmm(4) to create a new VM using the supplied creation
 * parameters. This operation results in the creation of the in-kernel
 * structures for the VM, but does not start the VM's vcpu(s).
 *
 * Parameters:
 *  vcp: vm_create_params struct containing the VM's desired creation
 *      configuration
 *
 * Return values:
 *  0: success
 *  !0 : ioctl to vmm(4) failed
 */
int
vmm_create_vm(struct vm_create_params *vcp)
{
	/* Sanity check arguments */
	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nmemranges == 0 ||
	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
		return (EINVAL);

	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
		return (EINVAL);

	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) < 0)
		return (errno);

	return (0);
}

/*
 * init_emulated_hw
 *
 * Initializes the userspace hardware emulation
 */
void
init_emulated_hw(struct vm_create_params *vcp, int *child_disks,
    int *child_taps)
{
	int i;

	/* Reset the IO port map */
	memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);

	/* Init i8253 PIT */
	i8253_init(vcp->vcp_id);
	ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
	ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;

	/* Init mc146818 RTC */
	mc146818_init(vcp->vcp_id);
	ioports_map[IO_RTC] = vcpu_exit_mc146818;
	ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;

	/* Init master and slave PICs */
	i8259_init();
	ioports_map[IO_ICU1] = vcpu_exit_i8259;
	ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
	ioports_map[IO_ICU2] = vcpu_exit_i8259;
	ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;

	/* Init ns8250 UART */
	ns8250_init(con_fd);
	for (i = COM1_DATA; i <= COM1_SCR; i++)
		ioports_map[i] = vcpu_exit_com;

	/* Initialize PCI */
	for (i = VMM_PCI_IO_BAR_BASE; i <= VMM_PCI_IO_BAR_END; i++)
		ioports_map[i] = vcpu_exit_pci;

	ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
	ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
	pci_init();

	/* Initialize virtio devices */
	virtio_init(vcp, child_disks, child_taps);
}

/*
 * run_vm
 *
 * Runs the VM whose creation parameters are specified in vcp
 *
 * Parameters:
 *  child_disks: previously-opened child VM disk file descriptors
 *  child_taps: previously-opened child tap file descriptors
 *  vcp: vm_create_params struct containing the VM's desired creation
 *      configuration
 *  vrs: VCPU register state to initialize
 *
 * Return values:
 *  0: the VM exited normally
 *  !0 : the VM exited abnormally or failed to start
 */
int
run_vm(int *child_disks, int *child_taps, struct vm_create_params *vcp,
    struct vcpu_reg_state *vrs)
{
	size_t i;
	int ret;
	pthread_t *tid;
	struct vm_run_params **vrp;
#if 0
	void *exit_status;
#endif

	if (vcp == NULL)
		return (EINVAL);

	if (child_disks == NULL && vcp->vcp_ndisks != 0)
		return (EINVAL);

	if (child_taps == NULL && vcp->vcp_nnics != 0)
		return (EINVAL);

	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
		return (EINVAL);

	if (vcp->vcp_nmemranges == 0 ||
	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
		return (EINVAL);

	ret = 0;

	event_init();

	tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
	vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
	if (tid == NULL || vrp == NULL) {
		log_warn("%s: memory allocation error - exiting.",
		    __progname);
		return (ENOMEM);
	}

	log_debug("%s: initializing hardware for vm %s", __func__,
	    vcp->vcp_name);

	init_emulated_hw(vcp, child_disks, child_taps);

	log_debug("%s: starting vcpu threads for vm %s", __func__,
	    vcp->vcp_name);

	/*
	 * Create and launch one thread for each VCPU. These threads may
	 * migrate between PCPUs over time; the need to reload CPU state
	 * in such situations is detected and performed by vmm(4) in the
	 * kernel.
	 */
	for (i = 0; i < vcp->vcp_ncpus; i++) {
		vrp[i] = malloc(sizeof(struct vm_run_params));
		if (vrp[i] == NULL) {
			log_warn("%s: memory allocation error - "
			    "exiting.", __progname);
			/* caller will exit, so skip free'ing */
			return (ENOMEM);
		}
		vrp[i]->vrp_exit = malloc(sizeof(union vm_exit));
		if (vrp[i]->vrp_exit == NULL) {
			log_warn("%s: memory allocation error - "
			    "exiting.", __progname);
			/* caller will exit, so skip free'ing */
			return (ENOMEM);
		}
		vrp[i]->vrp_vm_id = vcp->vcp_id;
		vrp[i]->vrp_vcpu_id = i;

		if (vcpu_reset(vcp->vcp_id, i, vrs)) {
			log_warnx("%s: cannot reset VCPU %zu - exiting.",
			    __progname, i);
			return (EIO);
		}

		ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize cond var (%d)",
			    __progname, ret);
			return (ret);
		}

		ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
		if (ret) {
			log_warnx("%s: cannot initialize mtx (%d)",
			    __progname, ret);
			return (ret);
		}

		vcpu_hlt[i] = 0;

		/* Start each VCPU run thread at vcpu_run_loop */
		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
		if (ret) {
			/* caller will _exit after this return */
			return (ret);
		}
	}

	log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
	ret = event_dispatch();

#if 0
	/* XXX need to handle clean exits now */

	/* Wait for all the threads to exit */
	for (i = 0; i < vcp->vcp_ncpus; i++) {
		if (pthread_join(tid[i], &exit_status)) {
			log_warnx("%s: failed to join thread %zd - "
			    "exiting", __progname, i);
			return (EIO);
		}

		if (exit_status != NULL) {
			log_warnx("%s: vm %d vcpu run thread %zd exited "
			    "abnormally", __progname, vcp->vcp_id, i);
			ret = EIO;
		}
	}
#endif

	return (ret);
}

/*
 * vcpu_run_loop
 *
 * Runs a single VCPU until vmm(4) requires help handling an exit,
 * or the VM terminates.
 *
 * Parameters:
 *  arg: vm_run_params for the VCPU being run by this thread
 *
 * Return values:
 *  NULL: the VCPU shut down properly
 *  !NULL: error processing VCPU run, or the VCPU shut down abnormally
 */
void *
vcpu_run_loop(void *arg)
{
	struct vm_run_params *vrp = (struct vm_run_params *)arg;
	intptr_t ret;
	int irq;
	uint32_t n;

	vrp->vrp_continue = 0;
	n = vrp->vrp_vcpu_id;

	for (;;) {
		ret = pthread_mutex_lock(&vcpu_run_mtx[n]);

		if (ret) {
			log_warnx("%s: can't lock vcpu run mtx (%d)",
			    __func__, (int)ret);
			return ((void *)ret);
		}

		/* If we are halted, wait */
		if (vcpu_hlt[n]) {
			ret = pthread_cond_wait(&vcpu_run_cond[n],
			    &vcpu_run_mtx[n]);

			if (ret) {
				log_warnx("%s: can't wait on cond (%d)",
				    __func__, (int)ret);
				(void)pthread_mutex_unlock(&vcpu_run_mtx[n]);
				return ((void *)ret);
			}
		}

		ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);
		if (ret) {
			log_warnx("%s: can't unlock mutex on cond (%d)",
			    __func__, (int)ret);
			return ((void *)ret);
		}

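		/*
		 * If an interrupt is pending and the guest can accept
		 * one, ack it on the PIC and hand it to vmm(4) in
		 * vrp_irq; 0xFFFF means no interrupt is pending.
		 */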
		if (vrp->vrp_irqready && i8259_is_pending()) {
			irq = i8259_ack();
			vrp->vrp_irq = irq;
		} else
			vrp->vrp_irq = 0xFFFF;

		/* Still more pending? */
		if (i8259_is_pending()) {
			/*
			 * XXX can probably avoid ioctls here by
			 * providing intr in vrp
			 */
			if (vcpu_pic_intr(vrp->vrp_vm_id,
			    vrp->vrp_vcpu_id, 1))
				fatal("can't set INTR");
		} else {
			if (vcpu_pic_intr(vrp->vrp_vm_id,
			    vrp->vrp_vcpu_id, 0))
				fatal("can't clear INTR");
		}

		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) < 0) {
			/* If run ioctl failed, exit */
			ret = errno;
			log_warn("%s: vm %d / vcpu %d run ioctl failed",
			    __func__, vrp->vrp_vm_id, n);
			return ((void *)ret);
		}

		/* If the VM is terminating, exit normally */
		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED)
			return (NULL);

		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
			/*
			 * vmm(4) needs help handling an exit, handle in
			 * vcpu_exit.
			 */
			if (vcpu_exit(vrp))
				return ((void *)EIO);
		}
	}

	return (NULL);
}

int
vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
{
	struct vm_intr_params vip;

	memset(&vip, 0, sizeof(vip));

	vip.vip_vm_id = vm_id;
	vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
	vip.vip_intr = intr;

	if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) < 0)
		return (errno);

	return (0);
}

/*
 * vcpu_exit_pci
 *
 * Handle all I/O to the emulated PCI subsystem.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return value:
 *  Interrupt to inject to the guest VM, or 0xFF if no interrupt should
 *      be injected.
 */
uint8_t
vcpu_exit_pci(struct vm_run_params *vrp)
{
	union vm_exit *vei = vrp->vrp_exit;
	uint8_t intr;

	intr = 0xFF;

	switch (vei->vei.vei_port) {
	case PCI_MODE1_ADDRESS_REG:
		pci_handle_address_reg(vrp);
		break;
	case PCI_MODE1_DATA_REG:
		pci_handle_data_reg(vrp);
		break;
	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
		intr = pci_handle_io(vrp);
		break;
	default:
		log_warnx("%s: unknown PCI register 0x%llx",
		    __progname, (uint64_t)vei->vei.vei_port);
		break;
	}

	return (intr);
}

/*
 * vcpu_exit_inout
 *
 * Handle all I/O exits that need to be emulated in vmd. This includes the
 * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 */
void
vcpu_exit_inout(struct vm_run_params *vrp)
{
	union vm_exit *vei = vrp->vrp_exit;
	uint8_t intr = 0xFF;

	if (ioports_map[vei->vei.vei_port] != NULL)
		intr = ioports_map[vei->vei.vei_port](vrp);
	else if (vei->vei.vei_dir == VEI_DIR_IN)
		vei->vei.vei_data = 0xFFFFFFFF;

	if (intr != 0xFF)
		vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
}

/*
 * vcpu_exit
 *
 * Handle a vcpu exit. This function is called when it is determined that
 * vmm(4) requires the assistance of vmd to support a particular guest
 * exit type (eg, accessing an I/O port or device). Guest state is contained
 * in 'vrp', and will be resent to vmm(4) on exit completion.
 *
 * Upon conclusion of handling the exit, the function determines if any
 * interrupts should be injected into the guest, and asserts the proper
 * IRQ line whose interrupt should be vectored.
 *
 * Parameters:
 *  vrp: vcpu run parameters containing guest state for this exit
 *
 * Return values:
 *  0: the exit was handled successfully
 *  1: an error occurred (eg, unknown exit reason passed in 'vrp')
 */
int
vcpu_exit(struct vm_run_params *vrp)
{
	int ret;

	switch (vrp->vrp_exit_reason) {
	case VMX_EXIT_IO:
		vcpu_exit_inout(vrp);
		break;
	case VMX_EXIT_HLT:
		ret = pthread_mutex_lock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
		if (ret) {
			log_warnx("%s: can't lock vcpu mutex (%d)",
			    __func__, ret);
			return (1);
		}
		vcpu_hlt[vrp->vrp_vcpu_id] = 1;
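		/*
		 * vcpu_run_loop() will now block on vcpu_run_cond until
		 * vcpu_assert_pic_irq() clears vcpu_hlt and signals it.
		 */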
		ret = pthread_mutex_unlock(&vcpu_run_mtx[vrp->vrp_vcpu_id]);
		if (ret) {
			log_warnx("%s: can't unlock vcpu mutex (%d)",
			    __func__, ret);
			return (1);
		}
		break;
	case VMX_EXIT_INT_WINDOW:
		break;
	default:
		log_warnx("%s: unknown exit reason %d",
		    __progname, vrp->vrp_exit_reason);
		return (1);
	}

	/* XXX this may not be irq 9 all the time */
	/* XXX change this to poll on the tap interface */
	if (vionet_process_rx())
		vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, 9);

	/* XXX temporary until this is polled */
	if (vcpu_com1_needs_intr())
		vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, 4);

	vrp->vrp_continue = 1;

	return (0);
}

/*
 * find_gpa_range
 *
 * Search for a contiguous guest physical mem range.
 *
 * Parameters:
 *  vcp: VM create parameters that contain the memory map to search in
 *  gpa: the starting guest physical address
 *  len: the length of the memory range
 *
 * Return values:
 *  NULL: if there is no contiguous guest physical memory range covering
 *      [gpa, gpa + len)
 *  Pointer to the vm_mem_range that contains the start of the range
 *      otherwise.
 */
static struct vm_mem_range *
find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
{
	size_t i, n;
	struct vm_mem_range *vmr, *first;

	/* Find the first vm_mem_range that contains gpa */
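	/* (create_memory_map() lays the ranges out in ascending gpa order) */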
	for (i = 0; i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (vmr->vmr_gpa + vmr->vmr_size >= gpa)
			break;
	}

	/* No range found. */
	if (i == vcp->vcp_nmemranges)
		return (NULL);
	first = vmr;

	/*
	 * vmr may cover the range [gpa, gpa + len) only partly. Make
	 * sure that the following vm_mem_ranges are contiguous and
	 * cover the rest.
	 */
	n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
	if (len < n)
		len = 0;
	else
		len -= n;
	gpa = vmr->vmr_gpa + vmr->vmr_size;
	for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
		vmr = &vcp->vcp_memranges[i];
		if (gpa != vmr->vmr_gpa)
			return (NULL);
		if (len <= vmr->vmr_size)
			len = 0;
		else
			len -= vmr->vmr_size;

		gpa = vmr->vmr_gpa + vmr->vmr_size;
	}

	if (len != 0)
		return (NULL);

	/* Return the range containing the start, as the callers expect */
	return (first);
}

/*
 * write_mem
 *
 * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
 *
 * Parameters:
 *  dst: the destination paddr_t in the guest VM
 *  buf: data to copy
 *  len: number of bytes to copy
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [dst, dst + len) does not
 *      exist in the guest.
 */
int
write_mem(paddr_t dst, void *buf, size_t len)
{
	char *from = buf, *to;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params, dst, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range dst = 0x%lx, "
		    "len = 0x%zx", __func__, dst, len);
		return (EINVAL);
	}

	off = dst - vmr->vmr_gpa;
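	/* Copy chunk by chunk, advancing through the contiguous ranges */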
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		to = (char *)vmr->vmr_va + off;
		memcpy(to, from, n);

		from += n;
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}

/*
 * read_mem
 *
 * Reads memory at guest paddr 'src' into 'buf'.
 *
 * Parameters:
 *  src: the source paddr_t in the guest VM to read from.
 *  buf: destination (local) buffer
 *  len: number of bytes to read
 *
 * Return values:
 *  0: success
 *  EINVAL: if the guest physical memory range [src, src + len) does not
 *      exist in the guest.
 */
int
read_mem(paddr_t src, void *buf, size_t len)
{
	char *from, *to = buf;
	size_t n, off;
	struct vm_mem_range *vmr;

	vmr = find_gpa_range(&current_vm->vm_params, src, len);
	if (vmr == NULL) {
		errno = EINVAL;
		log_warn("%s: failed - invalid memory range src = 0x%lx, "
		    "len = 0x%zx", __func__, src, len);
		return (EINVAL);
	}

	off = src - vmr->vmr_gpa;
	while (len != 0) {
		n = vmr->vmr_size - off;
		if (len < n)
			n = len;

		from = (char *)vmr->vmr_va + off;
		memcpy(to, from, n);

		to += n;
		len -= n;
		off = 0;
		vmr++;
	}

	return (0);
}

/*
 * vcpu_assert_pic_irq
 *
 * Injects the specified IRQ on the supplied vcpu/vm
 *
 * Parameters:
 *  vm_id: VM ID to inject to
 *  vcpu_id: VCPU ID to inject to
 *  irq: IRQ to inject
 */
void
vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
{
	int ret;

	i8259_assert_irq(irq);

	if (i8259_is_pending()) {
		if (vcpu_pic_intr(vm_id, vcpu_id, 1))
			fatalx("%s: can't assert INTR", __func__);

		ret = pthread_mutex_lock(&vcpu_run_mtx[vcpu_id]);
		if (ret)
			fatalx("%s: can't lock vcpu mtx (%d)", __func__, ret);

		vcpu_hlt[vcpu_id] = 0;
		ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
		if (ret)
			fatalx("%s: can't signal (%d)", __func__, ret);
		ret = pthread_mutex_unlock(&vcpu_run_mtx[vcpu_id]);
		if (ret)
			fatalx("%s: can't unlock vcpu mtx (%d)", __func__, ret);
	}
}

/*
 * fd_hasdata
 *
 * Determines if data can be read from a file descriptor.
 *
 * Parameters:
 *  fd: the fd to check
 *
 * Return values:
 *  1 if data can be read from an fd, or 0 otherwise.
 */
int
fd_hasdata(int fd)
{
	struct pollfd pfd[1];
	int nready, hasdata = 0;

	pfd[0].fd = fd;
	pfd[0].events = POLLIN;
	nready = poll(pfd, 1, 0);
	if (nready == -1)
		log_warn("checking file descriptor for data failed");
	else if (nready == 1 && pfd[0].revents & POLLIN)
		hasdata = 1;
	return (hasdata);
}