xref: /openbsd/usr.sbin/vmd/vm.c (revision d12ef5f3)
1 /*	$OpenBSD: vm.c,v 1.110 2024/11/21 13:25:30 claudio Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/param.h>	/* PAGE_SIZE, MAXCOMLEN */
20 #include <sys/types.h>
21 #include <sys/ioctl.h>
22 #include <sys/mman.h>
23 #include <sys/resource.h>
24 
25 #include <dev/vmm/vmm.h>
26 
27 #include <errno.h>
28 #include <event.h>
29 #include <fcntl.h>
30 #include <imsg.h>
31 #include <poll.h>
32 #include <pthread.h>
33 #include <pthread_np.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <string.h>
37 #include <unistd.h>
38 #include <util.h>
39 
40 #include "atomicio.h"
41 #include "pci.h"
42 #include "virtio.h"
43 #include "vmd.h"
44 
45 #define MMIO_NOTYET 0
46 
47 static int run_vm(struct vmop_create_params *, struct vcpu_reg_state *);
48 static void vm_dispatch_vmm(int, short, void *);
49 static void *event_thread(void *);
50 static void *vcpu_run_loop(void *);
51 static int vmm_create_vm(struct vmd_vm *);
52 static int alloc_guest_mem(struct vmd_vm *);
53 static int send_vm(int, struct vmd_vm *);
54 static int dump_vmr(int, struct vm_mem_range *);
55 static int dump_mem(int, struct vmd_vm *);
56 static void restore_vmr(int, struct vm_mem_range *);
57 static void restore_mem(int, struct vm_create_params *);
58 static int restore_vm_params(int, struct vm_create_params *);
59 static void pause_vm(struct vmd_vm *);
60 static void unpause_vm(struct vmd_vm *);
61 static int start_vm(struct vmd_vm *, int);
62 
63 int con_fd;
64 struct vmd_vm *current_vm;
65 
66 extern struct vmd *env;
67 
68 extern char *__progname;
69 
70 pthread_mutex_t threadmutex;
71 pthread_cond_t threadcond;
72 
73 pthread_cond_t vcpu_run_cond[VMM_MAX_VCPUS_PER_VM];
74 pthread_mutex_t vcpu_run_mtx[VMM_MAX_VCPUS_PER_VM];
75 pthread_barrier_t vm_pause_barrier;
76 pthread_cond_t vcpu_unpause_cond[VMM_MAX_VCPUS_PER_VM];
77 pthread_mutex_t vcpu_unpause_mtx[VMM_MAX_VCPUS_PER_VM];
78 
79 pthread_mutex_t vm_mtx;
80 uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
81 uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];
82 
83 /*
84  * vm_main
85  *
86  * Primary entrypoint for launching a vm. Does not return.
87  *
88  * fd: file descriptor for communicating with vmm process.
89  * fd_vmm: file descriptor for communicating with vmm(4) device
90  */
91 void
92 vm_main(int fd, int fd_vmm)
93 {
94 	struct vm_create_params	*vcp = NULL;
95 	struct vmd_vm		 vm;
96 	size_t			 sz = 0;
97 	int			 ret = 0;
98 
99 	/*
100 	 * The vm process relies on global state. Set the fd for /dev/vmm.
101 	 */
102 	env->vmd_fd = fd_vmm;
103 
104 	/*
105 	 * We aren't root, so we can't chroot(2). Use unveil(2) instead.
106 	 */
107 	if (unveil(env->argv0, "x") == -1)
108 		fatal("unveil %s", env->argv0);
109 	if (unveil(NULL, NULL) == -1)
110 		fatal("unveil lock");
111 
112 	/*
113 	 * pledge in the vm processes:
114 	 * stdio - for malloc and basic I/O including events.
115 	 * vmm - for the vmm ioctls and operations.
116 	 * proc exec - fork/exec for launching devices.
117 	 * recvfd - for vm send/recv and sending fd to devices.
118 	 */
119 	if (pledge("stdio vmm proc exec recvfd", NULL) == -1)
120 		fatal("pledge");
121 
122 	/* Receive our vm configuration. */
123 	memset(&vm, 0, sizeof(vm));
124 	sz = atomicio(read, fd, &vm, sizeof(vm));
125 	if (sz != sizeof(vm)) {
126 		log_warnx("failed to receive start message");
127 		_exit(EIO);
128 	}
129 
130 	/* Update process with the vm name. */
131 	vcp = &vm.vm_params.vmc_params;
132 	setproctitle("%s", vcp->vcp_name);
133 	log_procinit("vm/%s", vcp->vcp_name);
134 
135 	/* Receive the local prefix settings. */
136 	sz = atomicio(read, fd, &env->vmd_cfg.cfg_localprefix,
137 	    sizeof(env->vmd_cfg.cfg_localprefix));
138 	if (sz != sizeof(env->vmd_cfg.cfg_localprefix)) {
139 		log_warnx("failed to receive local prefix");
140 		_exit(EIO);
141 	}
142 
143 	/*
144 	 * We need, at minimum, a vm_kernel fd to boot a vm. This is either a
145 	 * kernel or a BIOS image.
146 	 */
147 	if (!(vm.vm_state & VM_STATE_RECEIVED)) {
148 		if (vm.vm_kernel == -1) {
149 			log_warnx("%s: failed to receive boot fd",
150 			    vcp->vcp_name);
151 			_exit(EINVAL);
152 		}
153 	}
154 
155 	if (vcp->vcp_sev && env->vmd_psp_fd < 0) {
156 		log_warnx("%s not available", PSP_NODE);
157 		_exit(EINVAL);
158 	}
159 
160 	ret = start_vm(&vm, fd);
161 	_exit(ret);
162 }
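/*
 * A minimal sketch of the writes vm_main() expects its parent to have
 * performed on the other end of fd, mirroring the atomicio() reads above
 * (hedged: the parent-side code lives elsewhere in vmd and "cfg" is an
 * illustrative name, not one defined in this file):
 *
 *	atomicio(vwrite, fd, &vm, sizeof(vm));
 *	atomicio(vwrite, fd, &cfg->cfg_localprefix,
 *	    sizeof(cfg->cfg_localprefix));
 */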
163 
164 /*
165  * start_vm
166  *
167  * After forking a new VM process, starts the new VM with the creation
168  * parameters supplied (in the incoming vm->vm_params field). This
169  * function performs a basic sanity check on the incoming parameters
170  * and then performs the following steps to complete the creation of the VM:
171  *
172  * 1. validates and creates the new VM
173  * 2. opens the imsg control channel to the parent and drops more privilege
174  * 3. drops additional privileges by calling pledge(2)
175  * 4. loads the kernel from the disk image or file descriptor
176  * 5. runs the VM's VCPU loops.
177  *
178  * Parameters:
179  *  vm: The VM data structure, including the VM create parameters.
180  *  fd: The imsg socket that is connected to the parent process.
181  *
182  * Return values:
183  *  0: success
184  *  !0 : failure - typically an errno indicating the source of the failure
185  */
186 int
187 start_vm(struct vmd_vm *vm, int fd)
188 {
189 	struct vmop_create_params *vmc = &vm->vm_params;
190 	struct vm_create_params	*vcp = &vmc->vmc_params;
191 	struct vcpu_reg_state	 vrs;
192 	int			 nicfds[VM_MAX_NICS_PER_VM];
193 	int			 ret;
194 	size_t			 i;
195 	struct vm_rwregs_params  vrp;
196 
197 	/*
198 	 * We first try to initialize and allocate memory before bothering
199 	 * vmm(4) with a request to create a new vm.
200 	 */
201 	if (!(vm->vm_state & VM_STATE_RECEIVED))
202 		create_memory_map(vcp);
203 
204 	ret = alloc_guest_mem(vm);
205 	if (ret) {
206 		struct rlimit lim;
207 		char buf[FMT_SCALED_STRSIZE];
208 		if (ret == ENOMEM && getrlimit(RLIMIT_DATA, &lim) == 0) {
209 			if (fmt_scaled(lim.rlim_cur, buf) == 0)
210 				fatalx("could not allocate guest memory (data "
211 				    "limit is %s)", buf);
212 		}
213 		errno = ret;
214 		log_warn("could not allocate guest memory");
215 		return (ret);
216 	}
217 
218 	/* We've allocated guest memory, so now create the vm in vmm(4). */
219 	ret = vmm_create_vm(vm);
220 	if (ret) {
221 		/* Let the vmm process know we failed by sending a 0 vm id. */
222 		vcp->vcp_id = 0;
223 		atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id));
224 		return (ret);
225 	}
226 
227 	/* Setup SEV. */
228 	ret = sev_init(vm);
229 	if (ret) {
230 		log_warnx("could not initialize SEV");
231 		return (ret);
232 	}
233 
234 	/*
235 	 * Parts of vmd currently rely on global state (current_vm, con_fd).
236 	 */
237 	current_vm = vm;
238 	con_fd = vm->vm_tty;
239 	if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1) {
240 		log_warn("failed to set nonblocking mode on console");
241 		return (1);
242 	}
243 
244 	/*
245 	 * We now let the vmm process know we were successful by sending it our
246 	 * vmm(4) assigned vm id.
247 	 */
248 	if (atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
249 	    sizeof(vcp->vcp_id)) {
250 		log_warn("failed to send created vm id to vmm process");
251 		return (1);
252 	}
253 
254 	/* Either prepare our boot image or receive an existing vm to launch. */
255 	if (vm->vm_state & VM_STATE_RECEIVED) {
256 		ret = atomicio(read, vm->vm_receive_fd, &vrp, sizeof(vrp));
257 		if (ret != sizeof(vrp))
258 			fatal("received incomplete vrp - exiting");
259 		vrs = vrp.vrwp_regs;
260 	} else if (load_firmware(vm, &vrs))
261 		fatalx("failed to load kernel or firmware image");
262 
263 	if (vm->vm_kernel != -1)
264 		close_fd(vm->vm_kernel);
265 
266 	/* Initialize our mutexes. */
267 	ret = pthread_mutex_init(&threadmutex, NULL);
268 	if (ret) {
269 		log_warn("%s: could not initialize thread state mutex",
270 		    __func__);
271 		return (ret);
272 	}
273 	ret = pthread_cond_init(&threadcond, NULL);
274 	if (ret) {
275 		log_warn("%s: could not initialize thread state "
276 		    "condition variable", __func__);
277 		return (ret);
278 	}
279 	ret = pthread_mutex_init(&vm_mtx, NULL);
280 	if (ret) {
281 		log_warn("%s: could not initialize vm state mutex",
282 		    __func__);
283 		return (ret);
284 	}
285 
286 	/* Lock thread mutex now. It's unlocked when waiting on threadcond. */
287 	mutex_lock(&threadmutex);
288 
289 	/*
290 	 * Finalize our communication socket with the vmm process. From here
291 	 * onwards, communication with the vmm process is event-based.
292 	 */
293 	event_init();
294 	if (vmm_pipe(vm, fd, vm_dispatch_vmm) == -1)
295 		fatal("setup vm pipe");
296 
297 	/*
298 	 * Initialize or restore our emulated hardware.
299 	 */
300 	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
301 		nicfds[i] = vm->vm_ifs[i].vif_fd;
302 
303 	if (vm->vm_state & VM_STATE_RECEIVED) {
304 		restore_mem(vm->vm_receive_fd, vcp);
305 		restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds,
306 		    vm->vm_disks, vm->vm_cdrom);
307 		if (restore_vm_params(vm->vm_receive_fd, vcp))
308 			fatal("restore vm params failed");
309 		unpause_vm(vm);
310 	} else
311 		init_emulated_hw(vmc, vm->vm_cdrom, vm->vm_disks, nicfds);
312 
313 	/* Drop privileges further before starting the vcpu run loop(s). */
314 	if (pledge("stdio vmm recvfd", NULL) == -1)
315 		fatal("pledge");
316 
317 	/*
318 	 * Execute the vcpu run loop(s) for this VM.
319 	 */
320 	ret = run_vm(&vm->vm_params, &vrs);
321 
322 	/* Shutdown SEV. */
323 	if (sev_shutdown(vm))
324 		log_warnx("%s: could not shutdown SEV", __func__);
325 
326 	/* Ensure that any in-flight data is written back */
327 	virtio_shutdown(vm);
328 
329 	return (ret);
330 }
331 
332 /*
333  * vm_dispatch_vmm
334  *
335  * imsg callback for messages that are received from the vmm parent process.
336  */
337 void
338 vm_dispatch_vmm(int fd, short event, void *arg)
339 {
340 	struct vmd_vm		*vm = arg;
341 	struct vmop_result	 vmr;
342 	struct vmop_addr_result	 var;
343 	struct imsgev		*iev = &vm->vm_iev;
344 	struct imsgbuf		*ibuf = &iev->ibuf;
345 	struct imsg		 imsg;
346 	ssize_t			 n;
347 	int			 verbose;
348 
349 	if (event & EV_READ) {
350 		if ((n = imsgbuf_read(ibuf)) == -1)
351 			fatal("%s: imsgbuf_read", __func__);
352 		if (n == 0)
353 			_exit(0);
354 	}
355 
356 	if (event & EV_WRITE) {
357 		if (imsgbuf_write(ibuf) == -1) {
358 			if (errno == EPIPE)
359 				_exit(0);
360 			fatal("%s: imsgbuf_write fd %d", __func__, ibuf->fd);
361 		}
362 	}
363 
364 	for (;;) {
365 		if ((n = imsg_get(ibuf, &imsg)) == -1)
366 			fatal("%s: imsg_get", __func__);
367 		if (n == 0)
368 			break;
369 
370 #if DEBUG > 1
371 		log_debug("%s: got imsg %d from %s",
372 		    __func__, imsg.hdr.type,
373 		    vm->vm_params.vmc_params.vcp_name);
374 #endif
375 
376 		switch (imsg.hdr.type) {
377 		case IMSG_CTL_VERBOSE:
378 			IMSG_SIZE_CHECK(&imsg, &verbose);
379 			memcpy(&verbose, imsg.data, sizeof(verbose));
380 			log_setverbose(verbose);
381 			virtio_broadcast_imsg(vm, IMSG_CTL_VERBOSE, &verbose,
382 			    sizeof(verbose));
383 			break;
384 		case IMSG_VMDOP_VM_SHUTDOWN:
385 			if (vmmci_ctl(VMMCI_SHUTDOWN) == -1)
386 				_exit(0);
387 			break;
388 		case IMSG_VMDOP_VM_REBOOT:
389 			if (vmmci_ctl(VMMCI_REBOOT) == -1)
390 				_exit(0);
391 			break;
392 		case IMSG_VMDOP_PAUSE_VM:
393 			vmr.vmr_result = 0;
394 			vmr.vmr_id = vm->vm_vmid;
395 			pause_vm(vm);
396 			imsg_compose_event(&vm->vm_iev,
397 			    IMSG_VMDOP_PAUSE_VM_RESPONSE,
398 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
399 			    sizeof(vmr));
400 			break;
401 		case IMSG_VMDOP_UNPAUSE_VM:
402 			vmr.vmr_result = 0;
403 			vmr.vmr_id = vm->vm_vmid;
404 			unpause_vm(vm);
405 			imsg_compose_event(&vm->vm_iev,
406 			    IMSG_VMDOP_UNPAUSE_VM_RESPONSE,
407 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
408 			    sizeof(vmr));
409 			break;
410 		case IMSG_VMDOP_SEND_VM_REQUEST:
411 			vmr.vmr_id = vm->vm_vmid;
412 			vmr.vmr_result = send_vm(imsg_get_fd(&imsg), vm);
413 			imsg_compose_event(&vm->vm_iev,
414 			    IMSG_VMDOP_SEND_VM_RESPONSE,
415 			    imsg.hdr.peerid, imsg.hdr.pid, -1, &vmr,
416 			    sizeof(vmr));
417 			if (!vmr.vmr_result) {
418 				imsgbuf_flush(&current_vm->vm_iev.ibuf);
419 				_exit(0);
420 			}
421 			break;
422 		case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
423 			IMSG_SIZE_CHECK(&imsg, &var);
424 			memcpy(&var, imsg.data, sizeof(var));
425 
426 			log_debug("%s: received tap addr %s for nic %d",
427 			    vm->vm_params.vmc_params.vcp_name,
428 			    ether_ntoa((void *)var.var_addr), var.var_nic_idx);
429 
430 			vionet_set_hostmac(vm, var.var_nic_idx, var.var_addr);
431 			break;
432 		default:
433 			fatalx("%s: got invalid imsg %d from %s",
434 			    __func__, imsg.hdr.type,
435 			    vm->vm_params.vmc_params.vcp_name);
436 		}
437 		imsg_free(&imsg);
438 	}
439 	imsg_event_add(iev);
440 }
441 
442 /*
443  * vm_shutdown
444  *
445  * Tell the vmm parent process to shutdown or reboot the VM and exit.
446  */
447 __dead void
448 vm_shutdown(unsigned int cmd)
449 {
450 	switch (cmd) {
451 	case VMMCI_NONE:
452 	case VMMCI_SHUTDOWN:
453 		(void)imsg_compose_event(&current_vm->vm_iev,
454 		    IMSG_VMDOP_VM_SHUTDOWN, 0, 0, -1, NULL, 0);
455 		break;
456 	case VMMCI_REBOOT:
457 		(void)imsg_compose_event(&current_vm->vm_iev,
458 		    IMSG_VMDOP_VM_REBOOT, 0, 0, -1, NULL, 0);
459 		break;
460 	default:
461 		fatalx("invalid vm ctl command: %d", cmd);
462 	}
463 	imsgbuf_flush(&current_vm->vm_iev.ibuf);
464 
465 	if (sev_shutdown(current_vm))
466 		log_warnx("%s: could not shutdown SEV", __func__);
467 
468 	_exit(0);
469 }
470 
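/*
 * send_vm
 *
 * Serializes a paused VM to fd for vm send/receive in the following order:
 * dump header, vmop_create_params, per-vcpu register state, guest memory,
 * device state (devs, pci, virtio), then per-vcpu vmm(4) parameters. On
 * success the VM is terminated in vmm(4); on failure it is unpaused again.
 */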
471 int
472 send_vm(int fd, struct vmd_vm *vm)
473 {
474 	struct vm_rwregs_params	   vrp;
475 	struct vm_rwvmparams_params vpp;
476 	struct vmop_create_params *vmc;
477 	struct vm_terminate_params vtp;
478 	unsigned int		   flags = 0;
479 	unsigned int		   i;
480 	int			   ret = 0;
481 	size_t			   sz;
482 
483 	if (dump_send_header(fd)) {
484 		log_warnx("%s: failed to send vm dump header", __func__);
485 		goto err;
486 	}
487 
488 	pause_vm(vm);
489 
490 	vmc = calloc(1, sizeof(struct vmop_create_params));
491 	if (vmc == NULL) {
492 		log_warn("%s: calloc error getting vmc", __func__);
493 		ret = -1;
494 		goto err;
495 	}
496 
497 	flags |= VMOP_CREATE_MEMORY;
498 	memcpy(&vmc->vmc_params, &current_vm->vm_params, sizeof(struct
499 	    vmop_create_params));
500 	vmc->vmc_flags = flags;
501 	vrp.vrwp_vm_id = vm->vm_params.vmc_params.vcp_id;
502 	vrp.vrwp_mask = VM_RWREGS_ALL;
503 	vpp.vpp_mask = VM_RWVMPARAMS_ALL;
504 	vpp.vpp_vm_id = vm->vm_params.vmc_params.vcp_id;
505 
506 	sz = atomicio(vwrite, fd, vmc, sizeof(struct vmop_create_params));
507 	if (sz != sizeof(struct vmop_create_params)) {
508 		ret = -1;
509 		goto err;
510 	}
511 
512 	for (i = 0; i < vm->vm_params.vmc_params.vcp_ncpus; i++) {
513 		vrp.vrwp_vcpu_id = i;
514 		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READREGS, &vrp))) {
515 			log_warn("%s: readregs failed", __func__);
516 			goto err;
517 		}
518 
519 		sz = atomicio(vwrite, fd, &vrp,
520 		    sizeof(struct vm_rwregs_params));
521 		if (sz != sizeof(struct vm_rwregs_params)) {
522 			log_warn("%s: dumping registers failed", __func__);
523 			ret = -1;
524 			goto err;
525 		}
526 	}
527 
528 	/* Dump memory before devices to aid in restoration. */
529 	if ((ret = dump_mem(fd, vm)))
530 		goto err;
531 	if ((ret = dump_devs(fd)))
532 		goto err;
533 	if ((ret = pci_dump(fd)))
534 		goto err;
535 	if ((ret = virtio_dump(fd)))
536 		goto err;
537 
538 	for (i = 0; i < vm->vm_params.vmc_params.vcp_ncpus; i++) {
539 		vpp.vpp_vcpu_id = i;
540 		if ((ret = ioctl(env->vmd_fd, VMM_IOC_READVMPARAMS, &vpp))) {
541 			log_warn("%s: readvmparams failed", __func__);
542 			goto err;
543 		}
544 
545 		sz = atomicio(vwrite, fd, &vpp,
546 		    sizeof(struct vm_rwvmparams_params));
547 		if (sz != sizeof(struct vm_rwvmparams_params)) {
548 			log_warn("%s: dumping vm params failed", __func__);
549 			ret = -1;
550 			goto err;
551 		}
552 	}
553 
554 	vtp.vtp_vm_id = vm->vm_params.vmc_params.vcp_id;
555 	if (ioctl(env->vmd_fd, VMM_IOC_TERM, &vtp) == -1) {
556 		log_warnx("%s: term IOC error: %d, %d", __func__,
557 		    errno, ENOENT);
558 	}
559 err:
560 	close(fd);
561 	if (ret)
562 		unpause_vm(vm);
563 	return ret;
564 }
565 
566 int
567 dump_mem(int fd, struct vmd_vm *vm)
568 {
569 	unsigned int	i;
570 	int		ret;
571 	struct		vm_mem_range *vmr;
572 
573 	for (i = 0; i < vm->vm_params.vmc_params.vcp_nmemranges; i++) {
574 		vmr = &vm->vm_params.vmc_params.vcp_memranges[i];
575 		ret = dump_vmr(fd, vmr);
576 		if (ret)
577 			return ret;
578 	}
579 	return (0);
580 }
581 
582 int
583 restore_vm_params(int fd, struct vm_create_params *vcp) {
584 	unsigned int			i;
585 	struct vm_rwvmparams_params    vpp;
586 
587 	for (i = 0; i < vcp->vcp_ncpus; i++) {
588 		if (atomicio(read, fd, &vpp, sizeof(vpp)) != sizeof(vpp)) {
589 			log_warn("%s: error restoring vm params", __func__);
590 			return (-1);
591 		}
592 		vpp.vpp_vm_id = vcp->vcp_id;
593 		vpp.vpp_vcpu_id = i;
594 		if (ioctl(env->vmd_fd, VMM_IOC_WRITEVMPARAMS, &vpp) < 0) {
595 			log_debug("%s: writing vm params failed", __func__);
596 			return (-1);
597 		}
598 	}
599 	return (0);
600 }
601 
602 void
603 restore_mem(int fd, struct vm_create_params *vcp)
604 {
605 	unsigned int	     i;
606 	struct vm_mem_range *vmr;
607 
608 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
609 		vmr = &vcp->vcp_memranges[i];
610 		restore_vmr(fd, vmr);
611 	}
612 }
613 
614 int
615 dump_vmr(int fd, struct vm_mem_range *vmr)
616 {
617 	size_t	rem = vmr->vmr_size, read = 0;
618 	char	buf[PAGE_SIZE];
619 
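	/*
	 * The loop assumes vmr_size is a multiple of PAGE_SIZE and copies
	 * one full page per iteration.
	 */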
620 	while (rem > 0) {
621 		if (read_mem(vmr->vmr_gpa + read, buf, PAGE_SIZE)) {
622 			log_warn("failed to read vmr");
623 			return (-1);
624 		}
625 		if (atomicio(vwrite, fd, buf, sizeof(buf)) != sizeof(buf)) {
626 			log_warn("failed to dump vmr");
627 			return (-1);
628 		}
629 		rem = rem - PAGE_SIZE;
630 		read = read + PAGE_SIZE;
631 	}
632 	return (0);
633 }
634 
635 void
636 restore_vmr(int fd, struct vm_mem_range *vmr)
637 {
638 	size_t	rem = vmr->vmr_size, wrote=0;
639 	size_t	rem = vmr->vmr_size, wrote = 0;
640 
641 	while (rem > 0) {
642 		if (atomicio(read, fd, buf, sizeof(buf)) != sizeof(buf))
643 			fatal("failed to restore vmr");
644 		if (write_mem(vmr->vmr_gpa + wrote, buf, PAGE_SIZE))
645 			fatal("failed to write vmr");
646 		rem = rem - PAGE_SIZE;
647 		wrote = wrote + PAGE_SIZE;
648 	}
649 }
650 
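/*
 * pause_vm
 *
 * Marks the VM as paused, wakes the vcpu threads so they notice the state
 * change, and waits on vm_pause_barrier for every vcpu thread to check in
 * before invoking the machine-dependent pause hook.
 */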
651 static void
652 pause_vm(struct vmd_vm *vm)
653 {
654 	unsigned int n;
655 	int ret;
656 
657 	mutex_lock(&vm_mtx);
658 	if (vm->vm_state & VM_STATE_PAUSED) {
659 		mutex_unlock(&vm_mtx);
660 		return;
661 	}
662 	current_vm->vm_state |= VM_STATE_PAUSED;
663 	mutex_unlock(&vm_mtx);
664 
665 	ret = pthread_barrier_init(&vm_pause_barrier, NULL,
666 	    vm->vm_params.vmc_params.vcp_ncpus + 1);
667 	if (ret) {
668 		log_warnx("%s: cannot initialize pause barrier (%d)",
669 		    __progname, ret);
670 		return;
671 	}
672 
673 	for (n = 0; n < vm->vm_params.vmc_params.vcp_ncpus; n++) {
674 		ret = pthread_cond_broadcast(&vcpu_run_cond[n]);
675 		if (ret) {
676 			log_warnx("%s: can't broadcast vcpu run cond (%d)",
677 			    __func__, (int)ret);
678 			return;
679 		}
680 	}
681 	ret = pthread_barrier_wait(&vm_pause_barrier);
682 	if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
683 		log_warnx("%s: could not wait on pause barrier (%d)",
684 		    __func__, (int)ret);
685 		return;
686 	}
687 
688 	ret = pthread_barrier_destroy(&vm_pause_barrier);
689 	if (ret) {
690 		log_warnx("%s: could not destroy pause barrier (%d)",
691 		    __progname, ret);
692 		return;
693 	}
694 
695 	pause_vm_md(vm);
696 }
697 
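/*
 * unpause_vm
 *
 * Clears the paused state, wakes every vcpu thread blocked on its unpause
 * condition variable, and invokes the machine-dependent unpause hook.
 */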
698 static void
699 unpause_vm(struct vmd_vm *vm)
700 {
701 	unsigned int n;
702 	int ret;
703 
704 	mutex_lock(&vm_mtx);
705 	if (!(vm->vm_state & VM_STATE_PAUSED)) {
706 		mutex_unlock(&vm_mtx);
707 		return;
708 	}
709 	current_vm->vm_state &= ~VM_STATE_PAUSED;
710 	mutex_unlock(&vm_mtx);
711 
712 	for (n = 0; n < vm->vm_params.vmc_params.vcp_ncpus; n++) {
713 		ret = pthread_cond_broadcast(&vcpu_unpause_cond[n]);
714 		if (ret) {
715 			log_warnx("%s: can't broadcast vcpu unpause cond (%d)",
716 			    __func__, (int)ret);
717 			return;
718 		}
719 	}
720 
721 	unpause_vm_md(vm);
722 }
723 
724 /*
725  * vcpu_reset
726  *
727  * Requests vmm(4) to reset the indicated VCPU in the indicated VM to
728  * the register state provided.
729  *
730  * Parameters
731  *  vmid: VM ID to reset
732  *  vcpu_id: VCPU ID to reset
733  *  vrs: the register state to initialize
734  *
735  * Return values:
736  *  0: success
737  *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM ID is not
738  *      valid)
739  */
740 int
741 vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
742 {
743 	struct vm_resetcpu_params vrp;
744 
745 	memset(&vrp, 0, sizeof(vrp));
746 	vrp.vrp_vm_id = vmid;
747 	vrp.vrp_vcpu_id = vcpu_id;
748 	memcpy(&vrp.vrp_init_state, vrs, sizeof(struct vcpu_reg_state));
749 
750 	log_debug("%s: resetting vcpu %d for vm %d", __func__, vcpu_id, vmid);
751 
752 	if (ioctl(env->vmd_fd, VMM_IOC_RESETCPU, &vrp) == -1)
753 		return (errno);
754 
755 	return (0);
756 }
757 
758 /*
759  * alloc_guest_mem
760  *
761  * Allocates memory for the guest.
762  * Instead of doing a single allocation with one mmap(), we allocate memory
763  * separately for every range for the following reasons:
764  * - ASLR for the individual ranges
765  * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
766  *   map the single mmap'd userspace memory to the individual guest physical
767  *   memory ranges, the underlying amap of the single mmap'd range would have
768  *   to allocate per-page reference counters. The reason is that the
769  *   individual guest physical ranges would reference the single mmap'd region
770  *   only partially. However, if every guest physical range has its own
771  *   corresponding mmap'd userspace allocation, there are no partial
772  *   references: every guest physical range fully references an mmap'd
773  *   range => no per-page reference counters have to be allocated.
774  *
775  * Return values:
776  *  0: success
777  *  !0: failure - errno indicating the source of the failure
778  */
779 int
780 alloc_guest_mem(struct vmd_vm *vm)
781 {
782 	void *p;
783 	int ret = 0;
784 	size_t i, j;
785 	struct vm_create_params *vcp = &vm->vm_params.vmc_params;
786 	struct vm_mem_range *vmr;
787 
788 	for (i = 0; i < vcp->vcp_nmemranges; i++) {
789 		vmr = &vcp->vcp_memranges[i];
790 
791 		/*
792 		 * We only need R/W in userland. vmm(4) will use R/W/X in its
793 		 * mapping.
794 		 *
795 		 * We must use MAP_SHARED so emulated devices will be able
796 		 * to generate shared mappings.
797 		 */
798 		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
799 		    MAP_ANON | MAP_CONCEAL | MAP_SHARED, -1, 0);
800 		if (p == MAP_FAILED) {
801 			ret = errno;
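			/* Unwind the ranges mapped so far before failing. */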
802 			for (j = 0; j < i; j++) {
803 				vmr = &vcp->vcp_memranges[j];
804 				munmap((void *)vmr->vmr_va, vmr->vmr_size);
805 			}
806 			return (ret);
807 		}
808 		vmr->vmr_va = (vaddr_t)p;
809 	}
810 
811 	return (ret);
812 }
813 
814 /*
815  * vmm_create_vm
816  *
817  * Requests vmm(4) to create a new VM using the supplied creation
818  * parameters. This operation results in the creation of the in-kernel
819  * structures for the VM, but does not start the VM's vcpu(s).
820  *
821  * Parameters:
822  *  vm: pointer to the vm object
823  *
824  * Return values:
825  *  0: success
826  *  !0 : ioctl to vmm(4) failed
827  */
828 static int
829 vmm_create_vm(struct vmd_vm *vm)
830 {
831 	struct vm_create_params *vcp = &vm->vm_params.vmc_params;
832 	size_t i;
833 
834 	/* Sanity check arguments */
835 	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
836 		return (EINVAL);
837 
838 	if (vcp->vcp_nmemranges == 0 ||
839 	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
840 		return (EINVAL);
841 
842 	if (vm->vm_params.vmc_ndisks > VM_MAX_DISKS_PER_VM)
843 		return (EINVAL);
844 
845 	if (vm->vm_params.vmc_nnics > VM_MAX_NICS_PER_VM)
846 		return (EINVAL);
847 
848 	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1)
849 		return (errno);
850 
851 	for (i = 0; i < vcp->vcp_ncpus; i++)
852 		vm->vm_sev_asid[i] = vcp->vcp_asid[i];
853 
854 	return (0);
855 }
856 
857 
858 /*
859  * run_vm
860  *
861  * Runs the VM whose creation parameters are specified in vcp
862  *
863  * Parameters:
864  *  child_cdrom: previously-opened child ISO disk file descriptor
865  *  child_disks: previously-opened child VM disk file descriptors
866  *  child_taps: previously-opened child tap file descriptors
867  *  vmc: vmop_create_params struct containing the VM's desired creation
868  *      configuration
869  *  vrs: VCPU register state to initialize
870  *
871  * Return values:
872  *  0: the VM exited normally
873  *  !0 : the VM exited abnormally or failed to start
874  */
875 static int
876 run_vm(struct vmop_create_params *vmc, struct vcpu_reg_state *vrs)
877 {
878 	struct vm_create_params *vcp = &vmc->vmc_params;
879 	struct vm_rwregs_params vregsp;
880 	uint8_t evdone = 0;
881 	size_t i;
882 	int ret;
883 	pthread_t *tid, evtid;
884 	char tname[MAXCOMLEN + 1];
885 	struct vm_run_params **vrp;
886 	void *exit_status;
887 
888 	if (vcp == NULL)
889 		return (EINVAL);
890 
891 	if (vcp->vcp_nmemranges == 0 ||
892 	    vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
893 		return (EINVAL);
894 
895 	tid = calloc(vcp->vcp_ncpus, sizeof(pthread_t));
896 	vrp = calloc(vcp->vcp_ncpus, sizeof(struct vm_run_params *));
897 	if (tid == NULL || vrp == NULL) {
898 		log_warn("%s: memory allocation error - exiting.",
899 		    __progname);
900 		return (ENOMEM);
901 	}
902 
903 	log_debug("%s: starting %zu vcpu thread(s) for vm %s", __func__,
904 	    vcp->vcp_ncpus, vcp->vcp_name);
905 
906 	/*
907 	 * Create and launch one thread for each VCPU. These threads may
908 	 * migrate between PCPUs over time; the need to reload CPU state
909 	 * in such situations is detected and performed by vmm(4) in the
910 	 * kernel.
911 	 */
912 	for (i = 0 ; i < vcp->vcp_ncpus; i++) {
913 		vrp[i] = malloc(sizeof(struct vm_run_params));
914 		if (vrp[i] == NULL) {
915 			log_warn("%s: memory allocation error - "
916 			    "exiting.", __progname);
917 			/* caller will exit, so skip freeing */
918 			return (ENOMEM);
919 		}
920 		vrp[i]->vrp_exit = malloc(sizeof(struct vm_exit));
921 		if (vrp[i]->vrp_exit == NULL) {
922 			log_warn("%s: memory allocation error - "
923 			    "exiting.", __progname);
924 			/* caller will exit, so skip freeing */
925 			return (ENOMEM);
926 		}
927 		vrp[i]->vrp_vm_id = vcp->vcp_id;
928 		vrp[i]->vrp_vcpu_id = i;
929 
930 		if (vcpu_reset(vcp->vcp_id, i, vrs)) {
931 			log_warnx("%s: cannot reset VCPU %zu - exiting.",
932 			    __progname, i);
933 			return (EIO);
934 		}
935 
936 		if (sev_activate(current_vm, i)) {
937 			log_warnx("%s: SEV activation failed for VCPU "
938 			    "%zu - exiting.", __progname, i);
939 			return (EIO);
940 		}
941 
942 		if (sev_encrypt_memory(current_vm)) {
943 			log_warnx("%s: memory encryption failed for VCPU "
944 			    "%zu - exiting.", __progname, i);
945 			return (EIO);
946 		}
947 
948 		/* Write the registers once more because vcpu_reset() changes them. */
949 		if (current_vm->vm_state & VM_STATE_RECEIVED) {
950 			vregsp.vrwp_vm_id = vcp->vcp_id;
951 			vregsp.vrwp_vcpu_id = i;
952 			vregsp.vrwp_regs = *vrs;
953 			vregsp.vrwp_mask = VM_RWREGS_ALL;
954 			if ((ret = ioctl(env->vmd_fd, VMM_IOC_WRITEREGS,
955 			    &vregsp)) == -1) {
956 				log_warn("%s: writeregs failed", __func__);
957 				return (ret);
958 			}
959 		}
960 
961 		ret = pthread_cond_init(&vcpu_run_cond[i], NULL);
962 		if (ret) {
963 			log_warnx("%s: cannot initialize cond var (%d)",
964 			    __progname, ret);
965 			return (ret);
966 		}
967 
968 		ret = pthread_mutex_init(&vcpu_run_mtx[i], NULL);
969 		if (ret) {
970 			log_warnx("%s: cannot initialize mtx (%d)",
971 			    __progname, ret);
972 			return (ret);
973 		}
974 
975 		ret = pthread_cond_init(&vcpu_unpause_cond[i], NULL);
976 		if (ret) {
977 			log_warnx("%s: cannot initialize unpause var (%d)",
978 			    __progname, ret);
979 			return (ret);
980 		}
981 
982 		ret = pthread_mutex_init(&vcpu_unpause_mtx[i], NULL);
983 		if (ret) {
984 			log_warnx("%s: cannot initialize unpause mtx (%d)",
985 			    __progname, ret);
986 			return (ret);
987 		}
988 
989 		vcpu_hlt[i] = 0;
990 
991 		/* Start each VCPU run thread at vcpu_run_loop */
992 		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
993 		if (ret) {
994 			/* caller will _exit after this return */
995 			errno = ret;
996 			log_warn("%s: could not create vcpu thread %zu",
997 			    __func__, i);
998 			return (ret);
999 		}
1000 
1001 		snprintf(tname, sizeof(tname), "vcpu-%zu", i);
1002 		pthread_set_name_np(tid[i], tname);
1003 	}
1004 
1005 	log_debug("%s: waiting on events for VM %s", __func__, vcp->vcp_name);
1006 	ret = pthread_create(&evtid, NULL, event_thread, &evdone);
1007 	if (ret) {
1008 		errno = ret;
1009 		log_warn("%s: could not create event thread", __func__);
1010 		return (ret);
1011 	}
1012 	pthread_set_name_np(evtid, "event");
1013 
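	/*
	 * threadmutex is already held (locked in start_vm);
	 * pthread_cond_wait() releases it while waiting for a vcpu thread
	 * or the event thread to signal threadcond.
	 */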
1014 	for (;;) {
1015 		ret = pthread_cond_wait(&threadcond, &threadmutex);
1016 		if (ret) {
1017 			log_warn("%s: waiting on thread state condition "
1018 			    "variable failed", __func__);
1019 			return (ret);
1020 		}
1021 
1022 		/*
1023 		 * Did a VCPU thread exit with an error? => return the first one
1024 		 */
1025 		mutex_lock(&vm_mtx);
1026 		for (i = 0; i < vcp->vcp_ncpus; i++) {
1027 			if (vcpu_done[i] == 0)
1028 				continue;
1029 
1030 			if (pthread_join(tid[i], &exit_status)) {
1031 				log_warn("%s: failed to join thread %zd - "
1032 				    "exiting", __progname, i);
1033 				mutex_unlock(&vm_mtx);
1034 				return (EIO);
1035 			}
1036 
1037 			ret = (intptr_t)exit_status;
1038 		}
1039 		mutex_unlock(&vm_mtx);
1040 
1041 		/* Did the event thread exit? => return with an error */
1042 		if (evdone) {
1043 			if (pthread_join(evtid, &exit_status)) {
1044 				log_warn("%s: failed to join event thread - "
1045 				    "exiting", __progname);
1046 				return (EIO);
1047 			}
1048 
1049 			log_warnx("%s: vm %d event thread exited "
1050 			    "unexpectedly", __progname, vcp->vcp_id);
1051 			return (EIO);
1052 		}
1053 
1054 		/* Did all VCPU threads exit successfully? => return */
1055 		mutex_lock(&vm_mtx);
1056 		for (i = 0; i < vcp->vcp_ncpus; i++) {
1057 			if (vcpu_done[i] == 0)
1058 				break;
1059 		}
1060 		mutex_unlock(&vm_mtx);
1061 		if (i == vcp->vcp_ncpus)
1062 			return (ret);
1063 
1064 		/* Some more threads to wait for, start over */
1065 	}
1066 
1067 	return (ret);
1068 }
1069 
1070 static void *
1071 event_thread(void *arg)
1072 {
1073 	uint8_t *donep = arg;
1074 	intptr_t ret;
1075 
1076 	ret = event_dispatch();
1077 
1078 	*donep = 1;
1079 
1080 	mutex_lock(&threadmutex);
1081 	pthread_cond_signal(&threadcond);
1082 	mutex_unlock(&threadmutex);
1083 
1084 	return (void *)ret;
1085 }
1086 
1087 /*
1088  * vcpu_run_loop
1089  *
1090  * Runs a single VCPU until vmm(4) requires help handling an exit,
1091  * or the VM terminates.
1092  *
1093  * Parameters:
1094  *  arg: vcpu_run_params for the VCPU being run by this thread
1095  *
1096  * Return values:
1097  *  NULL: the VCPU shut down properly
1098  *  !NULL: error processing VCPU run, or the VCPU shut down abnormally
1099  */
1100 static void *
1101 vcpu_run_loop(void *arg)
1102 {
1103 	struct vm_run_params *vrp = (struct vm_run_params *)arg;
1104 	intptr_t ret = 0;
1105 	uint32_t n = vrp->vrp_vcpu_id;
1106 	int paused = 0, halted = 0;
1107 
1108 	for (;;) {
1109 		ret = pthread_mutex_lock(&vcpu_run_mtx[n]);
1110 
1111 		if (ret) {
1112 			log_warnx("%s: can't lock vcpu run mtx (%d)",
1113 			    __func__, (int)ret);
1114 			return ((void *)ret);
1115 		}
1116 
1117 		mutex_lock(&vm_mtx);
1118 		paused = (current_vm->vm_state & VM_STATE_PAUSED) != 0;
1119 		halted = vcpu_hlt[n];
1120 		mutex_unlock(&vm_mtx);
1121 
1122 		/* If we are halted and need to pause, pause */
1123 		if (halted && paused) {
1124 			ret = pthread_barrier_wait(&vm_pause_barrier);
1125 			if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD) {
1126 				log_warnx("%s: could not wait on pause barrier (%d)",
1127 				    __func__, (int)ret);
1128 				return ((void *)ret);
1129 			}
1130 
1131 			ret = pthread_mutex_lock(&vcpu_unpause_mtx[n]);
1132 			if (ret) {
1133 				log_warnx("%s: can't lock vcpu unpause mtx (%d)",
1134 				    __func__, (int)ret);
1135 				return ((void *)ret);
1136 			}
1137 
1138 			/* Interrupt may be firing, release run mtx. */
1139 			mutex_unlock(&vcpu_run_mtx[n]);
1140 			ret = pthread_cond_wait(&vcpu_unpause_cond[n],
1141 			    &vcpu_unpause_mtx[n]);
1142 			if (ret) {
1143 				log_warnx(
1144 				    "%s: can't wait on unpause cond (%d)",
1145 				    __func__, (int)ret);
1146 				break;
1147 			}
1148 			mutex_lock(&vcpu_run_mtx[n]);
1149 
1150 			ret = pthread_mutex_unlock(&vcpu_unpause_mtx[n]);
1151 			if (ret) {
1152 				log_warnx("%s: can't unlock unpause mtx (%d)",
1153 				    __func__, (int)ret);
1154 				break;
1155 			}
1156 		}
1157 
1158 		/* If we are halted and not paused, wait */
1159 		if (halted) {
1160 			ret = pthread_cond_wait(&vcpu_run_cond[n],
1161 			    &vcpu_run_mtx[n]);
1162 
1163 			if (ret) {
1164 				log_warnx(
1165 				    "%s: can't wait on cond (%d)",
1166 				    __func__, (int)ret);
1167 				(void)pthread_mutex_unlock(
1168 				    &vcpu_run_mtx[n]);
1169 				break;
1170 			}
1171 		}
1172 
1173 		ret = pthread_mutex_unlock(&vcpu_run_mtx[n]);
1174 
1175 		if (ret) {
1176 			log_warnx("%s: can't unlock mutex on cond (%d)",
1177 			    __func__, (int)ret);
1178 			break;
1179 		}
1180 
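		/*
		 * If the vcpu can accept an interrupt and one is pending,
		 * ack it from the emulated interrupt controller and ask
		 * vmm(4) to inject it on this entry.
		 */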
1181 		if (vrp->vrp_irqready && intr_pending(current_vm)) {
1182 			vrp->vrp_inject.vie_vector = intr_ack(current_vm);
1183 			vrp->vrp_inject.vie_type = VCPU_INJECT_INTR;
1184 		} else
1185 			vrp->vrp_inject.vie_type = VCPU_INJECT_NONE;
1186 
1187 		/* Still more interrupts pending? */
1188 		vrp->vrp_intr_pending = intr_pending(current_vm);
1189 
1190 		if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) {
1191 			/* If run ioctl failed, exit */
1192 			ret = errno;
1193 			log_warn("%s: vm %d / vcpu %d run ioctl failed",
1194 			    __func__, current_vm->vm_vmid, n);
1195 			break;
1196 		}
1197 
1198 		/* If the VM is terminating, exit normally */
1199 		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) {
1200 			ret = (intptr_t)NULL;
1201 			break;
1202 		}
1203 
1204 		if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
1205 			/*
1206 			 * vmm(4) needs help handling an exit, handle in
1207 			 * vcpu_exit.
1208 			 */
1209 			ret = vcpu_exit(vrp);
1210 			if (ret)
1211 				break;
1212 		}
1213 	}
1214 
1215 	mutex_lock(&vm_mtx);
1216 	vcpu_done[n] = 1;
1217 	mutex_unlock(&vm_mtx);
1218 
1219 	mutex_lock(&threadmutex);
1220 	pthread_cond_signal(&threadcond);
1221 	mutex_unlock(&threadmutex);
1222 
1223 	return ((void *)ret);
1224 }
1225 
1226 int
1227 vcpu_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
1228 {
1229 	struct vm_intr_params vip;
1230 
1231 	memset(&vip, 0, sizeof(vip));
1232 
1233 	vip.vip_vm_id = vm_id;
1234 	vip.vip_vcpu_id = vcpu_id; /* XXX always 0? */
1235 	vip.vip_intr = intr;
1236 
1237 	if (ioctl(env->vmd_fd, VMM_IOC_INTR, &vip) == -1)
1238 		return (errno);
1239 
1240 	return (0);
1241 }
1242 
1243 /*
1244  * fd_hasdata
1245  *
1246  * Determines if data can be read from a file descriptor.
1247  *
1248  * Parameters:
1249  *  fd: the fd to check
1250  *
1251  * Return values:
1252  *  1 if data can be read from an fd, or 0 otherwise.
1253  */
1254 int
1255 fd_hasdata(int fd)
1256 {
1257 	struct pollfd pfd[1];
1258 	int nready, hasdata = 0;
1259 
1260 	pfd[0].fd = fd;
1261 	pfd[0].events = POLLIN;
1262 	nready = poll(pfd, 1, 0);
1263 	if (nready == -1)
1264 		log_warn("checking file descriptor for data failed");
1265 	else if (nready == 1 && pfd[0].revents & POLLIN)
1266 		hasdata = 1;
1267 	return (hasdata);
1268 }
1269 
1270 /*
1271  * mutex_lock
1272  *
1273  * Wrapper function for pthread_mutex_lock that does error checking and that
1274  * exits on failure
1275  */
1276 void
1277 mutex_lock(pthread_mutex_t *m)
1278 {
1279 	int ret;
1280 
1281 	ret = pthread_mutex_lock(m);
1282 	if (ret) {
1283 		errno = ret;
1284 		fatal("could not acquire mutex");
1285 	}
1286 }
1287 
1288 /*
1289  * mutex_unlock
1290  *
1291  * Wrapper function for pthread_mutex_unlock that does error checking and that
1292  * exits on failure
1293  */
1294 void
1295 mutex_unlock(pthread_mutex_t *m)
1296 {
1297 	int ret;
1298 
1299 	ret = pthread_mutex_unlock(m);
1300 	if (ret) {
1301 		errno = ret;
1302 		fatal("could not release mutex");
1303 	}
1304 }
1305 
1306 
1307 void
1308 vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *))
1309 {
1310 	vm_pipe_init2(p, cb, NULL);
1311 }
1312 
1313 /*
1314  * vm_pipe_init2
1315  *
1316  * Initialize a vm_dev_pipe, setting up its file descriptors and its
1317  * event structure with the given callback and argument.
1318  *
1319  * Parameters:
1320  *  p: pointer to vm_dev_pipe struct to initialize
1321  *  cb: callback to use for READ events on the read end of the pipe
1322  *  arg: pointer to pass to the callback on event trigger
1323  */
1324 void
1325 vm_pipe_init2(struct vm_dev_pipe *p, void (*cb)(int, short, void *), void *arg)
1326 {
1327 	int ret;
1328 	int fds[2];
1329 
1330 	memset(p, 0, sizeof(struct vm_dev_pipe));
1331 
1332 	ret = pipe2(fds, O_CLOEXEC);
1333 	if (ret)
1334 		fatal("failed to create vm_dev_pipe pipe");
1335 
1336 	p->read = fds[0];
1337 	p->write = fds[1];
1338 
1339 	event_set(&p->read_ev, p->read, EV_READ | EV_PERSIST, cb, arg);
1340 }
1341 
1342 /*
1343  * vm_pipe_send
1344  *
1345  * Send a message to an emulated device via the provided vm_dev_pipe. This
1346  * relies on the fact that sizeof(msg) < PIPE_BUF to ensure atomic writes.
1347  *
1348  * Parameters:
1349  *  p: pointer to initialized vm_dev_pipe
1350  *  msg: message to send in the channel
1351  */
1352 void
1353 vm_pipe_send(struct vm_dev_pipe *p, enum pipe_msg_type msg)
1354 {
1355 	size_t n;
1356 	n = write(p->write, &msg, sizeof(msg));
1357 	if (n != sizeof(msg))
1358 		fatal("failed to write to device pipe");
1359 }
1360 
1361 /*
1362  * vm_pipe_recv
1363  *
1364  * Receive a message for an emulated device via the provided vm_dev_pipe.
1365  * Returns the message value; exits fatally on read failure. This relies on
1366  * the fact that sizeof(enum pipe_msg_type) < PIPE_BUF for atomic reads.
1367  *
1368  * Parameters:
1369  *  p: pointer to initialized vm_dev_pipe
1370  *
1371  * Return values:
1372  *  a value of enum pipe_msg_type or fatal exit on read(2) error
1373  */
1374 enum pipe_msg_type
1375 vm_pipe_recv(struct vm_dev_pipe *p)
1376 {
1377 	size_t n;
1378 	enum pipe_msg_type msg;
1379 	n = read(p->read, &msg, sizeof(msg));
1380 	if (n != sizeof(msg))
1381 		fatal("failed to read from device pipe");
1382 
1383 	return msg;
1384 }
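/*
 * A minimal usage sketch (illustrative: "dev" and "dev_pipe_cb" are assumed
 * names, not defined in this file). A device initializes the pipe and adds
 * its read event; other threads post messages and the callback drains them:
 *
 *	vm_pipe_init2(&dev->pipe, dev_pipe_cb, dev);
 *	event_add(&dev->pipe.read_ev, NULL);
 *	...
 *	vm_pipe_send(&dev->pipe, msg);		(from another thread)
 *	...
 *	msg = vm_pipe_recv(&dev->pipe);		(inside dev_pipe_cb)
 */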
1385 
1386 /*
1387  * Re-map the guest address space using vmm(4)'s VMM_IOC_SHARE
1388  *
1389  * Returns 0 on success, non-zero in the event of failure.
1390  */
1391 int
1392 remap_guest_mem(struct vmd_vm *vm, int vmm_fd)
1393 {
1394 	struct vm_create_params	*vcp;
1395 	struct vm_mem_range	*vmr;
1396 	struct vm_sharemem_params vsp;
1397 	size_t			 i, j;
1398 	void			*p = NULL;
1399 	int			 ret;
1400 
1401 	if (vm == NULL)
1402 		return (1);
1403 
1404 	vcp = &vm->vm_params.vmc_params;
1405 
1406 	/*
1407 	 * Initialize our VM shared memory request using our original
1408 	 * creation parameters. We'll overwrite the va's after mmap(2).
1409 	 */
1410 	memset(&vsp, 0, sizeof(vsp));
1411 	vsp.vsp_nmemranges = vcp->vcp_nmemranges;
1412 	vsp.vsp_vm_id = vcp->vcp_id;
1413 	memcpy(&vsp.vsp_memranges, &vcp->vcp_memranges,
1414 	    sizeof(vsp.vsp_memranges));
1415 
1416 	/*
1417 	 * Use mmap(2) to identify virtual address space for our mappings.
1418 	 */
1419 	for (i = 0; i < VMM_MAX_MEM_RANGES; i++) {
1420 		if (i < vsp.vsp_nmemranges) {
1421 			vmr = &vsp.vsp_memranges[i];
1422 
1423 			/* Ignore any MMIO ranges. */
1424 			if (vmr->vmr_type == VM_MEM_MMIO) {
1425 				vmr->vmr_va = 0;
1426 				vcp->vcp_memranges[i].vmr_va = 0;
1427 				continue;
1428 			}
1429 
1430 			/* Make initial mappings for the memrange. */
1431 			p = mmap(NULL, vmr->vmr_size, PROT_READ, MAP_ANON, -1,
1432 			    0);
1433 			if (p == MAP_FAILED) {
1434 				ret = errno;
1435 				log_warn("%s: mmap", __func__);
1436 				for (j = 0; j < i; j++) {
1437 					vmr = &vcp->vcp_memranges[j];
1438 					munmap((void *)vmr->vmr_va,
1439 					    vmr->vmr_size);
1440 				}
1441 				return (ret);
1442 			}
1443 			vmr->vmr_va = (vaddr_t)p;
1444 			vcp->vcp_memranges[i].vmr_va = vmr->vmr_va;
1445 		}
1446 	}
1447 
1448 	/*
1449 	 * munmap(2) now that we have va's and ranges that don't overlap. vmm
1450 	 * will use the va's and sizes to recreate the mappings for us.
1451 	 */
1452 	for (i = 0; i < vsp.vsp_nmemranges; i++) {
1453 		vmr = &vsp.vsp_memranges[i];
1454 		if (vmr->vmr_type == VM_MEM_MMIO)
1455 			continue;
1456 		if (munmap((void*)vmr->vmr_va, vmr->vmr_size) == -1)
1457 			fatal("%s: munmap", __func__);
1458 	}
1459 
1460 	/*
1461 	 * Ask vmm to enter the shared mappings for us. They'll point
1462 	 * to the same host physical memory, but will have a randomized
1463 	 * virtual address for the calling process.
1464 	 */
1465 	if (ioctl(vmm_fd, VMM_IOC_SHAREMEM, &vsp) == -1)
1466 		return (errno);
1467 
1468 	return (0);
1469 }
1470 
1471 void
1472 vcpu_halt(uint32_t vcpu_id)
1473 {
1474 	mutex_lock(&vm_mtx);
1475 	vcpu_hlt[vcpu_id] = 1;
1476 	mutex_unlock(&vm_mtx);
1477 }
1478 
1479 void
vcpu_unhalt(uint32_t vcpu_id)1480 vcpu_unhalt(uint32_t vcpu_id)
1481 	{
1482 	mutex_lock(&vm_mtx);
1483 	vcpu_hlt[vcpu_id] = 0;
1484 	mutex_unlock(&vm_mtx);
1485 }
1486 
1487 void
1488 vcpu_signal_run(uint32_t vcpu_id)
1489 {
1490 	int ret;
1491 
1492 	mutex_lock(&vcpu_run_mtx[vcpu_id]);
1493 	ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
1494 	if (ret)
1495 		fatalx("%s: can't signal (%d)", __func__, ret);
1496 	mutex_unlock(&vcpu_run_mtx[vcpu_id]);
1497 }
1498