xref: /openbsd/usr.sbin/vmd/vmm.c (revision 3cab2bb3)
1 /*	$OpenBSD: vmm.c,v 1.96 2020/04/30 03:50:53 pd Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/param.h>	/* nitems */
20 #include <sys/ioctl.h>
21 #include <sys/queue.h>
22 #include <sys/wait.h>
23 #include <sys/uio.h>
24 #include <sys/socket.h>
25 #include <sys/time.h>
26 #include <sys/mman.h>
27 
28 #include <dev/ic/i8253reg.h>
29 #include <dev/isa/isareg.h>
30 #include <dev/pci/pcireg.h>
31 
32 #include <machine/param.h>
33 #include <machine/psl.h>
34 #include <machine/specialreg.h>
35 #include <machine/vmmvar.h>
36 
37 #include <net/if.h>
38 
39 #include <errno.h>
40 #include <event.h>
41 #include <fcntl.h>
42 #include <imsg.h>
43 #include <limits.h>
44 #include <poll.h>
45 #include <pthread.h>
46 #include <stddef.h>
47 #include <stdio.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include <unistd.h>
51 #include <util.h>
52 
53 #include "vmd.h"
54 #include "vmm.h"
55 
56 void vmm_sighdlr(int, short, void *);
57 int vmm_start_vm(struct imsg *, uint32_t *, pid_t *);
58 int vmm_dispatch_parent(int, struct privsep_proc *, struct imsg *);
59 void vmm_run(struct privsep *, struct privsep_proc *, void *);
60 void vmm_dispatch_vm(int, short, void *);
61 int terminate_vm(struct vm_terminate_params *);
62 int get_info_vm(struct privsep *, struct imsg *, int);
63 int opentap(char *);
64 
65 extern struct vmd *env;
66 
/* Messages from the parent (vmd) process are handled by vmm_dispatch_parent. */
static struct privsep_proc procs[] = {
	{ "parent",	PROC_PARENT,	vmm_dispatch_parent  },
};
70 
/*
 * vmm
 *
 * Entry point of the vmm process: hands control to the privsep proc
 * runner with this process's message-handler table and run callback.
 */
void
vmm(struct privsep *ps, struct privsep_proc *p)
{
	proc_run(ps, p, procs, nitems(procs), vmm_run, NULL);
}
76 
/*
 * vmm_run
 *
 * Post-fork setup of the vmm process: initialize the config, take over
 * SIGCHLD handling (to reap exited VM children), drop privileges with
 * pledge and terminate any VMs left over in vmm(4) from a previous run.
 */
void
vmm_run(struct privsep *ps, struct privsep_proc *p, void *arg)
{
	if (config_init(ps->ps_env) == -1)
		fatal("failed to initialize configuration");

	/* Replace the generic privsep SIGCHLD handler with our own. */
	signal_del(&ps->ps_evsigchld);
	signal_set(&ps->ps_evsigchld, SIGCHLD, vmm_sighdlr, ps);
	signal_add(&ps->ps_evsigchld, NULL);

	/*
	 * pledge in the vmm process:
	 * stdio - for malloc and basic I/O including events.
	 * vmm - for the vmm ioctls and operations.
	 * proc - for forking and maintaining vms.
	 * sendfd - for sending send/recv fds to vm proc.
	 * recvfd - for disks, interfaces and other fds.
	 */
	if (pledge("stdio vmm sendfd recvfd proc", NULL) == -1)
		fatal("pledge");

	/* Get and terminate all running VMs */
	get_info_vm(ps, NULL, 1);
}
101 
102 int
103 vmm_dispatch_parent(int fd, struct privsep_proc *p, struct imsg *imsg)
104 {
105 	struct privsep		*ps = p->p_ps;
106 	int			 res = 0, cmd = 0, verbose, ret;
107 	struct vmd_vm		*vm = NULL;
108 	struct vm_terminate_params vtp;
109 	struct vmop_id		 vid;
110 	struct vmop_result	 vmr;
111 	struct vmop_create_params vmc;
112 	uint32_t		 id = 0, peerid = imsg->hdr.peerid;
113 	pid_t			 pid = 0;
114 	unsigned int		 mode, flags;
115 
116 	switch (imsg->hdr.type) {
117 	case IMSG_VMDOP_START_VM_REQUEST:
118 		res = config_getvm(ps, imsg);
119 		if (res == -1) {
120 			res = errno;
121 			cmd = IMSG_VMDOP_START_VM_RESPONSE;
122 		}
123 		break;
124 	case IMSG_VMDOP_START_VM_CDROM:
125 		res = config_getcdrom(ps, imsg);
126 		if (res == -1) {
127 			res = errno;
128 			cmd = IMSG_VMDOP_START_VM_RESPONSE;
129 		}
130 		break;
131 	case IMSG_VMDOP_START_VM_DISK:
132 		res = config_getdisk(ps, imsg);
133 		if (res == -1) {
134 			res = errno;
135 			cmd = IMSG_VMDOP_START_VM_RESPONSE;
136 		}
137 		break;
138 	case IMSG_VMDOP_START_VM_IF:
139 		res = config_getif(ps, imsg);
140 		if (res == -1) {
141 			res = errno;
142 			cmd = IMSG_VMDOP_START_VM_RESPONSE;
143 		}
144 		break;
145 	case IMSG_VMDOP_START_VM_END:
146 		res = vmm_start_vm(imsg, &id, &pid);
147 		/* Check if the ID can be mapped correctly */
148 		if ((id = vm_id2vmid(id, NULL)) == 0)
149 			res = ENOENT;
150 		cmd = IMSG_VMDOP_START_VM_RESPONSE;
151 		break;
152 	case IMSG_VMDOP_WAIT_VM_REQUEST:
153 		IMSG_SIZE_CHECK(imsg, &vid);
154 		memcpy(&vid, imsg->data, sizeof(vid));
155 		id = vid.vid_id;
156 
157 		DPRINTF("%s: recv'ed WAIT_VM for %d", __func__, id);
158 
159 		cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
160 		if (id == 0) {
161 			res = ENOENT;
162 		} else if ((vm = vm_getbyvmid(id)) != NULL) {
163 			if (vm->vm_peerid != (uint32_t)-1) {
164 				peerid = vm->vm_peerid;
165 				res = EINTR;
166 			} else
167 				cmd = 0;
168 			vm->vm_peerid = imsg->hdr.peerid;
169 		} else {
170 			/* vm doesn't exist, cannot stop vm */
171 			log_debug("%s: cannot stop vm that is not running",
172 			    __func__);
173 			res = VMD_VM_STOP_INVALID;
174 		}
175 		break;
176 	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
177 		IMSG_SIZE_CHECK(imsg, &vid);
178 		memcpy(&vid, imsg->data, sizeof(vid));
179 		id = vid.vid_id;
180 		flags = vid.vid_flags;
181 
182 		DPRINTF("%s: recv'ed TERMINATE_VM for %d", __func__, id);
183 
184 		cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
185 
186 		if (id == 0) {
187 			res = ENOENT;
188 		} else if ((vm = vm_getbyvmid(id)) != NULL) {
189 			if (flags & VMOP_FORCE) {
190 				vtp.vtp_vm_id = vm_vmid2id(vm->vm_vmid, vm);
191 				vm->vm_state |= VM_STATE_SHUTDOWN;
192 				(void)terminate_vm(&vtp);
193 				res = 0;
194 			} else if (!(vm->vm_state & VM_STATE_SHUTDOWN)) {
195 				log_debug("%s: sending shutdown request"
196 				    " to vm %d", __func__, id);
197 
198 				/*
199 				 * Request reboot but mark the VM as shutting
200 				 * down. This way we can terminate the VM after
201 				 * the triple fault instead of reboot and
202 				 * avoid being stuck in the ACPI-less powerdown
203 				 * ("press any key to reboot") of the VM.
204 				 */
205 				vm->vm_state |= VM_STATE_SHUTDOWN;
206 				if (imsg_compose_event(&vm->vm_iev,
207 				    IMSG_VMDOP_VM_REBOOT,
208 				    0, 0, -1, NULL, 0) == -1)
209 					res = errno;
210 				else
211 					res = 0;
212 			} else {
213 				/*
214 				 * VM is currently being shutdown.
215 				 * Check to see if the VM process is still
216 				 * active.  If not, return VMD_VM_STOP_INVALID.
217 				 */
218 				if (vm_vmid2id(vm->vm_vmid, vm) == 0) {
219 					log_debug("%s: no vm running anymore",
220 					    __func__);
221 					res = VMD_VM_STOP_INVALID;
222 				}
223 			}
224 			if ((flags & VMOP_WAIT) &&
225 			    res == 0 && (vm->vm_state & VM_STATE_SHUTDOWN)) {
226 				if (vm->vm_peerid != (uint32_t)-1) {
227 					peerid = vm->vm_peerid;
228 					res = EINTR;
229 				} else
230 					cmd = 0;
231 				vm->vm_peerid = imsg->hdr.peerid;
232 			}
233 		} else {
234 			/* vm doesn't exist, cannot stop vm */
235 			log_debug("%s: cannot stop vm that is not running",
236 			    __func__);
237 			res = VMD_VM_STOP_INVALID;
238 		}
239 		break;
240 	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
241 		res = get_info_vm(ps, imsg, 0);
242 		cmd = IMSG_VMDOP_GET_INFO_VM_END_DATA;
243 		break;
244 	case IMSG_VMDOP_CONFIG:
245 		config_getconfig(env, imsg);
246 		break;
247 	case IMSG_CTL_RESET:
248 		IMSG_SIZE_CHECK(imsg, &mode);
249 		memcpy(&mode, imsg->data, sizeof(mode));
250 
251 		if (mode & CONFIG_VMS) {
252 			/* Terminate and remove all VMs */
253 			vmm_shutdown();
254 			mode &= ~CONFIG_VMS;
255 		}
256 
257 		config_getreset(env, imsg);
258 		break;
259 	case IMSG_CTL_VERBOSE:
260 		IMSG_SIZE_CHECK(imsg, &verbose);
261 		memcpy(&verbose, imsg->data, sizeof(verbose));
262 		log_setverbose(verbose);
263 
264 		/* Forward message to each VM process */
265 		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
266 			imsg_compose_event(&vm->vm_iev,
267 			    imsg->hdr.type, imsg->hdr.peerid, imsg->hdr.pid,
268 			    -1, &verbose, sizeof(verbose));
269 		}
270 		break;
271 	case IMSG_VMDOP_PAUSE_VM:
272 		IMSG_SIZE_CHECK(imsg, &vid);
273 		memcpy(&vid, imsg->data, sizeof(vid));
274 		id = vid.vid_id;
275 		vm = vm_getbyvmid(id);
276 		if ((vm = vm_getbyvmid(id)) == NULL) {
277 			res = ENOENT;
278 			cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
279 			break;
280 		}
281 		imsg_compose_event(&vm->vm_iev,
282 		    imsg->hdr.type, imsg->hdr.peerid, imsg->hdr.pid,
283 		    imsg->fd, &vid, sizeof(vid));
284 		break;
285 	case IMSG_VMDOP_UNPAUSE_VM:
286 		IMSG_SIZE_CHECK(imsg, &vid);
287 		memcpy(&vid, imsg->data, sizeof(vid));
288 		id = vid.vid_id;
289 		if ((vm = vm_getbyvmid(id)) == NULL) {
290 			res = ENOENT;
291 			cmd = IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
292 			break;
293 		}
294 		imsg_compose_event(&vm->vm_iev,
295 		    imsg->hdr.type, imsg->hdr.peerid, imsg->hdr.pid,
296 		    imsg->fd, &vid, sizeof(vid));
297 		break;
298 	case IMSG_VMDOP_SEND_VM_REQUEST:
299 		IMSG_SIZE_CHECK(imsg, &vid);
300 		memcpy(&vid, imsg->data, sizeof(vid));
301 		id = vid.vid_id;
302 		if ((vm = vm_getbyvmid(id)) == NULL) {
303 			res = ENOENT;
304 			close(imsg->fd);
305 			cmd = IMSG_VMDOP_START_VM_RESPONSE;
306 			break;
307 		}
308 		imsg_compose_event(&vm->vm_iev,
309 		    imsg->hdr.type, imsg->hdr.peerid, imsg->hdr.pid,
310 		    imsg->fd, &vid, sizeof(vid));
311 		break;
312 	case IMSG_VMDOP_RECEIVE_VM_REQUEST:
313 		IMSG_SIZE_CHECK(imsg, &vmc);
314 		memcpy(&vmc, imsg->data, sizeof(vmc));
315 		ret = vm_register(ps, &vmc, &vm,
316 		    imsg->hdr.peerid, vmc.vmc_owner.uid);
317 		vm->vm_tty = imsg->fd;
318 		vm->vm_state |= VM_STATE_RECEIVED;
319 		vm->vm_state |= VM_STATE_PAUSED;
320 		break;
321 	case IMSG_VMDOP_RECEIVE_VM_END:
322 		if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
323 			res = ENOENT;
324 			close(imsg->fd);
325 			cmd = IMSG_VMDOP_START_VM_RESPONSE;
326 			break;
327 		}
328 		vm->vm_receive_fd = imsg->fd;
329 		res = vmm_start_vm(imsg, &id, &pid);
330 		/* Check if the ID can be mapped correctly */
331 		if ((id = vm_id2vmid(id, NULL)) == 0)
332 			res = ENOENT;
333 		cmd = IMSG_VMDOP_START_VM_RESPONSE;
334 		break;
335 	default:
336 		return (-1);
337 	}
338 
339 	switch (cmd) {
340 	case 0:
341 		break;
342 	case IMSG_VMDOP_START_VM_RESPONSE:
343 		if (res != 0) {
344 			/* Remove local reference if it exists */
345 			if ((vm = vm_getbyvmid(imsg->hdr.peerid)) != NULL) {
346 				log_debug("%s: removing vm, START_VM_RESPONSE",
347 				    __func__);
348 				vm_remove(vm, __func__);
349 			}
350 		}
351 		if (id == 0)
352 			id = imsg->hdr.peerid;
353 	case IMSG_VMDOP_PAUSE_VM_RESPONSE:
354 	case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
355 	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
356 		memset(&vmr, 0, sizeof(vmr));
357 		vmr.vmr_result = res;
358 		vmr.vmr_id = id;
359 		vmr.vmr_pid = pid;
360 		if (proc_compose_imsg(ps, PROC_PARENT, -1, cmd,
361 		    peerid, -1, &vmr, sizeof(vmr)) == -1)
362 			return (-1);
363 		break;
364 	default:
365 		if (proc_compose_imsg(ps, PROC_PARENT, -1, cmd,
366 		    peerid, -1, &res, sizeof(res)) == -1)
367 			return (-1);
368 		break;
369 	}
370 
371 	return (0);
372 }
373 
374 void
375 vmm_sighdlr(int sig, short event, void *arg)
376 {
377 	struct privsep *ps = arg;
378 	int status, ret = 0;
379 	uint32_t vmid;
380 	pid_t pid;
381 	struct vmop_result vmr;
382 	struct vmd_vm *vm;
383 	struct vm_terminate_params vtp;
384 
385 	log_debug("%s: handling signal %d", __func__, sig);
386 	switch (sig) {
387 	case SIGCHLD:
388 		do {
389 			pid = waitpid(-1, &status, WNOHANG);
390 			if (pid <= 0)
391 				continue;
392 
393 			if (WIFEXITED(status) || WIFSIGNALED(status)) {
394 				vm = vm_getbypid(pid);
395 				if (vm == NULL) {
396 					/*
397 					 * If the VM is gone already, it
398 					 * got terminated via a
399 					 * IMSG_VMDOP_TERMINATE_VM_REQUEST.
400 					 */
401 					continue;
402 				}
403 
404 				if (WIFEXITED(status))
405 					ret = WEXITSTATUS(status);
406 
407 				/* don't reboot on pending shutdown */
408 				if (ret == EAGAIN && (vm->vm_state & VM_STATE_SHUTDOWN))
409 					ret = 0;
410 
411 				vmid = vm->vm_params.vmc_params.vcp_id;
412 				vtp.vtp_vm_id = vmid;
413 
414 				if (terminate_vm(&vtp) == 0)
415 					log_debug("%s: terminated vm %s"
416 					    " (id %d)", __func__,
417 					    vm->vm_params.vmc_params.vcp_name,
418 					    vm->vm_vmid);
419 
420 				memset(&vmr, 0, sizeof(vmr));
421 				vmr.vmr_result = ret;
422 				vmr.vmr_id = vm_id2vmid(vmid, vm);
423 				if (proc_compose_imsg(ps, PROC_PARENT,
424 				    -1, IMSG_VMDOP_TERMINATE_VM_EVENT,
425 				    vm->vm_peerid, -1,
426 				    &vmr, sizeof(vmr)) == -1)
427 					log_warnx("could not signal "
428 					    "termination of VM %u to "
429 					    "parent", vm->vm_vmid);
430 
431 				vm_remove(vm, __func__);
432 			} else
433 				fatalx("unexpected cause of SIGCHLD");
434 		} while (pid > 0 || (pid == -1 && errno == EINTR));
435 		break;
436 	default:
437 		fatalx("unexpected signal");
438 	}
439 }
440 
441 /*
442  * vmm_shutdown
443  *
444  * Terminate VMs on shutdown to avoid "zombie VM" processes.
445  */
446 void
447 vmm_shutdown(void)
448 {
449 	struct vm_terminate_params vtp;
450 	struct vmd_vm *vm, *vm_next;
451 
452 	TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) {
453 		vtp.vtp_vm_id = vm_vmid2id(vm->vm_vmid, vm);
454 
455 		/* XXX suspend or request graceful shutdown */
456 		(void)terminate_vm(&vtp);
457 		vm_remove(vm, __func__);
458 	}
459 }
460 
461 /*
462  * vmm_pipe
463  *
464  * Create a new imsg control channel between vmm parent and a VM
465  * (can be called on both sides).
466  */
467 int
468 vmm_pipe(struct vmd_vm *vm, int fd, void (*cb)(int, short, void *))
469 {
470 	struct imsgev	*iev = &vm->vm_iev;
471 
472 	if (fcntl(fd, F_SETFL, O_NONBLOCK) == -1) {
473 		log_warn("failed to set nonblocking mode on vm pipe");
474 		return (-1);
475 	}
476 
477 	imsg_init(&iev->ibuf, fd);
478 	iev->handler = cb;
479 	iev->data = vm;
480 	imsg_event_add(iev);
481 
482 	return (0);
483 }
484 
485 /*
486  * vmm_dispatch_vm
487  *
488  * imsg callback for messages that are received from a VM child process.
489  */
490 void
491 vmm_dispatch_vm(int fd, short event, void *arg)
492 {
493 	struct vmd_vm		*vm = arg;
494 	struct vmop_result	 vmr;
495 	struct imsgev		*iev = &vm->vm_iev;
496 	struct imsgbuf		*ibuf = &iev->ibuf;
497 	struct imsg		 imsg;
498 	ssize_t			 n;
499 	unsigned int		 i;
500 
501 	if (event & EV_READ) {
502 		if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN)
503 			fatal("%s: imsg_read", __func__);
504 		if (n == 0) {
505 			/* this pipe is dead, so remove the event handler */
506 			event_del(&iev->ev);
507 			return;
508 		}
509 	}
510 
511 	if (event & EV_WRITE) {
512 		if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN)
513 			fatal("%s: msgbuf_write fd %d", __func__, ibuf->fd);
514 		if (n == 0) {
515 			/* this pipe is dead, so remove the event handler */
516 			event_del(&iev->ev);
517 			return;
518 		}
519 	}
520 
521 	for (;;) {
522 		if ((n = imsg_get(ibuf, &imsg)) == -1)
523 			fatal("%s: imsg_get", __func__);
524 		if (n == 0)
525 			break;
526 
527 		DPRINTF("%s: got imsg %d from %s",
528 		    __func__, imsg.hdr.type,
529 		    vm->vm_params.vmc_params.vcp_name);
530 
531 		switch (imsg.hdr.type) {
532 		case IMSG_VMDOP_VM_SHUTDOWN:
533 			vm->vm_state |= VM_STATE_SHUTDOWN;
534 			break;
535 		case IMSG_VMDOP_VM_REBOOT:
536 			vm->vm_state &= ~VM_STATE_SHUTDOWN;
537 			break;
538 		case IMSG_VMDOP_SEND_VM_RESPONSE:
539 			IMSG_SIZE_CHECK(&imsg, &vmr);
540 		case IMSG_VMDOP_PAUSE_VM_RESPONSE:
541 		case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
542 			for (i = 0; i < sizeof(procs); i++) {
543 				if (procs[i].p_id == PROC_PARENT) {
544 					proc_forward_imsg(procs[i].p_ps,
545 					    &imsg, PROC_PARENT, -1);
546 					break;
547 				}
548 			}
549 			break;
550 
551 		default:
552 			fatalx("%s: got invalid imsg %d from %s",
553 			    __func__, imsg.hdr.type,
554 			    vm->vm_params.vmc_params.vcp_name);
555 		}
556 		imsg_free(&imsg);
557 	}
558 	imsg_event_add(iev);
559 }
560 
561 /*
562  * terminate_vm
563  *
564  * Requests vmm(4) to terminate the VM whose ID is provided in the
565  * supplied vm_terminate_params structure (vtp->vtp_vm_id)
566  *
567  * Parameters
568  *  vtp: vm_terminate_params struct containing the ID of the VM to terminate
569  *
570  * Return values:
571  *  0: success
572  *  !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM is not
573  *      valid)
574  */
575 int
576 terminate_vm(struct vm_terminate_params *vtp)
577 {
578 	if (ioctl(env->vmd_fd, VMM_IOC_TERM, vtp) == -1)
579 		return (errno);
580 
581 	return (0);
582 }
583 
584 /*
585  * opentap
586  *
587  * Opens the next available tap device, up to MAX_TAP.
588  *
589  * Parameters
590  *  ifname: a buffer of at least IF_NAMESIZE bytes.
591  *
592  * Returns a file descriptor to the tap node opened, or -1 if no tap
593  * devices were available.
594  */
595 int
596 opentap(char *ifname)
597 {
598 	int i, fd;
599 	char path[PATH_MAX];
600 
601 	for (i = 0; i < MAX_TAP; i++) {
602 		snprintf(path, PATH_MAX, "/dev/tap%d", i);
603 		fd = open(path, O_RDWR | O_NONBLOCK);
604 		if (fd != -1) {
605 			snprintf(ifname, IF_NAMESIZE, "tap%d", i);
606 			return (fd);
607 		}
608 	}
609 	strlcpy(ifname, "tap", IF_NAMESIZE);
610 
611 	return (-1);
612 }
613 
614 /*
615  * vmm_start_vm
616  *
617  * Prepares and forks a new VM process.
618  *
619  * Parameters:
620  *  imsg: The VM data structure that is including the VM create parameters.
621  *  id: Returns the VM id as reported by the kernel and obtained from the VM.
622  *  pid: Returns the VM pid to the parent.
623  *
624  * Return values:
625  *  0: success
626  *  !0 : failure - typically an errno indicating the source of the failure
627  */
628 int
629 vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
630 {
631 	struct vm_create_params	*vcp;
632 	struct vmd_vm		*vm;
633 	int			 ret = EINVAL;
634 	int			 fds[2];
635 	size_t			 i, j;
636 
637 	if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
638 		log_warnx("%s: can't find vm", __func__);
639 		ret = ENOENT;
640 		goto err;
641 	}
642 	vcp = &vm->vm_params.vmc_params;
643 
644 	if (!(vm->vm_state & VM_STATE_RECEIVED)) {
645 		if ((vm->vm_tty = imsg->fd) == -1) {
646 			log_warnx("%s: can't get tty", __func__);
647 			goto err;
648 		}
649 	}
650 
651 	if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, fds) == -1)
652 		fatal("socketpair");
653 
654 	/* Start child vmd for this VM (fork, chroot, drop privs) */
655 	ret = fork();
656 
657 	/* Start child failed? - cleanup and leave */
658 	if (ret == -1) {
659 		log_warnx("%s: start child failed", __func__);
660 		ret = EIO;
661 		goto err;
662 	}
663 
664 	if (ret > 0) {
665 		/* Parent */
666 		vm->vm_pid = ret;
667 		close(fds[1]);
668 
669 		for (i = 0 ; i < vcp->vcp_ndisks; i++) {
670 			for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
671 				if (vm->vm_disks[i][j] != -1)
672 					close(vm->vm_disks[i][j]);
673 				vm->vm_disks[i][j] = -1;
674 			}
675 		}
676 		for (i = 0 ; i < vcp->vcp_nnics; i++) {
677 			close(vm->vm_ifs[i].vif_fd);
678 			vm->vm_ifs[i].vif_fd = -1;
679 		}
680 		if (vm->vm_kernel != -1) {
681 			close(vm->vm_kernel);
682 			vm->vm_kernel = -1;
683 		}
684 		if (vm->vm_cdrom != -1) {
685 			close(vm->vm_cdrom);
686 			vm->vm_cdrom = -1;
687 		}
688 		if (vm->vm_tty != -1) {
689 			close(vm->vm_tty);
690 			vm->vm_tty = -1;
691 		}
692 
693 		/* read back the kernel-generated vm id from the child */
694 		if (read(fds[0], &vcp->vcp_id, sizeof(vcp->vcp_id)) !=
695 		    sizeof(vcp->vcp_id))
696 			fatal("read vcp id");
697 
698 		if (vcp->vcp_id == 0)
699 			goto err;
700 
701 		*id = vcp->vcp_id;
702 		*pid = vm->vm_pid;
703 
704 		if (vmm_pipe(vm, fds[0], vmm_dispatch_vm) == -1)
705 			fatal("setup vm pipe");
706 
707 		return (0);
708 	} else {
709 		/* Child */
710 		close(fds[0]);
711 		close(PROC_PARENT_SOCK_FILENO);
712 
713 		ret = start_vm(vm, fds[1]);
714 
715 		_exit(ret);
716 	}
717 
718 	return (0);
719 
720  err:
721 	vm_remove(vm, __func__);
722 
723 	return (ret);
724 }
725 
726 /*
727  * get_info_vm
728  *
729  * Returns a list of VMs known to vmm(4).
730  *
731  * Parameters:
732  *  ps: the privsep context.
733  *  imsg: the received imsg including the peer id.
734  *  terminate: terminate the listed vm.
735  *
736  * Return values:
737  *  0: success
738  *  !0 : failure (eg, ENOMEM, EIO or another error code from vmm(4) ioctl)
739  */
740 int
741 get_info_vm(struct privsep *ps, struct imsg *imsg, int terminate)
742 {
743 	int ret;
744 	size_t ct, i;
745 	struct vm_info_params vip;
746 	struct vm_info_result *info;
747 	struct vm_terminate_params vtp;
748 	struct vmop_info_result vir;
749 
750 	/*
751 	 * We issue the VMM_IOC_INFO ioctl twice, once with an input
752 	 * buffer size of 0, which results in vmm(4) returning the
753 	 * number of bytes required back to us in vip.vip_size,
754 	 * and then we call it again after malloc'ing the required
755 	 * number of bytes.
756 	 *
757 	 * It is possible that we could fail a second time (eg, if
758 	 * another VM was created in the instant between the two
759 	 * ioctls, but in that case the caller can just try again
760 	 * as vmm(4) will return a zero-sized list in that case.
761 	 */
762 	vip.vip_size = 0;
763 	info = NULL;
764 	ret = 0;
765 	memset(&vir, 0, sizeof(vir));
766 
767 	/* First ioctl to see how many bytes needed (vip.vip_size) */
768 	if (ioctl(env->vmd_fd, VMM_IOC_INFO, &vip) == -1)
769 		return (errno);
770 
771 	if (vip.vip_info_ct != 0)
772 		return (EIO);
773 
774 	info = malloc(vip.vip_size);
775 	if (info == NULL)
776 		return (ENOMEM);
777 
778 	/* Second ioctl to get the actual list */
779 	vip.vip_info = info;
780 	if (ioctl(env->vmd_fd, VMM_IOC_INFO, &vip) == -1) {
781 		ret = errno;
782 		free(info);
783 		return (ret);
784 	}
785 
786 	/* Return info */
787 	ct = vip.vip_size / sizeof(struct vm_info_result);
788 	for (i = 0; i < ct; i++) {
789 		if (terminate) {
790 			vtp.vtp_vm_id = info[i].vir_id;
791 			if ((ret = terminate_vm(&vtp)) != 0)
792 				return (ret);
793 			log_debug("%s: terminated vm %s (id %d)", __func__,
794 			    info[i].vir_name, info[i].vir_id);
795 			continue;
796 		}
797 		memcpy(&vir.vir_info, &info[i], sizeof(vir.vir_info));
798 		vir.vir_info.vir_id = vm_id2vmid(info[i].vir_id, NULL);
799 		if (proc_compose_imsg(ps, PROC_PARENT, -1,
800 		    IMSG_VMDOP_GET_INFO_VM_DATA, imsg->hdr.peerid, -1,
801 		    &vir, sizeof(vir)) == -1)
802 			return (EIO);
803 	}
804 	free(info);
805 	return (0);
806 }
807