/*	$OpenBSD: vmd.c,v 1.153 2024/01/18 14:49:59 claudio Exp $	*/

/*
 * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/queue.h>
#include <sys/wait.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/tty.h>
#include <sys/ttycom.h>
#include <sys/ioctl.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <termios.h>
#include <errno.h>
#include <event.h>
#include <fcntl.h>
#include <pwd.h>
#include <signal.h>
#include <syslog.h>
#include <unistd.h>
#include <util.h>
#include <ctype.h>
#include <grp.h>

#include <machine/specialreg.h>
#include <machine/vmmvar.h>

#include "proc.h"
#include "atomicio.h"
#include "vmd.h"

__dead void usage(void);

int	 main(int, char **);
int	 vmd_configure(void);
void	 vmd_sighdlr(int sig, short event, void *arg);
void	 vmd_shutdown(void);
int	 vmd_control_run(void);
int	 vmd_dispatch_control(int, struct privsep_proc *, struct imsg *);
int	 vmd_dispatch_vmm(int, struct privsep_proc *, struct imsg *);
int	 vmd_dispatch_agentx(int, struct privsep_proc *, struct imsg *);
int	 vmd_dispatch_priv(int, struct privsep_proc *, struct imsg *);
int	 vmd_check_vmh(struct vm_dump_header *);

int	 vm_instance(struct privsep *, struct vmd_vm **,
	    struct vmop_create_params *, uid_t);
int	 vm_checkinsflag(struct vmop_create_params *, unsigned int, uid_t);
int	 vm_claimid(const char *, int, uint32_t *);
void	 start_vm_batch(int, short, void *);

static inline void vm_terminate(struct vmd_vm *, const char *);

struct vmd	*env;

static struct privsep_proc procs[] = {
	/* Keep "priv" on top as procs[0] */
	{ "priv",	PROC_PRIV,	vmd_dispatch_priv, priv },
	{ "control",	PROC_CONTROL,	vmd_dispatch_control, control },
	{ "vmm",	PROC_VMM,	vmd_dispatch_vmm, vmm,
	  vmm_shutdown, "/" },
	{ "agentx",	PROC_AGENTX,	vmd_dispatch_agentx, vm_agentx,
	  vm_agentx_shutdown, "/" }
};

enum privsep_procid privsep_process;

struct event staggered_start_timer;

/* For the privileged process */
static struct privsep_proc *proc_priv = &procs[0];
static struct passwd proc_privpw;
static const uint8_t zero_mac[ETHER_ADDR_LEN];

const char		 default_conffile[] = VMD_CONF;
const char		*conffile = default_conffile;

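/*
 * Handle imsg requests relayed by the control process on behalf of
 * vmctl(8).  Each request is either answered directly or forwarded to
 * the vmm process; `cmd' selects the response composed at the end.
 */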
int
vmd_dispatch_control(int fd, struct privsep_proc *p, struct imsg *imsg)
{
	struct privsep			*ps = p->p_ps;
	int				 res = 0, ret = 0, cmd = 0, verbose;
	int				 ifd;
	unsigned int			 v = 0, flags;
	struct vmop_create_params	 vmc;
	struct vmop_id			 vid;
	struct vmop_result		 vmr;
	struct vm_dump_header		 vmh;
	struct vmd_vm			*vm = NULL;
	char				*str = NULL;
	uint32_t			 id = 0;
	struct control_sock		*rcs;

	switch (imsg->hdr.type) {
	case IMSG_VMDOP_START_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vmc);
		memcpy(&vmc, imsg->data, sizeof(vmc));
		vmc.vmc_kernel = imsg_get_fd(imsg);

		/* Try registering our VM in our list of known VMs. */
		if (vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid)) {
			res = errno;

			/* Did we have a failure during lookup of a parent? */
			if (vm == NULL) {
				cmd = IMSG_VMDOP_START_VM_RESPONSE;
				break;
			}

			/* Does the VM already exist? */
			if (res == EALREADY) {
				/* Is it already running? */
				if (vm->vm_state & VM_STATE_RUNNING) {
					cmd = IMSG_VMDOP_START_VM_RESPONSE;
					break;
				}

				/* If not running, are our flags ok? */
				if (vmc.vmc_flags &&
				    vmc.vmc_flags != VMOP_CREATE_KERNEL) {
					cmd = IMSG_VMDOP_START_VM_RESPONSE;
					break;
				}
			}
			res = 0;
		}

		/* Try to start the launch of the VM. */
		res = config_setvm(ps, vm, imsg->hdr.peerid,
		    vm->vm_params.vmc_owner.uid);
		if (res)
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		break;
	case IMSG_VMDOP_WAIT_VM_REQUEST:
	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		flags = vid.vid_flags;
		cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;

		if ((id = vid.vid_id) == 0) {
			/* Lookup vm (id) by name */
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				break;
			}
			id = vm->vm_vmid;
		} else if ((vm = vm_getbyvmid(id)) == NULL) {
			res = ENOENT;
			break;
		}

		/* Validate current state of vm */
		if ((vm->vm_state & VM_STATE_SHUTDOWN) &&
		    (flags & VMOP_FORCE) == 0) {
			res = EALREADY;
			break;
		} else if (!(vm->vm_state & VM_STATE_RUNNING)) {
			res = EINVAL;
			break;
		} else if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
		    vid.vid_uid) != 0) {
			res = EPERM;
			break;
		}

		/* Only relay TERMINATION requests, not WAIT requests */
		if (imsg->hdr.type == IMSG_VMDOP_TERMINATE_VM_REQUEST) {
			memset(&vid, 0, sizeof(vid));
			vid.vid_id = id;
			vid.vid_flags = flags;

			if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
				imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1)
				return (-1);
		}
		break;
	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
		break;
	case IMSG_VMDOP_LOAD:
		IMSG_SIZE_CHECK(imsg, str); /* at least one byte for path */
		str = get_string((uint8_t *)imsg->data,
		    IMSG_DATA_SIZE(imsg));
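		/* FALLTHROUGH, str is consumed by the reload below */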
	case IMSG_VMDOP_RELOAD:
		if (vmd_reload(0, str) == -1)
			cmd = IMSG_CTL_FAIL;
		else
			cmd = IMSG_CTL_OK;
		free(str);
		break;
	case IMSG_CTL_RESET:
		IMSG_SIZE_CHECK(imsg, &v);
		memcpy(&v, imsg->data, sizeof(v));
		if (vmd_reload(v, NULL) == -1)
			cmd = IMSG_CTL_FAIL;
		else
			cmd = IMSG_CTL_OK;
		break;
	case IMSG_CTL_VERBOSE:
		IMSG_SIZE_CHECK(imsg, &verbose);
		memcpy(&verbose, imsg->data, sizeof(verbose));
		log_setverbose(verbose);

		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
		proc_forward_imsg(ps, imsg, PROC_PRIV, -1);
		cmd = IMSG_CTL_OK;
		break;
	case IMSG_VMDOP_PAUSE_VM:
	case IMSG_VMDOP_UNPAUSE_VM:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		if (vid.vid_id == 0) {
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
				    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
				    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
				break;
			} else {
				vid.vid_id = vm->vm_vmid;
			}
		} else if ((vm = vm_getbyvmid(vid.vid_id)) == NULL) {
			res = ENOENT;
			cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
			    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
			    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
			break;
		}
		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
		    vid.vid_uid) != 0) {
			res = EPERM;
			cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
			    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
			    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
			break;
		}
		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
		    imsg->hdr.peerid, -1, &vid, sizeof(vid));
		break;
	case IMSG_VMDOP_SEND_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		id = vid.vid_id;
		ifd = imsg_get_fd(imsg);
		if (vid.vid_id == 0) {
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
				close(ifd);
				break;
			} else {
				vid.vid_id = vm->vm_vmid;
			}
		} else if ((vm = vm_getbyvmid(vid.vid_id)) == NULL) {
			res = ENOENT;
			cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
			close(ifd);
			break;
		}
		vmr.vmr_id = vid.vid_id;
		log_debug("%s: sending fd to vmm", __func__);
		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
		    imsg->hdr.peerid, ifd, &vid, sizeof(vid));
		break;
	case IMSG_VMDOP_RECEIVE_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		ifd = imsg_get_fd(imsg);
		if (ifd == -1) {
			log_warnx("%s: invalid fd", __func__);
			return (-1);
		}
		if (atomicio(read, ifd, &vmh, sizeof(vmh)) != sizeof(vmh)) {
			log_warnx("%s: error reading vmh from received vm",
			    __func__);
			res = EIO;
			close(ifd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}

		if (vmd_check_vmh(&vmh)) {
			res = ENOENT;
			close(ifd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}
		if (atomicio(read, ifd, &vmc, sizeof(vmc)) != sizeof(vmc)) {
			log_warnx("%s: error reading vmc from received vm",
			    __func__);
			res = EIO;
			close(ifd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}
		strlcpy(vmc.vmc_params.vcp_name, vid.vid_name,
		    sizeof(vmc.vmc_params.vcp_name));
		vmc.vmc_params.vcp_id = 0;

		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
		if (ret != 0) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			close(ifd);
		} else {
			vm->vm_state |= VM_STATE_RECEIVED;
			config_setvm(ps, vm, imsg->hdr.peerid,
			    vmc.vmc_owner.uid);
			log_debug("%s: sending fd to vmm", __func__);
			proc_compose_imsg(ps, PROC_VMM, -1,
			    IMSG_VMDOP_RECEIVE_VM_END, vm->vm_vmid, ifd,
			    NULL, 0);
		}
		break;
	case IMSG_VMDOP_DONE:
		control_reset(&ps->ps_csock);
		TAILQ_FOREACH(rcs, &ps->ps_rcsocks, cs_entry)
			control_reset(rcs);
		cmd = 0;
		break;
	default:
		return (-1);
	}

	switch (cmd) {
	case 0:
		break;
	case IMSG_VMDOP_START_VM_RESPONSE:
	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
		memset(&vmr, 0, sizeof(vmr));
		vmr.vmr_result = res;
		vmr.vmr_id = id;
		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
		    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
			return (-1);
		break;
	default:
		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
		    imsg->hdr.peerid, -1, &res, sizeof(res)) == -1)
			return (-1);
		break;
	}

	return (0);
}

int
vmd_dispatch_vmm(int fd, struct privsep_proc *p, struct imsg *imsg)
{
	struct vmop_result	 vmr;
	struct privsep		*ps = p->p_ps;
	int			 res = 0;
	struct vmd_vm		*vm;
	struct vm_create_params	*vcp;
	struct vmop_info_result	 vir;

	switch (imsg->hdr.type) {
	case IMSG_VMDOP_PAUSE_VM_RESPONSE:
		IMSG_SIZE_CHECK(imsg, &vmr);
		memcpy(&vmr, imsg->data, sizeof(vmr));
		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
			break;
		proc_compose_imsg(ps, PROC_CONTROL, -1,
		    imsg->hdr.type, imsg->hdr.peerid, -1,
		    &vmr, sizeof(vmr));
		log_info("%s: paused vm %d successfully",
		    vm->vm_params.vmc_params.vcp_name,
		    vm->vm_vmid);
		vm->vm_state |= VM_STATE_PAUSED;
		break;
	case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
		IMSG_SIZE_CHECK(imsg, &vmr);
		memcpy(&vmr, imsg->data, sizeof(vmr));
		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
			break;
		proc_compose_imsg(ps, PROC_CONTROL, -1,
		    imsg->hdr.type, imsg->hdr.peerid, -1,
		    &vmr, sizeof(vmr));
		log_info("%s: unpaused vm %d successfully",
		    vm->vm_params.vmc_params.vcp_name,
		    vm->vm_vmid);
		vm->vm_state &= ~VM_STATE_PAUSED;
		break;
	case IMSG_VMDOP_START_VM_RESPONSE:
		IMSG_SIZE_CHECK(imsg, &vmr);
		memcpy(&vmr, imsg->data, sizeof(vmr));
		if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL)
			break;
		vm->vm_pid = vmr.vmr_pid;
		vcp = &vm->vm_params.vmc_params;
		vcp->vcp_id = vmr.vmr_id;

		/*
		 * If the peerid is not -1, forward the response back to the
		 * control socket.  If it is -1, the request originated
		 * from the parent, not the control socket.
		 */
		if (vm->vm_peerid != (uint32_t)-1) {
			(void)strlcpy(vmr.vmr_ttyname, vm->vm_ttyname,
			    sizeof(vmr.vmr_ttyname));
			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
			    imsg->hdr.type, vm->vm_peerid, -1,
			    &vmr, sizeof(vmr)) == -1) {
				errno = vmr.vmr_result;
				log_warn("%s: failed to forward vm result",
				    vcp->vcp_name);
				vm_terminate(vm, __func__);
				return (-1);
			}
		}

		if (vmr.vmr_result) {
			log_warnx("%s: failed to start vm", vcp->vcp_name);
			vm_terminate(vm, __func__);
			errno = vmr.vmr_result;
			break;
		}

		/* Now configure all the interfaces */
		if (vm_priv_ifconfig(ps, vm) == -1) {
			log_warn("%s: failed to configure vm", vcp->vcp_name);
			vm_terminate(vm, __func__);
			break;
		}

		log_info("started %s (vm %d) successfully, tty %s",
		    vcp->vcp_name, vm->vm_vmid, vm->vm_ttyname);
		break;
	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
		IMSG_SIZE_CHECK(imsg, &vmr);
		memcpy(&vmr, imsg->data, sizeof(vmr));

		if (vmr.vmr_result) {
			DPRINTF("%s: forwarding TERMINATE VM for vm id %d",
			    __func__, vmr.vmr_id);
			proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
		} else {
			if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
				break;
			/* Mark VM as shutting down */
			vm->vm_state |= VM_STATE_SHUTDOWN;
		}
		break;
	case IMSG_VMDOP_SEND_VM_RESPONSE:
		IMSG_SIZE_CHECK(imsg, &vmr);
		memcpy(&vmr, imsg->data, sizeof(vmr));
		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
			break;
		if (!vmr.vmr_result) {
			log_info("%s: sent vm %d successfully.",
			    vm->vm_params.vmc_params.vcp_name,
			    vm->vm_vmid);
			vm_terminate(vm, __func__);
		}

		/* Send a response if a control client is waiting for it */
		if (imsg->hdr.peerid != (uint32_t)-1) {
			/* the error is meaningless for deferred responses */
			vmr.vmr_result = 0;

			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
			    IMSG_VMDOP_SEND_VM_RESPONSE,
			    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
				return (-1);
		}
		break;
	case IMSG_VMDOP_TERMINATE_VM_EVENT:
		IMSG_SIZE_CHECK(imsg, &vmr);
		memcpy(&vmr, imsg->data, sizeof(vmr));
		DPRINTF("%s: handling TERMINATE_EVENT for vm id %d ret %d",
		    __func__, vmr.vmr_id, vmr.vmr_result);
		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) {
			log_debug("%s: vm %d is no longer available",
			    __func__, vmr.vmr_id);
			break;
		}
		if (vmr.vmr_result != EAGAIN ||
		    vm->vm_params.vmc_bootdevice) {
			vm_terminate(vm, __func__);
		} else {
			/* Stop VM instance but keep the tty open */
			vm_stop(vm, 1, __func__);
			config_setvm(ps, vm, (uint32_t)-1, vm->vm_uid);
		}

		/* The error is meaningless for deferred responses */
		vmr.vmr_result = 0;

		if (proc_compose_imsg(ps, PROC_CONTROL, -1,
			IMSG_VMDOP_TERMINATE_VM_EVENT,
			imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
			return (-1);
		break;
	case IMSG_VMDOP_GET_INFO_VM_DATA:
		IMSG_SIZE_CHECK(imsg, &vir);
		memcpy(&vir, imsg->data, sizeof(vir));
		if ((vm = vm_getbyvmid(vir.vir_info.vir_id)) != NULL) {
			memset(vir.vir_ttyname, 0, sizeof(vir.vir_ttyname));
			if (vm->vm_ttyname[0] != '\0')
				strlcpy(vir.vir_ttyname, vm->vm_ttyname,
				    sizeof(vir.vir_ttyname));
			log_debug("%s: running vm: %d, vm_state: 0x%x",
			    __func__, vm->vm_vmid, vm->vm_state);
			vir.vir_state = vm->vm_state;
			/* get the user id who started the vm */
			vir.vir_uid = vm->vm_uid;
			vir.vir_gid = vm->vm_params.vmc_owner.gid;
		}
		if (proc_compose_imsg(ps,
		    imsg->hdr.peerid == IMSG_AGENTX_PEERID ?
		    PROC_AGENTX : PROC_CONTROL, -1, imsg->hdr.type,
		    imsg->hdr.peerid, -1, &vir, sizeof(vir)) == -1) {
			log_debug("%s: GET_INFO_VM failed for vm %d, removing",
			    __func__, vm->vm_vmid);
			vm_terminate(vm, __func__);
			return (-1);
		}
		break;
	case IMSG_VMDOP_GET_INFO_VM_END_DATA:
		/*
		 * PROC_VMM has responded with the *running* VMs, now we
		 * append the others. These use the special value 0 for their
		 * kernel id to indicate that they are not running.
		 */
		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
			if (!(vm->vm_state & VM_STATE_RUNNING)) {
				memset(&vir, 0, sizeof(vir));
				vir.vir_info.vir_id = vm->vm_vmid;
				strlcpy(vir.vir_info.vir_name,
				    vm->vm_params.vmc_params.vcp_name,
				    VMM_MAX_NAME_LEN);
				vir.vir_info.vir_memory_size =
				    vm->vm_params.vmc_params.
				    vcp_memranges[0].vmr_size;
				vir.vir_info.vir_ncpus =
				    vm->vm_params.vmc_params.vcp_ncpus;
				/* get the configured user id for this vm */
				vir.vir_uid = vm->vm_params.vmc_owner.uid;
				vir.vir_gid = vm->vm_params.vmc_owner.gid;
				log_debug("%s: vm: %d, vm_state: 0x%x",
				    __func__, vm->vm_vmid, vm->vm_state);
				vir.vir_state = vm->vm_state;
				if (proc_compose_imsg(ps,
				    imsg->hdr.peerid == IMSG_AGENTX_PEERID ?
				    PROC_AGENTX : PROC_CONTROL, -1,
				    IMSG_VMDOP_GET_INFO_VM_DATA,
				    imsg->hdr.peerid, -1, &vir,
				    sizeof(vir)) == -1) {
					log_debug("%s: GET_INFO_VM_END failed",
					    __func__);
					vm_terminate(vm, __func__);
					return (-1);
				}
			}
		}
		IMSG_SIZE_CHECK(imsg, &res);
		proc_forward_imsg(ps, imsg,
		    imsg->hdr.peerid == IMSG_AGENTX_PEERID ?
		    PROC_AGENTX : PROC_CONTROL, -1);
		break;
	default:
		return (-1);
	}

	return (0);
}

int
vmd_dispatch_agentx(int fd, struct privsep_proc *p, struct imsg *imsg)
{
	struct privsep			*ps = p->p_ps;

	switch (imsg->hdr.type) {
	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
		return (0);
	default:
		break;
	}
	return (-1);
}

int
vmd_dispatch_priv(int fd, struct privsep_proc *p, struct imsg *imsg)
{
	struct vmop_addr_result	 var;

	switch (imsg->hdr.type) {
	case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
		IMSG_SIZE_CHECK(imsg, &var);
		memcpy(&var, imsg->data, sizeof(var));
		proc_forward_imsg(p->p_ps, imsg, PROC_VMM, -1);
		break;
	default:
		return (-1);
	}

	return (0);
}

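/*
 * vmd_check_vmh
 *
 * Checks whether the dump header of a received VM is compatible with
 * the running host: the signature and version must match, and every
 * saved CPUID leaf is compared against the host CPU so a guest is not
 * resumed on a machine that lacks features the guest was using (e.g. a
 * dump taken on a host advertising AVX2 in leaf 0x07/EBX cannot be
 * received on a host without it).
 *
 * Return values:
 *   0: the dump header is compatible
 *  -1: the dump header is incompatible or malformed
 */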
int
vmd_check_vmh(struct vm_dump_header *vmh)
{
	int i;
	unsigned int code, leaf;
	unsigned int a, b, c, d;

	if (strncmp(vmh->vmh_signature, VM_DUMP_SIGNATURE,
	    strlen(VM_DUMP_SIGNATURE)) != 0) {
		log_warnx("%s: incompatible dump signature", __func__);
		return (-1);
	}

	if (vmh->vmh_version != VM_DUMP_VERSION) {
		log_warnx("%s: incompatible dump version", __func__);
		return (-1);
	}

	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
		code = vmh->vmh_cpuids[i].code;
		leaf = vmh->vmh_cpuids[i].leaf;
		if (leaf != 0x00) {
			log_debug("%s: invalid leaf 0x%x for code 0x%x",
			    __func__, leaf, code);
			return (-1);
		}

		switch (code) {
		case 0x00:
			CPUID_LEAF(code, leaf, a, b, c, d);
			if (vmh->vmh_cpuids[i].a > a) {
				log_debug("%s: incompatible cpuid level",
				    __func__);
				return (-1);
			}
			if (!(vmh->vmh_cpuids[i].b == b &&
			    vmh->vmh_cpuids[i].c == c &&
			    vmh->vmh_cpuids[i].d == d)) {
				log_debug("%s: incompatible cpu brand",
				    __func__);
				return (-1);
			}
			break;

		case 0x01:
			CPUID_LEAF(code, leaf, a, b, c, d);
			if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) !=
			    (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) {
				log_debug("%s: incompatible cpu features "
				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
				    code, leaf);
				return (-1);
			}
			if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) !=
			    (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) {
				log_debug("%s: incompatible cpu features "
				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
				    code, leaf);
				return (-1);
			}
			break;

		case 0x07:
			CPUID_LEAF(code, leaf, a, b, c, d);
			if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) !=
			    (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) {
				log_debug("%s: incompatible cpu features "
				    "code: 0x%x leaf: 0x%x  reg: b", __func__,
				    code, leaf);
				return (-1);
			}
			if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) !=
			    (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) {
				log_debug("%s: incompatible cpu features "
				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
				    code, leaf);
				return (-1);
			}
			break;

		case 0x0d:
			CPUID_LEAF(code, leaf, a, b, c, d);
			if (vmh->vmh_cpuids[i].b > b) {
				log_debug("%s: incompatible cpu: insufficient "
				    "max save area for enabled XCR0 features",
				    __func__);
				return (-1);
			}
			if (vmh->vmh_cpuids[i].c > c) {
				log_debug("%s: incompatible cpu: insufficient "
				    "max save area for supported XCR0 features",
				    __func__);
				return (-1);
			}
			break;

		case 0x80000001:
			CPUID_LEAF(code, leaf, a, b, c, d);
			if ((vmh->vmh_cpuids[i].a & a) !=
			    vmh->vmh_cpuids[i].a) {
				log_debug("%s: incompatible cpu features "
				    "code: 0x%x leaf: 0x%x  reg: a", __func__,
				    code, leaf);
				return (-1);
			}
			if ((vmh->vmh_cpuids[i].c & c) !=
			    vmh->vmh_cpuids[i].c) {
				log_debug("%s: incompatible cpu features "
				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
				    code, leaf);
				return (-1);
			}
			if ((vmh->vmh_cpuids[i].d & d) !=
			    vmh->vmh_cpuids[i].d) {
				log_debug("%s: incompatible cpu features "
				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
				    code, leaf);
				return (-1);
			}
			break;

		default:
			log_debug("%s: unknown code 0x%x", __func__, code);
			return (-1);
		}
	}

	return (0);
}

void
vmd_sighdlr(int sig, short event, void *arg)
{
	if (privsep_process != PROC_PARENT)
		return;
	log_debug("%s: handling signal", __func__);

	switch (sig) {
	case SIGHUP:
		log_info("%s: reload requested with SIGHUP", __func__);

		/*
		 * This is safe because libevent uses async signal handlers
		 * that run in the event loop and not in signal context.
		 */
		(void)vmd_reload(0, NULL);
		break;
	case SIGPIPE:
		log_info("%s: ignoring SIGPIPE", __func__);
		break;
	case SIGUSR1:
		log_info("%s: ignoring SIGUSR1", __func__);
		break;
	case SIGTERM:
	case SIGINT:
		vmd_shutdown();
		break;
	default:
		fatalx("unexpected signal");
	}
}

__dead void
usage(void)
{
	extern char *__progname;
	fprintf(stderr, "usage: %s [-dnv] [-D macro=value] [-f file]\n",
	    __progname);
	exit(1);
}

int
main(int argc, char **argv)
{
	struct privsep		*ps;
	int			 ch;
	enum privsep_procid	 proc_id = PROC_PARENT;
	int			 proc_instance = 0, vm_launch = 0;
	int			 vmm_fd = -1, vm_fd = -1;
	const char		*errp, *title = NULL;
	int			 argc0 = argc;
	char			 dev_type = '\0';

	log_init(0, LOG_DAEMON);

	if ((env = calloc(1, sizeof(*env))) == NULL)
		fatal("calloc: env");

	while ((ch = getopt(argc, argv, "D:P:I:V:X:df:i:nt:vp:")) != -1) {
		switch (ch) {
		case 'D':
			if (cmdline_symset(optarg) < 0)
				log_warnx("could not parse macro definition %s",
				    optarg);
			break;
		case 'd':
			env->vmd_debug = 2;
			break;
		case 'f':
			conffile = optarg;
			break;
		case 'v':
			env->vmd_verbose++;
			break;
		/* vmd fork/exec */
		case 'n':
			env->vmd_noaction = 1;
			break;
		case 'P':
			title = optarg;
			proc_id = proc_getid(procs, nitems(procs), title);
			if (proc_id == PROC_MAX)
				fatalx("invalid process name");
			break;
		case 'I':
			proc_instance = strtonum(optarg, 0,
			    PROC_MAX_INSTANCES, &errp);
			if (errp)
				fatalx("invalid process instance");
			break;
		/* child vm and device fork/exec */
		case 'p':
			title = optarg;
			break;
		case 'V':
			vm_launch = VMD_LAUNCH_VM;
			vm_fd = strtonum(optarg, 0, 128, &errp);
			if (errp)
				fatalx("invalid vm fd");
			break;
		case 'X':
			vm_launch = VMD_LAUNCH_DEV;
			vm_fd = strtonum(optarg, 0, 128, &errp);
			if (errp)
				fatalx("invalid device fd");
			break;
		case 't':
			dev_type = *optarg;
			switch (dev_type) {
			case VMD_DEVTYPE_NET:
			case VMD_DEVTYPE_DISK:
				break;
			default:
				fatalx("invalid device type");
			}
			break;
		case 'i':
			vmm_fd = strtonum(optarg, 0, 128, &errp);
			if (errp)
				fatalx("invalid vmm fd");
			break;
		default:
			usage();
		}
	}

	argc -= optind;
	if (argc > 0)
		usage();

	if (env->vmd_noaction && !env->vmd_debug)
		env->vmd_debug = 1;

	log_init(env->vmd_debug, LOG_DAEMON);
	log_setverbose(env->vmd_verbose);

	/* Re-exec from the vmm child process requires an absolute path. */
	if (proc_id == PROC_PARENT && *argv[0] != '/' && !env->vmd_noaction)
		fatalx("re-exec requires execution with an absolute path");
	env->argv0 = argv[0];

	/* check for root privileges */
	if (env->vmd_noaction == 0 && !vm_launch) {
		if (geteuid())
			fatalx("need root privileges");
	}

	ps = &env->vmd_ps;
	ps->ps_env = env;
	env->vmd_fd = vmm_fd;

	if (config_init(env) == -1)
		fatal("failed to initialize configuration");

	if ((ps->ps_pw = getpwnam(VMD_USER)) == NULL)
		fatal("unknown user %s", VMD_USER);

	/* First proc runs as root without pledge but in default chroot */
	proc_priv->p_pw = &proc_privpw; /* initialized to all 0 */
	proc_priv->p_chroot = ps->ps_pw->pw_dir; /* from VMD_USER */

	/*
	 * If we're launching a new vm or its device, we short out here.
	 */
	if (vm_launch == VMD_LAUNCH_VM) {
		vm_main(vm_fd, vmm_fd);
		/* NOTREACHED */
	} else if (vm_launch == VMD_LAUNCH_DEV) {
		if (dev_type == VMD_DEVTYPE_NET) {
			log_procinit("vm/%s/vionet", title);
			vionet_main(vm_fd, vmm_fd);
			/* NOTREACHED */
		} else if (dev_type == VMD_DEVTYPE_DISK) {
			log_procinit("vm/%s/vioblk", title);
			vioblk_main(vm_fd, vmm_fd);
			/* NOTREACHED */
		}
		fatalx("unsupported device type '%c'", dev_type);
	}

	/* Open /dev/vmm early. */
	if (env->vmd_noaction == 0 && proc_id == PROC_PARENT) {
		env->vmd_fd = open(VMM_NODE, O_RDWR);
		if (env->vmd_fd == -1)
			fatal("%s", VMM_NODE);
	}

	/* Configure the control socket */
	ps->ps_csock.cs_name = SOCKET_NAME;
	TAILQ_INIT(&ps->ps_rcsocks);

	/* Configuration will be parsed after forking the children */
	env->vmd_conffile = conffile;

	if (env->vmd_noaction)
		ps->ps_noaction = 1;
	ps->ps_instance = proc_instance;
	if (title != NULL)
		ps->ps_title[proc_id] = title;

	/* only the parent returns */
	proc_init(ps, procs, nitems(procs), env->vmd_debug, argc0, argv,
	    proc_id);

	if (!env->vmd_debug && daemon(0, 0) == -1)
		fatal("can't daemonize");

	if (ps->ps_noaction == 0)
		log_info("startup");

	event_init();

	signal_set(&ps->ps_evsigint, SIGINT, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigterm, SIGTERM, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsighup, SIGHUP, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigpipe, SIGPIPE, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigusr1, SIGUSR1, vmd_sighdlr, ps);

	signal_add(&ps->ps_evsigint, NULL);
	signal_add(&ps->ps_evsigterm, NULL);
	signal_add(&ps->ps_evsighup, NULL);
	signal_add(&ps->ps_evsigpipe, NULL);
	signal_add(&ps->ps_evsigusr1, NULL);

	if (!env->vmd_noaction)
		proc_connect(ps);

	if (vmd_configure() == -1)
		fatalx("configuration failed");

	event_dispatch();

	log_debug("exiting");

	return (0);
}

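/*
 * Start up to `parallelism' VMs that are still flagged VM_STATE_WAITING,
 * then re-arm the staggered start timer so the next batch starts after
 * the configured delay.  Staggering avoids an I/O and memory thundering
 * herd when many VMs boot at the same time.
 */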
void
start_vm_batch(int fd, short type, void *args)
{
	int		i = 0;
	struct vmd_vm	*vm;

	log_debug("%s: starting batch of %d vms", __func__,
	    env->vmd_cfg.parallelism);
	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
		if (!(vm->vm_state & VM_STATE_WAITING)) {
			log_debug("%s: not starting vm %s (disabled)",
			    __func__,
			    vm->vm_params.vmc_params.vcp_name);
			continue;
		}
		i++;
		if (i > env->vmd_cfg.parallelism) {
			evtimer_add(&staggered_start_timer,
			    &env->vmd_cfg.delay);
			break;
		}
		vm->vm_state &= ~VM_STATE_WAITING;
		config_setvm(&env->vmd_ps, vm, -1, vm->vm_params.vmc_owner.uid);
	}
	log_debug("%s: done starting vms", __func__);
}

int
vmd_configure(void)
{
	int			ncpus;
	struct vmd_switch	*vsw;
	int ncpu_mib[] = {CTL_HW, HW_NCPUONLINE};
	size_t ncpus_sz = sizeof(ncpus);

	if ((env->vmd_ptmfd = open(PATH_PTMDEV, O_RDWR|O_CLOEXEC)) == -1)
		fatal("open %s", PATH_PTMDEV);

	/*
	 * pledge in the parent process:
	 * stdio - for malloc and basic I/O including events.
	 * rpath - for reload to open and read the configuration files.
	 * wpath - for opening disk images and tap devices.
	 * tty - for openpty and TIOCUCNTL.
	 * proc - run kill to terminate its children safely.
	 * sendfd - for disks, interfaces and other fds.
	 * recvfd - for send and receive.
	 * getpw - lookup user or group id by name.
	 * chown, fattr - change tty ownership
	 * flock - locking disk files
	 */
	if (pledge("stdio rpath wpath proc tty recvfd sendfd getpw"
	    " chown fattr flock", NULL) == -1)
		fatal("pledge");

	if (parse_config(env->vmd_conffile) == -1) {
		proc_kill(&env->vmd_ps);
		exit(1);
	}

	if (env->vmd_noaction) {
		fprintf(stderr, "configuration OK\n");
		proc_kill(&env->vmd_ps);
		exit(0);
	}

	/* Send VMM device fd to vmm proc. */
	proc_compose_imsg(&env->vmd_ps, PROC_VMM, -1,
	    IMSG_VMDOP_RECEIVE_VMM_FD, -1, env->vmd_fd, NULL, 0);

	/* Send shared global configuration to all children */
	if (config_setconfig(env) == -1)
		return (-1);

	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
		if (vsw->sw_running)
			continue;
		if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
			log_warn("%s: failed to create switch %s",
			    __func__, vsw->sw_name);
			switch_remove(vsw);
			return (-1);
		}
	}

	if (!(env->vmd_cfg.cfg_flags & VMD_CFG_STAGGERED_START)) {
		env->vmd_cfg.delay.tv_sec = VMD_DEFAULT_STAGGERED_START_DELAY;
		if (sysctl(ncpu_mib, nitems(ncpu_mib), &ncpus, &ncpus_sz,
		    NULL, 0) == -1)
			ncpus = 1;
		env->vmd_cfg.parallelism = ncpus;
		log_debug("%s: setting staggered start configuration to "
		    "parallelism: %d and delay: %lld",
		    __func__, ncpus, (long long)env->vmd_cfg.delay.tv_sec);
	}

	log_debug("%s: starting vms in staggered fashion", __func__);
	evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
	/* start first batch */
	start_vm_batch(0, 0, NULL);

	return (0);
}

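/*
 * vmd_reload
 *
 * Reload the configuration or reset parts of the running state.  With a
 * non-zero reset level the current configuration is purged instead of
 * re-parsed; with a NULL or empty filename the default configuration
 * file is re-read and all non-running VMs are replaced by the result.
 */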
int
vmd_reload(unsigned int reset, const char *filename)
{
	struct vmd_vm		*vm, *next_vm;
	struct vmd_switch	*vsw;
	int			 reload = 0;

	/* Switch back to the default config file */
	if (filename == NULL || *filename == '\0') {
		filename = env->vmd_conffile;
		reload = 1;
	}

	log_debug("%s: level %d config file %s", __func__, reset, filename);

	if (reset) {
		/* Purge the configuration */
		config_purge(env, reset);
		config_setreset(env, reset);
	} else {
		/*
		 * Load or reload the configuration.
		 *
		 * Reloading removes all non-running VMs before processing the
		 * config file, whereas loading only adds to the existing list
		 * of VMs.
		 */

		if (reload) {
			TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry,
			    next_vm) {
				if (!(vm->vm_state & VM_STATE_RUNNING)) {
					DPRINTF("%s: calling vm_remove",
					    __func__);
					vm_remove(vm, __func__);
				}
			}
		}

		if (parse_config(filename) == -1) {
			log_debug("%s: failed to load config file %s",
			    __func__, filename);
			return (-1);
		}

		if (reload) {
			/* Update shared global configuration in all children */
			if (config_setconfig(env) == -1)
				return (-1);
		}

		TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
			if (vsw->sw_running)
				continue;
			if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
				log_warn("%s: failed to create switch %s",
				    __func__, vsw->sw_name);
				switch_remove(vsw);
				return (-1);
			}
		}

		log_debug("%s: starting vms in staggered fashion", __func__);
		evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
		/* start first batch */
		start_vm_batch(0, 0, NULL);
	}

	return (0);
}

void
vmd_shutdown(void)
{
	struct vmd_vm *vm, *vm_next;

	log_debug("%s: performing shutdown", __func__);

	TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) {
		vm_remove(vm, __func__);
	}

	proc_kill(&env->vmd_ps);
	free(env);

	log_warnx("terminating");
	exit(0);
}

struct vmd_vm *
vm_getbyvmid(uint32_t vmid)
{
	struct vmd_vm	*vm;

	if (vmid == 0)
		return (NULL);
	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
		if (vm->vm_vmid == vmid)
			return (vm);
	}

	return (NULL);
}

struct vmd_vm *
vm_getbyid(uint32_t id)
{
	struct vmd_vm	*vm;

	if (id == 0)
		return (NULL);
	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
		if (vm->vm_params.vmc_params.vcp_id == id)
			return (vm);
	}

	return (NULL);
}

uint32_t
vm_id2vmid(uint32_t id, struct vmd_vm *vm)
{
	if (vm == NULL && (vm = vm_getbyid(id)) == NULL)
		return (0);
	DPRINTF("%s: vmm id %u is vmid %u", __func__,
	    id, vm->vm_vmid);
	return (vm->vm_vmid);
}

uint32_t
vm_vmid2id(uint32_t vmid, struct vmd_vm *vm)
{
	if (vm == NULL && (vm = vm_getbyvmid(vmid)) == NULL)
		return (0);
	DPRINTF("%s: vmid %u is vmm id %u", __func__,
	    vmid, vm->vm_params.vmc_params.vcp_id);
	return (vm->vm_params.vmc_params.vcp_id);
}

struct vmd_vm *
vm_getbyname(const char *name)
{
	struct vmd_vm	*vm;

	if (name == NULL)
		return (NULL);
	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
		if (strcmp(vm->vm_params.vmc_params.vcp_name, name) == 0)
			return (vm);
	}

	return (NULL);
}

struct vmd_vm *
vm_getbypid(pid_t pid)
{
	struct vmd_vm	*vm;

	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
		if (vm->vm_pid == pid)
			return (vm);
	}

	return (NULL);
}

void
vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
{
	struct privsep	*ps = &env->vmd_ps;
	unsigned int	 i, j;

	if (vm == NULL)
		return;

	log_debug("%s: %s %s stopping vm %d%s",
	    __func__, ps->ps_title[privsep_process], caller,
	    vm->vm_vmid, keeptty ? ", keeping tty open" : "");

	vm->vm_state &= ~(VM_STATE_RECEIVED | VM_STATE_RUNNING
	    | VM_STATE_SHUTDOWN);

	if (vm->vm_iev.ibuf.fd != -1) {
		event_del(&vm->vm_iev.ev);
		close(vm->vm_iev.ibuf.fd);
	}
	for (i = 0; i < VM_MAX_DISKS_PER_VM; i++) {
		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
			if (vm->vm_disks[i][j] != -1) {
				close(vm->vm_disks[i][j]);
				vm->vm_disks[i][j] = -1;
			}
		}
	}
	for (i = 0; i < VM_MAX_NICS_PER_VM; i++) {
		if (vm->vm_ifs[i].vif_fd != -1) {
			close(vm->vm_ifs[i].vif_fd);
			vm->vm_ifs[i].vif_fd = -1;
		}
		free(vm->vm_ifs[i].vif_name);
		free(vm->vm_ifs[i].vif_switch);
		free(vm->vm_ifs[i].vif_group);
		vm->vm_ifs[i].vif_name = NULL;
		vm->vm_ifs[i].vif_switch = NULL;
		vm->vm_ifs[i].vif_group = NULL;
	}
	if (vm->vm_kernel != -1) {
		close(vm->vm_kernel);
		vm->vm_kernel = -1;
	}
	if (vm->vm_cdrom != -1) {
		close(vm->vm_cdrom);
		vm->vm_cdrom = -1;
	}
	if (!keeptty) {
		vm_closetty(vm);
		vm->vm_uid = 0;
	}
}

void
vm_remove(struct vmd_vm *vm, const char *caller)
{
	struct privsep	*ps = &env->vmd_ps;

	if (vm == NULL)
		return;

	log_debug("%s: %s %s removing vm %d from running config",
	    __func__, ps->ps_title[privsep_process], caller,
	    vm->vm_vmid);

	TAILQ_REMOVE(env->vmd_vms, vm, vm_entry);

	vm_stop(vm, 0, caller);
	if (vm->vm_kernel_path != NULL && !vm->vm_from_config)
		free(vm->vm_kernel_path);
	free(vm);
}

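/*
 * Claim a unique VM id for a name/uid tuple.  Ids are remembered in
 * env->vmd_known and never reused while vmd is running, so a VM keeps
 * a stable id across stop, start and reload.
 */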
int
vm_claimid(const char *name, int uid, uint32_t *id)
{
	struct name2id *n2i = NULL;

	TAILQ_FOREACH(n2i, env->vmd_known, entry)
		if (strcmp(n2i->name, name) == 0 && n2i->uid == uid)
			goto out;

	if (++env->vmd_nvm == 0) {
		log_warnx("too many vms");
		return (-1);
	}
	if ((n2i = calloc(1, sizeof(struct name2id))) == NULL) {
		log_warnx("could not alloc vm name");
		return (-1);
	}
	n2i->id = env->vmd_nvm;
	n2i->uid = uid;
	if (strlcpy(n2i->name, name, sizeof(n2i->name)) >= sizeof(n2i->name)) {
		log_warnx("vm name too long");
		free(n2i);
		return (-1);
	}
	TAILQ_INSERT_TAIL(env->vmd_known, n2i, entry);

out:
	*id = n2i->id;
	return (0);
}

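/*
 * vm_register
 *
 * Validates the create parameters and registers the VM in the list of
 * known VMs, claiming a fresh id unless one is passed in.  On failure
 * -1 is returned with errno set; EALREADY means a VM with the same
 * name or id exists, in which case *ret_vm is still set so the caller
 * can restart the existing VM.
 */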
int
vm_register(struct privsep *ps, struct vmop_create_params *vmc,
    struct vmd_vm **ret_vm, uint32_t id, uid_t uid)
{
	struct vmd_vm		*vm = NULL, *vm_parent = NULL;
	struct vm_create_params	*vcp = &vmc->vmc_params;
	struct vmop_owner	*vmo = NULL;
	uint32_t		 nid, rng;
	unsigned int		 i, j;
	struct vmd_switch	*sw;
	char			*s;
	int			 ret = 0;

	/* Check if this is an instance of another VM */
	if ((ret = vm_instance(ps, &vm_parent, vmc, uid)) != 0) {
		errno = ret; /* XXX might set invalid errno */
		return (-1);
	}

	errno = 0;
	*ret_vm = NULL;

	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
		    uid) != 0) {
			errno = EPERM;
			goto fail;
		}
		vm->vm_kernel = vmc->vmc_kernel;
		*ret_vm = vm;
		errno = EALREADY;
		goto fail;
	}

	if (vm_parent != NULL)
		vmo = &vm_parent->vm_params.vmc_insowner;

	/* non-root users can only start existing VMs or instances */
	if (vm_checkperm(NULL, vmo, uid) != 0) {
		log_warnx("permission denied");
		errno = EPERM;
		goto fail;
	}
	if (vmc->vmc_flags == 0) {
		log_warnx("invalid configuration, no devices");
		errno = VMD_DISK_MISSING;
		goto fail;
	}
	if (vcp->vcp_ncpus == 0)
		vcp->vcp_ncpus = 1;
	if (vcp->vcp_memranges[0].vmr_size == 0)
		vcp->vcp_memranges[0].vmr_size = VM_DEFAULT_MEMORY;
	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) {
		log_warnx("invalid number of CPUs");
		goto fail;
	} else if (vmc->vmc_ndisks > VM_MAX_DISKS_PER_VM) {
		log_warnx("invalid number of disks");
		goto fail;
	} else if (vmc->vmc_nnics > VM_MAX_NICS_PER_VM) {
		log_warnx("invalid number of interfaces");
		goto fail;
	} else if (vmc->vmc_kernel == -1 && vmc->vmc_ndisks == 0
	    && strlen(vmc->vmc_cdrom) == 0) {
		log_warnx("no kernel or disk/cdrom specified");
		goto fail;
	} else if (strlen(vcp->vcp_name) == 0) {
		log_warnx("invalid VM name");
		goto fail;
	} else if (*vcp->vcp_name == '-' || *vcp->vcp_name == '.' ||
	    *vcp->vcp_name == '_') {
		log_warnx("invalid VM name");
		goto fail;
	} else {
		for (s = vcp->vcp_name; *s != '\0'; ++s) {
			if (!(isalnum((unsigned char)*s) || *s == '.' ||
			    *s == '-' || *s == '_')) {
				log_warnx("invalid VM name");
				goto fail;
			}
		}
	}

	if ((vm = calloc(1, sizeof(*vm))) == NULL)
		goto fail;

	memcpy(&vm->vm_params, vmc, sizeof(vm->vm_params));
	vmc = &vm->vm_params;
	vcp = &vmc->vmc_params;
	vm->vm_pid = -1;
	vm->vm_tty = -1;
	vm->vm_receive_fd = -1;
	vm->vm_kernel = -1;
	vm->vm_state &= ~VM_STATE_PAUSED;

	if (vmc->vmc_kernel > -1)
		vm->vm_kernel = vmc->vmc_kernel;

	for (i = 0; i < VM_MAX_DISKS_PER_VM; i++)
		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
			vm->vm_disks[i][j] = -1;
	for (i = 0; i < VM_MAX_NICS_PER_VM; i++)
		vm->vm_ifs[i].vif_fd = -1;
	for (i = 0; i < vmc->vmc_nnics; i++) {
		if ((sw = switch_getbyname(vmc->vmc_ifswitch[i])) != NULL) {
			/* inherit per-interface flags from the switch */
			vmc->vmc_ifflags[i] |= (sw->sw_flags & VMIFF_OPTMASK);
		}

		/*
		 * If the MAC address is zero, always randomize it in vmd(8)
		 * because we cannot rely on the guest OS to do the right
		 * thing like OpenBSD does.  Based on ether_fakeaddr()
		 * from the kernel, incremented by one to differentiate
		 * the source.
		 */
		if (memcmp(zero_mac, &vmc->vmc_macs[i], ETHER_ADDR_LEN) == 0) {
			rng = arc4random();
			vmc->vmc_macs[i][0] = 0xfe;
			vmc->vmc_macs[i][1] = 0xe1;
			vmc->vmc_macs[i][2] = 0xba + 1;
			vmc->vmc_macs[i][3] = 0xd0 | ((i + 1) & 0xf);
			vmc->vmc_macs[i][4] = rng;
			vmc->vmc_macs[i][5] = rng >> 8;
		}
	}
	vm->vm_cdrom = -1;
	vm->vm_iev.ibuf.fd = -1;

	/*
	 * Assign a new internal Id if not specified and we succeed in
	 * claiming a new Id.
	 */
	if (id != 0)
		vm->vm_vmid = id;
	else if (vm_claimid(vcp->vcp_name, uid, &nid) == -1)
		goto fail;
	else
		vm->vm_vmid = nid;

	log_debug("%s: registering vm %d", __func__, vm->vm_vmid);
	TAILQ_INSERT_TAIL(env->vmd_vms, vm, vm_entry);

	*ret_vm = vm;
	return (0);
 fail:
	if (errno == 0)
		errno = EINVAL;
	return (-1);
}

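/*
 * vm_instance
 *
 * If the create request names a parent VM, inherit any parameters the
 * instance did not set itself from the parent and verify, via the
 * parent's vmc_insflags, that the user may override the ones it did
 * set.  Returns 0 on success or an errno-style error code.
 */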
int
vm_instance(struct privsep *ps, struct vmd_vm **vm_parent,
    struct vmop_create_params *vmc, uid_t uid)
{
	char			*name;
	struct vm_create_params	*vcp = &vmc->vmc_params;
	struct vmop_create_params *vmcp;
	struct vm_create_params	*vcpp;
	unsigned int		 i, j;

	/* return without error if the parent is NULL (nothing to inherit) */
	if ((vmc->vmc_flags & VMOP_CREATE_INSTANCE) == 0 ||
	    vmc->vmc_instance[0] == '\0')
		return (0);

	if ((*vm_parent = vm_getbyname(vmc->vmc_instance)) == NULL) {
		return (VMD_PARENT_INVALID);
	}

	vmcp = &(*vm_parent)->vm_params;
	vcpp = &vmcp->vmc_params;

	/* Are we allowed to create an instance from this VM? */
	if (vm_checkperm(NULL, &vmcp->vmc_insowner, uid) != 0) {
		log_warnx("vm \"%s\" no permission to create vm instance",
		    vcpp->vcp_name);
		return (EPERM);
	}

	name = vcp->vcp_name;

	if (vm_getbyname(vcp->vcp_name) != NULL ||
	    vm_getbyvmid(vcp->vcp_id) != NULL) {
		return (EPROCLIM);
	}

	/* CPU */
	if (vcp->vcp_ncpus == 0)
		vcp->vcp_ncpus = vcpp->vcp_ncpus;
	if (vm_checkinsflag(vmcp, VMOP_CREATE_CPU, uid) != 0 &&
	    vcp->vcp_ncpus != vcpp->vcp_ncpus) {
		log_warnx("vm \"%s\" no permission to set cpus", name);
		return (EPERM);
	}

	/* memory */
	if (vcp->vcp_memranges[0].vmr_size == 0)
		vcp->vcp_memranges[0].vmr_size =
		    vcpp->vcp_memranges[0].vmr_size;
	if (vm_checkinsflag(vmcp, VMOP_CREATE_MEMORY, uid) != 0 &&
	    vcp->vcp_memranges[0].vmr_size !=
	    vcpp->vcp_memranges[0].vmr_size) {
		log_warnx("vm \"%s\" no permission to set memory", name);
		return (EPERM);
	}

	/* disks cannot be inherited */
	if (vm_checkinsflag(vmcp, VMOP_CREATE_DISK, uid) != 0 &&
	    vmc->vmc_ndisks) {
		log_warnx("vm \"%s\" no permission to set disks", name);
		return (EPERM);
	}
	for (i = 0; i < vmc->vmc_ndisks; i++) {
		/* Check if this disk is already used in the parent */
		for (j = 0; j < vmcp->vmc_ndisks; j++) {
			if (strcmp(vmc->vmc_disks[i],
			    vmcp->vmc_disks[j]) == 0) {
				log_warnx("vm \"%s\" disk %s cannot be reused",
				    name, vmc->vmc_disks[i]);
				return (EBUSY);
			}
		}
		vmc->vmc_checkaccess |= VMOP_CREATE_DISK;
	}

	/* interfaces */
	if (vmc->vmc_nnics > 0 &&
	    vm_checkinsflag(vmcp, VMOP_CREATE_NETWORK, uid) != 0 &&
	    vmc->vmc_nnics != vmcp->vmc_nnics) {
		log_warnx("vm \"%s\" no permission to set interfaces", name);
		return (EPERM);
	}
	for (i = 0; i < vmcp->vmc_nnics; i++) {
		/* Interface got overwritten */
		if (i < vmc->vmc_nnics)
			continue;

		/* Copy interface from parent */
		vmc->vmc_ifflags[i] = vmcp->vmc_ifflags[i];
		(void)strlcpy(vmc->vmc_ifnames[i], vmcp->vmc_ifnames[i],
		    sizeof(vmc->vmc_ifnames[i]));
		(void)strlcpy(vmc->vmc_ifswitch[i], vmcp->vmc_ifswitch[i],
		    sizeof(vmc->vmc_ifswitch[i]));
		(void)strlcpy(vmc->vmc_ifgroup[i], vmcp->vmc_ifgroup[i],
		    sizeof(vmc->vmc_ifgroup[i]));
		memcpy(vmc->vmc_macs[i], vmcp->vmc_macs[i],
		    sizeof(vmc->vmc_macs[i]));
		vmc->vmc_ifrdomain[i] = vmcp->vmc_ifrdomain[i];
		vmc->vmc_nnics++;
	}
	for (i = 0; i < vmc->vmc_nnics; i++) {
		for (j = 0; j < vmcp->vmc_nnics; j++) {
			if (memcmp(zero_mac, vmc->vmc_macs[i],
			    sizeof(vmc->vmc_macs[i])) != 0 &&
			    memcmp(vmcp->vmc_macs[i], vmc->vmc_macs[i],
			    sizeof(vmc->vmc_macs[i])) != 0) {
				log_warnx("vm \"%s\" lladdr cannot be reused",
				    name);
				return (EBUSY);
			}
			if (strlen(vmc->vmc_ifnames[i]) &&
			    strcmp(vmc->vmc_ifnames[i],
			    vmcp->vmc_ifnames[j]) == 0) {
				log_warnx("vm \"%s\" %s cannot be reused",
				    name, vmc->vmc_ifnames[i]);
				return (EBUSY);
			}
		}
	}

	/* kernel */
	if (vmc->vmc_kernel > -1 || ((*vm_parent)->vm_kernel_path != NULL &&
		strnlen((*vm_parent)->vm_kernel_path, PATH_MAX) < PATH_MAX)) {
		if (vm_checkinsflag(vmcp, VMOP_CREATE_KERNEL, uid) != 0) {
			log_warnx("vm \"%s\" no permission to set boot image",
			    name);
			return (EPERM);
		}
		vmc->vmc_checkaccess |= VMOP_CREATE_KERNEL;
	}

	/* cdrom */
	if (strlen(vmc->vmc_cdrom) > 0) {
		if (vm_checkinsflag(vmcp, VMOP_CREATE_CDROM, uid) != 0) {
			log_warnx("vm \"%s\" no permission to set cdrom", name);
			return (EPERM);
		}
		vmc->vmc_checkaccess |= VMOP_CREATE_CDROM;
	} else if (strlcpy(vmc->vmc_cdrom, vmcp->vmc_cdrom,
	    sizeof(vmc->vmc_cdrom)) >= sizeof(vmc->vmc_cdrom)) {
		log_warnx("vm \"%s\" cdrom name too long", name);
		return (EINVAL);
	}

	/* user */
	if (vmc->vmc_owner.uid == 0)
		vmc->vmc_owner.uid = vmcp->vmc_owner.uid;
	else if (vmc->vmc_owner.uid != uid &&
	    vmc->vmc_owner.uid != vmcp->vmc_owner.uid) {
		log_warnx("vm \"%s\" user mismatch", name);
		return (EPERM);
	}

	/* group */
	if (vmc->vmc_owner.gid == 0)
		vmc->vmc_owner.gid = vmcp->vmc_owner.gid;
	else if (vmc->vmc_owner.gid != vmcp->vmc_owner.gid) {
		log_warnx("vm \"%s\" group mismatch", name);
		return (EPERM);
	}

	/* child instances */
	if (vmc->vmc_insflags) {
		log_warnx("vm \"%s\" cannot change instance permissions", name);
		return (EPERM);
	}
	if (vmcp->vmc_insflags & VMOP_CREATE_INSTANCE) {
		vmc->vmc_insowner.gid = vmcp->vmc_insowner.gid;
		vmc->vmc_insowner.uid = vmcp->vmc_insowner.uid;
		vmc->vmc_insflags = vmcp->vmc_insflags;
	} else {
		vmc->vmc_insowner.gid = 0;
		vmc->vmc_insowner.uid = 0;
		vmc->vmc_insflags = 0;
	}

	/* finished, remove instance flags */
	vmc->vmc_flags &= ~VMOP_CREATE_INSTANCE;

	return (0);
}

/*
 * vm_checkperm
 *
 * Checks if the user represented by the 'uid' parameter is allowed to
 * manipulate the VM described by the 'vm' parameter (or connect to said VM's
 * console.)
 *
 * Parameters:
 *  vm: the VM whose permission is to be checked
 *  vmo: the required uid/gid to be checked
 *  uid: the user ID of the user making the request
 *
 * Return values:
 *   0: the permission should be granted
 *  -1: the permission check failed (also returned if vm == null)
 */
int
vm_checkperm(struct vmd_vm *vm, struct vmop_owner *vmo, uid_t uid)
{
	struct group	*gr;
	struct passwd	*pw;
	char		**grmem;

	/* root has no restrictions */
	if (uid == 0)
		return (0);

	if (vmo == NULL)
		return (-1);

	/* check user */
	if (vm == NULL) {
		if (vmo->uid == uid)
			return (0);
	} else {
		/*
		 * check user of running vm (the owner of a running vm can
		 * be different from, or more specific than, the configured
		 * owner).
		 */
		if (((vm->vm_state & VM_STATE_RUNNING) && vm->vm_uid == uid) ||
		    (!(vm->vm_state & VM_STATE_RUNNING) && vmo->uid == uid))
			return (0);
	}

	/* check groups */
	if (vmo->gid != -1) {
		if ((pw = getpwuid(uid)) == NULL)
			return (-1);
		if (pw->pw_gid == vmo->gid)
			return (0);
		if ((gr = getgrgid(vmo->gid)) != NULL) {
			for (grmem = gr->gr_mem; *grmem; grmem++)
				if (strcmp(*grmem, pw->pw_name) == 0)
					return (0);
		}
	}

	return (-1);
}

/*
 * vm_checkinsflag
 *
 * Checks whether the non-root user is allowed to set an instance option.
 *
 * Parameters:
 *  vmc: the VM create parameters
 *  flag: the flag to be checked
 *  uid: the user ID of the user making the request
 *
 * Return values:
 *   0: the permission should be granted
 *  -1: the permission check failed (also returned if vm == null)
 */
int
vm_checkinsflag(struct vmop_create_params *vmc, unsigned int flag, uid_t uid)
{
	/* root has no restrictions */
	if (uid == 0)
		return (0);

	if ((vmc->vmc_insflags & flag) == 0)
		return (-1);

	return (0);
}

/*
 * vm_checkaccess
 *
 * Checks if the user represented by the 'uid' parameter is allowed to
 * access the file described by the 'path' parameter.
 *
 * Parameters:
 *  fd: the file descriptor of the opened file
 *  uflag: check if the userid has access to the file
 *  uid: the user ID of the user making the request
 *  amode: the access flags of R_OK and W_OK
 *
 * Return values:
 *   0: the permission should be granted
 *  -1: the permission check failed
 */
int
vm_checkaccess(int fd, unsigned int uflag, uid_t uid, int amode)
{
	struct group	*gr;
	struct passwd	*pw;
	char		**grmem;
	struct stat	 st;
	mode_t		 mode;

	if (fd == -1)
		return (-1);

	/*
	 * File has to be accessible and a regular file
	 */
	if (fstat(fd, &st) == -1 || !S_ISREG(st.st_mode))
		return (-1);

	/* root has no restrictions */
	if (uid == 0 || uflag == 0)
		return (0);

	/* check other */
	mode = amode & W_OK ? S_IWOTH : 0;
	mode |= amode & R_OK ? S_IROTH : 0;
	if ((st.st_mode & mode) == mode)
		return (0);

	/* check user */
	mode = amode & W_OK ? S_IWUSR : 0;
	mode |= amode & R_OK ? S_IRUSR : 0;
	if (uid == st.st_uid && (st.st_mode & mode) == mode)
		return (0);

	/* check groups */
	mode = amode & W_OK ? S_IWGRP : 0;
	mode |= amode & R_OK ? S_IRGRP : 0;
	if ((st.st_mode & mode) != mode)
		return (-1);
	if ((pw = getpwuid(uid)) == NULL)
		return (-1);
	if (pw->pw_gid == st.st_gid)
		return (0);
	if ((gr = getgrgid(st.st_gid)) != NULL) {
		for (grmem = gr->gr_mem; *grmem; grmem++)
			if (strcmp(*grmem, pw->pw_name) == 0)
				return (0);
	}

	return (-1);
}

int
vm_opentty(struct vmd_vm *vm)
{
	struct ptmget		 ptm;
	struct stat		 st;
	struct group		*gr;
	uid_t			 uid;
	gid_t			 gid;
	mode_t			 mode;
	int			 on;

	/*
	 * Open tty with pre-opened PTM fd
	 */
	if ((ioctl(env->vmd_ptmfd, PTMGET, &ptm) == -1))
		return (-1);

	/*
	 * We use user ioctl(2) mode to pass break commands.
	 */
	on = 1;
	if (ioctl(ptm.cfd, TIOCUCNTL, &on) == -1)
		fatal("could not enable user ioctl mode");

	vm->vm_tty = ptm.cfd;
	close(ptm.sfd);
	if (strlcpy(vm->vm_ttyname, ptm.sn, sizeof(vm->vm_ttyname))
	    >= sizeof(vm->vm_ttyname)) {
		log_warnx("%s: truncated ttyname", __func__);
		goto fail;
	}

	uid = vm->vm_uid;
	gid = vm->vm_params.vmc_owner.gid;

	if (vm->vm_params.vmc_owner.gid != -1) {
		mode = 0660;
	} else if ((gr = getgrnam("tty")) != NULL) {
		gid = gr->gr_gid;
		mode = 0620;
	} else {
		mode = 0600;
		gid = 0;
	}

	log_debug("%s: vm %s tty %s uid %d gid %d mode %o",
	    __func__, vm->vm_params.vmc_params.vcp_name,
	    vm->vm_ttyname, uid, gid, mode);

	/*
	 * Change ownership and mode of the tty as required.
	 * Loosely based on the implementation of sshpty.c
	 */
	if (stat(vm->vm_ttyname, &st) == -1)
		goto fail;

	if (st.st_uid != uid || st.st_gid != gid) {
		if (chown(vm->vm_ttyname, uid, gid) == -1) {
			log_warn("chown %s %d %d failed, uid %d",
			    vm->vm_ttyname, uid, gid, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	if ((st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO)) != mode) {
		if (chmod(vm->vm_ttyname, mode) == -1) {
			log_warn("chmod %s %o failed, uid %d",
			    vm->vm_ttyname, mode, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	return (0);
 fail:
	vm_closetty(vm);
	return (-1);
}

void
vm_closetty(struct vmd_vm *vm)
{
	if (vm->vm_tty != -1) {
		/* Release and close the tty */
		if (fchown(vm->vm_tty, 0, 0) == -1)
			log_warn("chown %s 0 0 failed", vm->vm_ttyname);
		if (fchmod(vm->vm_tty, 0666) == -1)
			log_warn("chmod %s 0666 failed", vm->vm_ttyname);
		close(vm->vm_tty);
		vm->vm_tty = -1;
	}
	memset(&vm->vm_ttyname, 0, sizeof(vm->vm_ttyname));
}

void
switch_remove(struct vmd_switch *vsw)
{
	if (vsw == NULL)
		return;

	TAILQ_REMOVE(env->vmd_switches, vsw, sw_entry);

	free(vsw->sw_group);
	free(vsw->sw_name);
	free(vsw);
}

struct vmd_switch *
switch_getbyname(const char *name)
{
	struct vmd_switch	*vsw;

	if (name == NULL)
		return (NULL);
	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
		if (strcmp(vsw->sw_name, name) == 0)
			return (vsw);
	}

	return (NULL);
}

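/*
 * Return a NUL-terminated copy of the leading printable characters of
 * an untrusted buffer, e.g. a config file path received in an imsg.
 */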
char *
get_string(uint8_t *ptr, size_t len)
{
	size_t	 i;

	for (i = 0; i < len; i++)
		if (!isprint((unsigned char)ptr[i]))
			break;

	return (strndup((const char *)ptr, i));
}

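/*
 * Convert an IPv4 prefix length to a netmask in network byte order,
 * e.g. prefixlen 24 yields htonl(0xffffff00), i.e. 255.255.255.0.
 */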
uint32_t
prefixlen2mask(uint8_t prefixlen)
{
	if (prefixlen == 0)
		return (0);

	if (prefixlen > 32)
		prefixlen = 32;

	return (htonl(0xffffffff << (32 - prefixlen)));
}

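/*
 * Convert an IPv6 prefix length to a netmask: whole 0xff octets first,
 * then one partial octet, e.g. prefixlen 20 sets the first two octets
 * to 0xff and the third to (0xff00 >> 4) & 0xff = 0xf0.
 */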
1999 
2000 void
2001 prefixlen2mask6(uint8_t prefixlen, struct in6_addr *mask)
2002 {
2003 	struct in6_addr	 s6;
2004 	int		 i;
2005 
2006 	if (prefixlen > 128)
2007 		prefixlen = 128;
2008 
2009 	memset(&s6, 0, sizeof(s6));
2010 	for (i = 0; i < prefixlen / 8; i++)
2011 		s6.s6_addr[i] = 0xff;
2012 	i = prefixlen % 8;
2013 	if (i)
2014 		s6.s6_addr[prefixlen / 8] = 0xff00 >> i;
2015 
2016 	memcpy(mask, &s6, sizeof(s6));
2017 }
2018 
2019 void
2020 getmonotime(struct timeval *tv)
2021 {
2022 	struct timespec	 ts;
2023 
2024 	if (clock_gettime(CLOCK_MONOTONIC, &ts))
2025 		fatal("clock_gettime");
2026 
2027 	TIMESPEC_TO_TIMEVAL(tv, &ts);
2028 }
2029 
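/*
 * Fully dispose of a VM: VMs defined in the config file are only
 * stopped so that they can be restarted later, while ad hoc VMs are
 * removed from the running configuration entirely.
 */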
static inline void
vm_terminate(struct vmd_vm *vm, const char *caller)
{
	if (vm->vm_from_config)
		vm_stop(vm, 0, caller);
	else {
		/* vm_remove calls vm_stop */
		vm_remove(vm, caller);
	}
}

/*
 * Utility function for closing vm file descriptors. Assumes an fd of -1 was
 * already closed or never opened.
 *
 * Returns 0 on success, otherwise -1 on failure.
 */
int
close_fd(int fd)
{
	int	ret;

	if (fd == -1)
		return (0);

#ifdef POSIX_CLOSE_RESTART
	do { ret = close(fd); } while (ret == -1 && errno == EINTR);
#else
	ret = close(fd);
#endif /* POSIX_CLOSE_RESTART */

	if (ret == -1 && errno == EIO)
		log_warn("%s(%d)", __func__, fd);

	return (ret);
}
2066