xref: /openbsd/usr.sbin/vmd/vmd.c (revision d89ec533)
1 /*	$OpenBSD: vmd.c,v 1.128 2021/12/13 18:28:40 deraadt Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 #include <sys/queue.h>
21 #include <sys/wait.h>
22 #include <sys/stat.h>
23 #include <sys/sysctl.h>
24 #include <sys/tty.h>
25 #include <sys/ttycom.h>
26 #include <sys/ioctl.h>
27 
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <termios.h>
32 #include <errno.h>
33 #include <event.h>
34 #include <fcntl.h>
35 #include <pwd.h>
36 #include <signal.h>
37 #include <syslog.h>
38 #include <unistd.h>
39 #include <util.h>
40 #include <ctype.h>
41 #include <pwd.h>
42 #include <grp.h>
43 
44 #include <machine/specialreg.h>
45 #include <machine/vmmvar.h>
46 
47 #include "proc.h"
48 #include "atomicio.h"
49 #include "vmd.h"
50 
51 __dead void usage(void);
52 
53 int	 main(int, char **);
54 int	 vmd_configure(void);
55 void	 vmd_sighdlr(int sig, short event, void *arg);
56 void	 vmd_shutdown(void);
57 int	 vmd_control_run(void);
58 int	 vmd_dispatch_control(int, struct privsep_proc *, struct imsg *);
59 int	 vmd_dispatch_vmm(int, struct privsep_proc *, struct imsg *);
60 int	 vmd_dispatch_priv(int, struct privsep_proc *, struct imsg *);
61 int	 vmd_check_vmh(struct vm_dump_header *);
62 
63 int	 vm_instance(struct privsep *, struct vmd_vm **,
64 	    struct vmop_create_params *, uid_t);
65 int	 vm_checkinsflag(struct vmop_create_params *, unsigned int, uid_t);
66 int	 vm_claimid(const char *, int, uint32_t *);
67 void	 start_vm_batch(int, short, void*);
68 
69 struct vmd	*env;
70 
71 static struct privsep_proc procs[] = {
72 	/* Keep "priv" on top as procs[0] */
73 	{ "priv",	PROC_PRIV,	vmd_dispatch_priv, priv },
74 	{ "control",	PROC_CONTROL,	vmd_dispatch_control, control },
75 	{ "vmm",	PROC_VMM,	vmd_dispatch_vmm, vmm, vmm_shutdown },
76 };
77 
78 enum privsep_procid privsep_process;
79 
80 struct event staggered_start_timer;
81 
82 /* For the privileged process */
83 static struct privsep_proc *proc_priv = &procs[0];
84 static struct passwd proc_privpw;
85 static const uint8_t zero_mac[ETHER_ADDR_LEN];
86 
/*
 * Dispatch imsgs received from the control process in the parent.
 * Handles VM lifecycle requests (start, wait, terminate, pause, send,
 * receive), configuration (re)loads, reset and verbosity changes.
 *
 * "res" accumulates an errno-style result code and "cmd" selects the
 * response imsg type; the second switch at the bottom sends the reply
 * (if any) back to the control process.
 *
 * Returns 0 on success, -1 to signal the imsg channel should be torn down.
 */
int
vmd_dispatch_control(int fd, struct privsep_proc *p, struct imsg *imsg)
{
	struct privsep			*ps = p->p_ps;
	int				 res = 0, ret = 0, cmd = 0, verbose;
	unsigned int			 v = 0, flags;
	struct vmop_create_params	 vmc;
	struct vmop_id			 vid;
	struct vmop_result		 vmr;
	struct vm_dump_header		 vmh;
	struct vmd_vm			*vm = NULL;
	char				*str = NULL;
	uint32_t			 id = 0;
	struct control_sock		*rcs;

	switch (imsg->hdr.type) {
	case IMSG_VMDOP_START_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vmc);
		memcpy(&vmc, imsg->data, sizeof(vmc));
		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
		if (vmc.vmc_flags == 0) {
			/* start an existing VM with pre-configured options */
			if (!(ret == -1 && errno == EALREADY &&
			    !(vm->vm_state & VM_STATE_RUNNING))) {
				res = errno;
				cmd = IMSG_VMDOP_START_VM_RESPONSE;
			}
		} else if (ret != 0) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		if (res == 0) {
			/* Hand the VM off to the vmm process. */
			res = config_setvm(ps, vm, imsg->hdr.peerid,
			    vm->vm_params.vmc_owner.uid);
			if (res)
				cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		break;
	case IMSG_VMDOP_WAIT_VM_REQUEST:
	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		flags = vid.vid_flags;
		cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;

		if ((id = vid.vid_id) == 0) {
			/* Lookup vm (id) by name */
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				break;
			} else if ((vm->vm_state & VM_STATE_SHUTDOWN) &&
			    (flags & VMOP_FORCE) == 0) {
				/* Already shutting down; only -f overrides. */
				res = EALREADY;
				break;
			} else if (!(vm->vm_state & VM_STATE_RUNNING)) {
				res = EINVAL;
				break;
			}
			id = vm->vm_vmid;
		} else if ((vm = vm_getbyvmid(id)) == NULL) {
			res = ENOENT;
			break;
		}
		/* Only the owner (or root) may wait for or kill a VM. */
		if (vm_checkperm(vm, &vm->vm_params.vmc_owner, vid.vid_uid)) {
			res = EPERM;
			break;
		}

		/* Only relay TERMINATION requests, not WAIT requests */
		if (imsg->hdr.type == IMSG_VMDOP_TERMINATE_VM_REQUEST) {
			memset(&vid, 0, sizeof(vid));
			vid.vid_id = id;
			vid.vid_flags = flags;

			if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
				imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1)
				return (-1);
		}
		break;
	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
		break;
	case IMSG_VMDOP_LOAD:
		IMSG_SIZE_CHECK(imsg, str); /* at least one byte for path */
		/* NOTE(review): get_string() may return NULL on allocation
		 * failure; vmd_reload() below receives it as-is. */
		str = get_string((uint8_t *)imsg->data,
		    IMSG_DATA_SIZE(imsg));
		/* FALLTHROUGH */
	case IMSG_VMDOP_RELOAD:
		/* str is NULL for RELOAD, so the default config is used. */
		if (vmd_reload(0, str) == -1)
			cmd = IMSG_CTL_FAIL;
		else
			cmd = IMSG_CTL_OK;
		free(str);
		break;
	case IMSG_CTL_RESET:
		IMSG_SIZE_CHECK(imsg, &v);
		memcpy(&v, imsg->data, sizeof(v));
		if (vmd_reload(v, NULL) == -1)
			cmd = IMSG_CTL_FAIL;
		else
			cmd = IMSG_CTL_OK;
		break;
	case IMSG_CTL_VERBOSE:
		IMSG_SIZE_CHECK(imsg, &verbose);
		memcpy(&verbose, imsg->data, sizeof(verbose));
		log_setverbose(verbose);

		/* Propagate the new verbosity to the other processes. */
		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
		proc_forward_imsg(ps, imsg, PROC_PRIV, -1);
		cmd = IMSG_CTL_OK;
		break;
	case IMSG_VMDOP_PAUSE_VM:
	case IMSG_VMDOP_UNPAUSE_VM:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		if (vid.vid_id == 0) {
			/* Lookup by name when no id was supplied. */
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
				    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
				    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
				break;
			} else {
				vid.vid_id = vm->vm_vmid;
			}
		} else if ((vm = vm_getbyid(vid.vid_id)) == NULL) {
			res = ENOENT;
			cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
			    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
			    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
			break;
		}
		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
		    vid.vid_uid) != 0) {
			res = EPERM;
			cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
			    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
			    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
			break;
		}
		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
		    imsg->hdr.peerid, -1, &vid, sizeof(vid));
		break;
	case IMSG_VMDOP_SEND_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		id = vid.vid_id;
		if (vid.vid_id == 0) {
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
				/* Error path: drop the received dump fd. */
				close(imsg->fd);
				break;
			} else {
				vid.vid_id = vm->vm_vmid;
			}
		} else if ((vm = vm_getbyvmid(vid.vid_id)) == NULL) {
			res = ENOENT;
			cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
			close(imsg->fd);
			break;
		}
		vmr.vmr_id = vid.vid_id;
		log_debug("%s: sending fd to vmm", __func__);
		/* Success: imsg->fd ownership moves to the vmm process. */
		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
		    imsg->hdr.peerid, imsg->fd, &vid, sizeof(vid));
		break;
	case IMSG_VMDOP_RECEIVE_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		if (imsg->fd == -1) {
			log_warnx("%s: invalid fd", __func__);
			return (-1);
		}
		/* Read and validate the dump header before registering. */
		if (atomicio(read, imsg->fd, &vmh, sizeof(vmh)) !=
		    sizeof(vmh)) {
			log_warnx("%s: error reading vmh from received vm",
			    __func__);
			res = EIO;
			close(imsg->fd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}

		if (vmd_check_vmh(&vmh)) {
			res = ENOENT;
			close(imsg->fd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}
		if (atomicio(read, imsg->fd, &vmc, sizeof(vmc)) !=
		    sizeof(vmc)) {
			log_warnx("%s: error reading vmc from received vm",
			    __func__);
			res = EIO;
			close(imsg->fd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}
		/* The VM is restored under the name the client requested. */
		strlcpy(vmc.vmc_params.vcp_name, vid.vid_name,
		    sizeof(vmc.vmc_params.vcp_name));
		vmc.vmc_params.vcp_id = 0;

		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
		if (ret != 0) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			close(imsg->fd);
		} else {
			vm->vm_state |= VM_STATE_RECEIVED;
			config_setvm(ps, vm, imsg->hdr.peerid,
			    vmc.vmc_owner.uid);
			log_debug("%s: sending fd to vmm", __func__);
			proc_compose_imsg(ps, PROC_VMM, -1,
			    IMSG_VMDOP_RECEIVE_VM_END, vm->vm_vmid, imsg->fd,
			    NULL, 0);
		}
		break;
	case IMSG_VMDOP_DONE:
		/* Control process finished startup; reset all its sockets. */
		control_reset(&ps->ps_csock);
		TAILQ_FOREACH(rcs, &ps->ps_rcsocks, cs_entry)
			control_reset(rcs);
		cmd = 0;
		break;
	default:
		return (-1);
	}

	/* Send the queued response, if any, back to control. */
	switch (cmd) {
	case 0:
		break;
	case IMSG_VMDOP_START_VM_RESPONSE:
	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
		memset(&vmr, 0, sizeof(vmr));
		vmr.vmr_result = res;
		vmr.vmr_id = id;
		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
		    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
			return (-1);
		break;
	default:
		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
		    imsg->hdr.peerid, -1, &res, sizeof(res)) == -1)
			return (-1);
		break;
	}

	return (0);
}
335 
336 int
337 vmd_dispatch_vmm(int fd, struct privsep_proc *p, struct imsg *imsg)
338 {
339 	struct vmop_result	 vmr;
340 	struct privsep		*ps = p->p_ps;
341 	int			 res = 0;
342 	struct vmd_vm		*vm;
343 	struct vm_create_params	*vcp;
344 	struct vmop_info_result	 vir;
345 
346 	switch (imsg->hdr.type) {
347 	case IMSG_VMDOP_PAUSE_VM_RESPONSE:
348 		IMSG_SIZE_CHECK(imsg, &vmr);
349 		memcpy(&vmr, imsg->data, sizeof(vmr));
350 		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
351 			break;
352 		proc_compose_imsg(ps, PROC_CONTROL, -1,
353 		    imsg->hdr.type, imsg->hdr.peerid, -1,
354 		    imsg->data, sizeof(imsg->data));
355 		log_info("%s: paused vm %d successfully",
356 		    vm->vm_params.vmc_params.vcp_name,
357 		    vm->vm_vmid);
358 		vm->vm_state |= VM_STATE_PAUSED;
359 		break;
360 	case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
361 		IMSG_SIZE_CHECK(imsg, &vmr);
362 		memcpy(&vmr, imsg->data, sizeof(vmr));
363 		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
364 			break;
365 		proc_compose_imsg(ps, PROC_CONTROL, -1,
366 		    imsg->hdr.type, imsg->hdr.peerid, -1,
367 		    imsg->data, sizeof(imsg->data));
368 		log_info("%s: unpaused vm %d successfully.",
369 		    vm->vm_params.vmc_params.vcp_name,
370 		    vm->vm_vmid);
371 		vm->vm_state &= ~VM_STATE_PAUSED;
372 		break;
373 	case IMSG_VMDOP_START_VM_RESPONSE:
374 		IMSG_SIZE_CHECK(imsg, &vmr);
375 		memcpy(&vmr, imsg->data, sizeof(vmr));
376 		if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL)
377 			break;
378 		vm->vm_pid = vmr.vmr_pid;
379 		vcp = &vm->vm_params.vmc_params;
380 		vcp->vcp_id = vmr.vmr_id;
381 
382 		/*
383 		 * If the peerid is not -1, forward the response back to the
384 		 * the control socket.  If it is -1, the request originated
385 		 * from the parent, not the control socket.
386 		 */
387 		if (vm->vm_peerid != (uint32_t)-1) {
388 			(void)strlcpy(vmr.vmr_ttyname, vm->vm_ttyname,
389 			    sizeof(vmr.vmr_ttyname));
390 			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
391 			    imsg->hdr.type, vm->vm_peerid, -1,
392 			    &vmr, sizeof(vmr)) == -1) {
393 				errno = vmr.vmr_result;
394 				log_warn("%s: failed to foward vm result",
395 				    vcp->vcp_name);
396 				vm_remove(vm, __func__);
397 				return (-1);
398 			}
399 		}
400 
401 		if (vmr.vmr_result) {
402 			errno = vmr.vmr_result;
403 			log_warn("%s: failed to start vm", vcp->vcp_name);
404 			vm_remove(vm, __func__);
405 			break;
406 		}
407 
408 		/* Now configure all the interfaces */
409 		if (vm_priv_ifconfig(ps, vm) == -1) {
410 			log_warn("%s: failed to configure vm", vcp->vcp_name);
411 			vm_remove(vm, __func__);
412 			break;
413 		}
414 
415 		log_info("%s: started vm %d successfully, tty %s",
416 		    vcp->vcp_name, vm->vm_vmid, vm->vm_ttyname);
417 		break;
418 	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
419 		IMSG_SIZE_CHECK(imsg, &vmr);
420 		memcpy(&vmr, imsg->data, sizeof(vmr));
421 
422 		if (vmr.vmr_result) {
423 			DPRINTF("%s: forwarding TERMINATE VM for vm id %d",
424 			    __func__, vmr.vmr_id);
425 			proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
426 		} else {
427 			if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
428 				break;
429 			/* Mark VM as shutting down */
430 			vm->vm_state |= VM_STATE_SHUTDOWN;
431 		}
432 		break;
433 	case IMSG_VMDOP_SEND_VM_RESPONSE:
434 		IMSG_SIZE_CHECK(imsg, &vmr);
435 		memcpy(&vmr, imsg->data, sizeof(vmr));
436 		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
437 			break;
438 		if (!vmr.vmr_result) {
439 			log_info("%s: sent vm %d successfully.",
440 			    vm->vm_params.vmc_params.vcp_name,
441 			    vm->vm_vmid);
442 			if (vm->vm_from_config)
443 				vm_stop(vm, 0, __func__);
444 			else
445 				vm_remove(vm, __func__);
446 		}
447 
448 		/* Send a response if a control client is waiting for it */
449 		if (imsg->hdr.peerid != (uint32_t)-1) {
450 			/* the error is meaningless for deferred responses */
451 			vmr.vmr_result = 0;
452 
453 			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
454 			    IMSG_VMDOP_SEND_VM_RESPONSE,
455 			    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
456 				return (-1);
457 		}
458 		break;
459 	case IMSG_VMDOP_TERMINATE_VM_EVENT:
460 		IMSG_SIZE_CHECK(imsg, &vmr);
461 		memcpy(&vmr, imsg->data, sizeof(vmr));
462 		DPRINTF("%s: handling TERMINATE_EVENT for vm id %d ret %d",
463 		    __func__, vmr.vmr_id, vmr.vmr_result);
464 		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) {
465 			log_debug("%s: vm %d is no longer available",
466 			    __func__, vmr.vmr_id);
467 			break;
468 		}
469 		if (vmr.vmr_result != EAGAIN ||
470 		    vm->vm_params.vmc_bootdevice) {
471 			if (vm->vm_from_config)
472 				vm_stop(vm, 0, __func__);
473 			else
474 				vm_remove(vm, __func__);
475 		} else {
476 			/* Stop VM instance but keep the tty open */
477 			vm_stop(vm, 1, __func__);
478 			config_setvm(ps, vm, (uint32_t)-1, vm->vm_uid);
479 		}
480 
481 		/* The error is meaningless for deferred responses */
482 		vmr.vmr_result = 0;
483 
484 		if (proc_compose_imsg(ps, PROC_CONTROL, -1,
485 			IMSG_VMDOP_TERMINATE_VM_EVENT,
486 			imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
487 			return (-1);
488 		break;
489 	case IMSG_VMDOP_GET_INFO_VM_DATA:
490 		IMSG_SIZE_CHECK(imsg, &vir);
491 		memcpy(&vir, imsg->data, sizeof(vir));
492 		if ((vm = vm_getbyvmid(vir.vir_info.vir_id)) != NULL) {
493 			memset(vir.vir_ttyname, 0, sizeof(vir.vir_ttyname));
494 			if (vm->vm_ttyname != NULL)
495 				strlcpy(vir.vir_ttyname, vm->vm_ttyname,
496 				    sizeof(vir.vir_ttyname));
497 			log_debug("%s: running vm: %d, vm_state: 0x%x",
498 			    __func__, vm->vm_vmid, vm->vm_state);
499 			vir.vir_state = vm->vm_state;
500 			/* get the user id who started the vm */
501 			vir.vir_uid = vm->vm_uid;
502 			vir.vir_gid = vm->vm_params.vmc_owner.gid;
503 		}
504 		if (proc_compose_imsg(ps, PROC_CONTROL, -1, imsg->hdr.type,
505 		    imsg->hdr.peerid, -1, &vir, sizeof(vir)) == -1) {
506 			log_debug("%s: GET_INFO_VM failed for vm %d, removing",
507 			    __func__, vm->vm_vmid);
508 			vm_remove(vm, __func__);
509 			return (-1);
510 		}
511 		break;
512 	case IMSG_VMDOP_GET_INFO_VM_END_DATA:
513 		/*
514 		 * PROC_VMM has responded with the *running* VMs, now we
515 		 * append the others. These use the special value 0 for their
516 		 * kernel id to indicate that they are not running.
517 		 */
518 		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
519 			if (!(vm->vm_state & VM_STATE_RUNNING)) {
520 				memset(&vir, 0, sizeof(vir));
521 				vir.vir_info.vir_id = vm->vm_vmid;
522 				strlcpy(vir.vir_info.vir_name,
523 				    vm->vm_params.vmc_params.vcp_name,
524 				    VMM_MAX_NAME_LEN);
525 				vir.vir_info.vir_memory_size =
526 				    vm->vm_params.vmc_params.
527 				    vcp_memranges[0].vmr_size;
528 				vir.vir_info.vir_ncpus =
529 				    vm->vm_params.vmc_params.vcp_ncpus;
530 				/* get the configured user id for this vm */
531 				vir.vir_uid = vm->vm_params.vmc_owner.uid;
532 				vir.vir_gid = vm->vm_params.vmc_owner.gid;
533 				log_debug("%s: vm: %d, vm_state: 0x%x",
534 				    __func__, vm->vm_vmid, vm->vm_state);
535 				vir.vir_state = vm->vm_state;
536 				if (proc_compose_imsg(ps, PROC_CONTROL, -1,
537 				    IMSG_VMDOP_GET_INFO_VM_DATA,
538 				    imsg->hdr.peerid, -1, &vir,
539 				    sizeof(vir)) == -1) {
540 					log_debug("%s: GET_INFO_VM_END failed",
541 					    __func__);
542 					vm_remove(vm, __func__);
543 					return (-1);
544 				}
545 			}
546 		}
547 		IMSG_SIZE_CHECK(imsg, &res);
548 		proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
549 		break;
550 	default:
551 		return (-1);
552 	}
553 
554 	return (0);
555 }
556 
557 int
558 vmd_dispatch_priv(int fd, struct privsep_proc *p, struct imsg *imsg)
559 {
560 	struct vmop_addr_result	 var;
561 
562 	switch (imsg->hdr.type) {
563 	case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
564 		IMSG_SIZE_CHECK(imsg, &var);
565 		memcpy(&var, imsg->data, sizeof(var));
566 		proc_forward_imsg(p->p_ps, imsg, PROC_VMM, -1);
567 		break;
568 	default:
569 		return (-1);
570 	}
571 
572 	return (0);
573 }
574 
575 int
576 vmd_check_vmh(struct vm_dump_header *vmh)
577 {
578 	int i;
579 	unsigned int code, leaf;
580 	unsigned int a, b, c, d;
581 
582 	if (strncmp(vmh->vmh_signature, VM_DUMP_SIGNATURE, strlen(VM_DUMP_SIGNATURE)) != 0) {
583 		log_warnx("%s: incompatible dump signature", __func__);
584 		return (-1);
585 	}
586 
587 	if (vmh->vmh_version != VM_DUMP_VERSION) {
588 		log_warnx("%s: incompatible dump version", __func__);
589 		return (-1);
590 	}
591 
592 	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
593 		code = vmh->vmh_cpuids[i].code;
594 		leaf = vmh->vmh_cpuids[i].leaf;
595 		if (leaf != 0x00) {
596 			log_debug("%s: invalid leaf 0x%x for code 0x%x",
597 			    __func__, leaf, code);
598 			return (-1);
599 		}
600 
601 		switch (code) {
602 		case 0x00:
603 			CPUID_LEAF(code, leaf, a, b, c, d);
604 			if (vmh->vmh_cpuids[i].a > a) {
605 				log_debug("%s: incompatible cpuid level",
606 				    __func__);
607 				return (-1);
608 			}
609 			if (!(vmh->vmh_cpuids[i].b == b &&
610 			    vmh->vmh_cpuids[i].c == c &&
611 			    vmh->vmh_cpuids[i].d == d)) {
612 				log_debug("%s: incompatible cpu brand",
613 				    __func__);
614 				return (-1);
615 			}
616 			break;
617 
618 		case 0x01:
619 			CPUID_LEAF(code, leaf, a, b, c, d);
620 			if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) !=
621 			    (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) {
622 				log_debug("%s: incompatible cpu features "
623 				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
624 				    code, leaf);
625 				return (-1);
626 			}
627 			if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) !=
628 			    (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) {
629 				log_debug("%s: incompatible cpu features "
630 				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
631 				    code, leaf);
632 				return (-1);
633 			}
634 			break;
635 
636 		case 0x07:
637 			CPUID_LEAF(code, leaf, a, b, c, d);
638 			if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) !=
639 			    (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) {
640 				log_debug("%s: incompatible cpu features "
641 				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
642 				    code, leaf);
643 				return (-1);
644 			}
645 			if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) !=
646 			    (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) {
647 				log_debug("%s: incompatible cpu features "
648 				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
649 				    code, leaf);
650 				return (-1);
651 			}
652 			break;
653 
654 		case 0x0d:
655 			CPUID_LEAF(code, leaf, a, b, c, d);
656 			if (vmh->vmh_cpuids[i].b > b) {
657 				log_debug("%s: incompatible cpu: insufficient "
658 				    "max save area for enabled XCR0 features",
659 				    __func__);
660 				return (-1);
661 			}
662 			if (vmh->vmh_cpuids[i].c > c) {
663 				log_debug("%s: incompatible cpu: insufficient "
664 				    "max save area for supported XCR0 features",
665 				    __func__);
666 				return (-1);
667 			}
668 			break;
669 
670 		case 0x80000001:
671 			CPUID_LEAF(code, leaf, a, b, c, d);
672 			if ((vmh->vmh_cpuids[i].a & a) !=
673 			    vmh->vmh_cpuids[i].a) {
674 				log_debug("%s: incompatible cpu features "
675 				    "code: 0x%x leaf: 0x%x  reg: a", __func__,
676 				    code, leaf);
677 				return (-1);
678 			}
679 			if ((vmh->vmh_cpuids[i].c & c) !=
680 			    vmh->vmh_cpuids[i].c) {
681 				log_debug("%s: incompatible cpu features "
682 				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
683 				    code, leaf);
684 				return (-1);
685 			}
686 			if ((vmh->vmh_cpuids[i].d & d) !=
687 			    vmh->vmh_cpuids[i].d) {
688 				log_debug("%s: incompatible cpu features "
689 				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
690 				    code, leaf);
691 				return (-1);
692 			}
693 			break;
694 
695 		default:
696 			log_debug("%s: unknown code 0x%x", __func__, code);
697 			return (-1);
698 		}
699 	}
700 
701 	return (0);
702 }
703 
704 void
705 vmd_sighdlr(int sig, short event, void *arg)
706 {
707 	if (privsep_process != PROC_PARENT)
708 		return;
709 	log_debug("%s: handling signal", __func__);
710 
711 	switch (sig) {
712 	case SIGHUP:
713 		log_info("%s: reload requested with SIGHUP", __func__);
714 
715 		/*
716 		 * This is safe because libevent uses async signal handlers
717 		 * that run in the event loop and not in signal context.
718 		 */
719 		(void)vmd_reload(0, NULL);
720 		break;
721 	case SIGPIPE:
722 		log_info("%s: ignoring SIGPIPE", __func__);
723 		break;
724 	case SIGUSR1:
725 		log_info("%s: ignoring SIGUSR1", __func__);
726 		break;
727 	case SIGTERM:
728 	case SIGINT:
729 		vmd_shutdown();
730 		break;
731 	default:
732 		fatalx("unexpected signal");
733 	}
734 }
735 
736 __dead void
737 usage(void)
738 {
739 	extern char *__progname;
740 	fprintf(stderr, "usage: %s [-dnv] [-D macro=value] [-f file]\n",
741 	    __progname);
742 	exit(1);
743 }
744 
/*
 * vmd entry point.  Parses the command line, performs global setup
 * (logging, /dev/vmm, control socket), forks the privsep children via
 * proc_init() and then runs the parent's libevent loop.  The same
 * binary is re-executed for child processes with -P/-I selecting the
 * process name and instance.
 */
int
main(int argc, char **argv)
{
	struct privsep		*ps;
	int			 ch;
	const char		*conffile = VMD_CONF;
	enum privsep_procid	 proc_id = PROC_PARENT;
	int			 proc_instance = 0;
	const char		*errp, *title = NULL;
	int			 argc0 = argc;	/* saved for proc_init() re-exec */

	log_init(0, LOG_DAEMON);

	if ((env = calloc(1, sizeof(*env))) == NULL)
		fatal("calloc: env");

	while ((ch = getopt(argc, argv, "D:P:I:df:vn")) != -1) {
		switch (ch) {
		case 'D':
			/* define a config-file macro on the command line */
			if (cmdline_symset(optarg) < 0)
				log_warnx("could not parse macro definition %s",
				    optarg);
			break;
		case 'd':
			env->vmd_debug = 2;
			break;
		case 'f':
			conffile = optarg;
			break;
		case 'v':
			env->vmd_verbose++;
			break;
		case 'n':
			/* configtest mode: parse config and exit */
			env->vmd_noaction = 1;
			break;
		case 'P':
			/* run as the named privsep child process */
			title = optarg;
			proc_id = proc_getid(procs, nitems(procs), title);
			if (proc_id == PROC_MAX)
				fatalx("invalid process name");
			break;
		case 'I':
			proc_instance = strtonum(optarg, 0,
			    PROC_MAX_INSTANCES, &errp);
			if (errp)
				fatalx("invalid process instance");
			break;
		default:
			usage();
		}
	}

	argc -= optind;
	if (argc > 0)
		usage();

	if (env->vmd_noaction && !env->vmd_debug)
		env->vmd_debug = 1;

	log_init(env->vmd_debug, LOG_DAEMON);
	log_setverbose(env->vmd_verbose);

	/* check for root privileges */
	if (env->vmd_noaction == 0) {
		if (geteuid())
			fatalx("need root privileges");
	}

	ps = &env->vmd_ps;
	ps->ps_env = env;
	env->vmd_fd = -1;

	if (config_init(env) == -1)
		fatal("failed to initialize configuration");

	if ((ps->ps_pw = getpwnam(VMD_USER)) == NULL)
		fatal("unknown user %s", VMD_USER);

	/* First proc runs as root without pledge but in default chroot */
	proc_priv->p_pw = &proc_privpw; /* initialized to all 0 */
	proc_priv->p_chroot = ps->ps_pw->pw_dir; /* from VMD_USER */

	/* Open /dev/vmm */
	if (env->vmd_noaction == 0) {
		env->vmd_fd = open(VMM_NODE, O_RDWR);
		if (env->vmd_fd == -1)
			fatal("%s", VMM_NODE);
	}

	/* Configure the control socket */
	ps->ps_csock.cs_name = SOCKET_NAME;
	TAILQ_INIT(&ps->ps_rcsocks);

	/* Configuration will be parsed after forking the children */
	env->vmd_conffile = conffile;

	if (env->vmd_noaction)
		ps->ps_noaction = 1;
	ps->ps_instance = proc_instance;
	if (title != NULL)
		ps->ps_title[proc_id] = title;

	/* only the parent returns */
	proc_init(ps, procs, nitems(procs), env->vmd_debug, argc0, argv,
	    proc_id);

	log_procinit("parent");
	if (!env->vmd_debug && daemon(0, 0) == -1)
		fatal("can't daemonize");

	if (ps->ps_noaction == 0)
		log_info("startup");

	/* Set up the parent's event loop and signal handlers. */
	event_init();

	signal_set(&ps->ps_evsigint, SIGINT, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigterm, SIGTERM, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsighup, SIGHUP, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigpipe, SIGPIPE, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigusr1, SIGUSR1, vmd_sighdlr, ps);

	signal_add(&ps->ps_evsigint, NULL);
	signal_add(&ps->ps_evsigterm, NULL);
	signal_add(&ps->ps_evsighup, NULL);
	signal_add(&ps->ps_evsigpipe, NULL);
	signal_add(&ps->ps_evsigusr1, NULL);

	/* Connect the imsg channels between the child processes. */
	if (!env->vmd_noaction)
		proc_connect(ps);

	if (vmd_configure() == -1)
		fatalx("configuration failed");

	event_dispatch();

	log_debug("parent exiting");

	return (0);
}
884 
885 void
886 start_vm_batch(int fd, short type, void *args)
887 {
888 	int		i = 0;
889 	struct vmd_vm	*vm;
890 
891 	log_debug("%s: starting batch of %d vms", __func__,
892 	    env->vmd_cfg.parallelism);
893 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
894 		if (!(vm->vm_state & VM_STATE_WAITING)) {
895 			log_debug("%s: not starting vm %s (disabled)",
896 			    __func__,
897 			    vm->vm_params.vmc_params.vcp_name);
898 			continue;
899 		}
900 		i++;
901 		if (i > env->vmd_cfg.parallelism) {
902 			evtimer_add(&staggered_start_timer,
903 			    &env->vmd_cfg.delay);
904 			break;
905 		}
906 		vm->vm_state &= ~VM_STATE_WAITING;
907 		config_setvm(&env->vmd_ps, vm, -1, vm->vm_params.vmc_owner.uid);
908 	}
909 	log_debug("%s: done starting vms", __func__);
910 }
911 
/*
 * Parent-side configuration after all children have been forked:
 * open the pty master device, drop privileges via pledge(2), parse the
 * configuration file, push the global config to the children, bring up
 * the virtual switches and kick off the staggered VM start.
 * Returns 0 on success, -1 on failure (or exits on parse/noaction).
 */
int
vmd_configure(void)
{
	int			ncpus;
	struct vmd_switch	*vsw;
	int ncpu_mib[] = {CTL_HW, HW_NCPUONLINE};
	size_t ncpus_sz = sizeof(ncpus);

	/* Must be opened before pledge: no "ptm" promise exists. */
	if ((env->vmd_ptmfd = open(PATH_PTMDEV, O_RDWR|O_CLOEXEC)) == -1)
		fatal("open %s", PATH_PTMDEV);

	/*
	 * pledge in the parent process:
	 * stdio - for malloc and basic I/O including events.
	 * rpath - for reload to open and read the configuration files.
	 * wpath - for opening disk images and tap devices.
	 * tty - for openpty and TIOCUCNTL.
	 * proc - run kill to terminate its children safely.
	 * sendfd - for disks, interfaces and other fds.
	 * recvfd - for send and receive.
	 * getpw - lookup user or group id by name.
	 * chown, fattr - change tty ownership
	 * flock - locking disk files
	 */
	if (pledge("stdio rpath wpath proc tty recvfd sendfd getpw"
	    " chown fattr flock", NULL) == -1)
		fatal("pledge");

	if (parse_config(env->vmd_conffile) == -1) {
		/* Parse failure is fatal at startup: kill children, exit. */
		proc_kill(&env->vmd_ps);
		exit(1);
	}

	if (env->vmd_noaction) {
		/* Configtest mode (-n): report and exit without starting. */
		fprintf(stderr, "configuration OK\n");
		proc_kill(&env->vmd_ps);
		exit(0);
	}

	/* Send shared global configuration to all children */
	if (config_setconfig(env) == -1)
		return (-1);

	/* Create the configured virtual switches that are not up yet. */
	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
		if (vsw->sw_running)
			continue;
		if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
			log_warn("%s: failed to create switch %s",
			    __func__, vsw->sw_name);
			switch_remove(vsw);
			return (-1);
		}
	}

	/* Default staggered start: one batch per online CPU. */
	if (!(env->vmd_cfg.cfg_flags & VMD_CFG_STAGGERED_START)) {
		env->vmd_cfg.delay.tv_sec = VMD_DEFAULT_STAGGERED_START_DELAY;
		if (sysctl(ncpu_mib, nitems(ncpu_mib), &ncpus, &ncpus_sz, NULL, 0) == -1)
			ncpus = 1;
		env->vmd_cfg.parallelism = ncpus;
		log_debug("%s: setting staggered start configuration to "
		    "parallelism: %d and delay: %lld",
		    __func__, ncpus, (long long) env->vmd_cfg.delay.tv_sec);
	}

	log_debug("%s: starting vms in staggered fashion", __func__);
	evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
	/* start first batch */
	start_vm_batch(0, 0, NULL);

	return (0);
}
983 
984 int
985 vmd_reload(unsigned int reset, const char *filename)
986 {
987 	struct vmd_vm		*vm, *next_vm;
988 	struct vmd_switch	*vsw;
989 	int			 reload = 0;
990 
991 	/* Switch back to the default config file */
992 	if (filename == NULL || *filename == '\0') {
993 		filename = env->vmd_conffile;
994 		reload = 1;
995 	}
996 
997 	log_debug("%s: level %d config file %s", __func__, reset, filename);
998 
999 	if (reset) {
1000 		/* Purge the configuration */
1001 		config_purge(env, reset);
1002 		config_setreset(env, reset);
1003 	} else {
1004 		/*
1005 		 * Load or reload the configuration.
1006 		 *
1007 		 * Reloading removes all non-running VMs before processing the
1008 		 * config file, whereas loading only adds to the existing list
1009 		 * of VMs.
1010 		 */
1011 
1012 		if (reload) {
1013 			TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry,
1014 			    next_vm) {
1015 				if (!(vm->vm_state & VM_STATE_RUNNING)) {
1016 					DPRINTF("%s: calling vm_remove",
1017 					    __func__);
1018 					vm_remove(vm, __func__);
1019 				}
1020 			}
1021 		}
1022 
1023 		if (parse_config(filename) == -1) {
1024 			log_debug("%s: failed to load config file %s",
1025 			    __func__, filename);
1026 			return (-1);
1027 		}
1028 
1029 		if (reload) {
1030 			/* Update shared global configuration in all children */
1031 			if (config_setconfig(env) == -1)
1032 				return (-1);
1033 		}
1034 
1035 		TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1036 			if (vsw->sw_running)
1037 				continue;
1038 			if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
1039 				log_warn("%s: failed to create switch %s",
1040 				    __func__, vsw->sw_name);
1041 				switch_remove(vsw);
1042 				return (-1);
1043 			}
1044 		}
1045 
1046 		log_debug("%s: starting vms in staggered fashion", __func__);
1047 		evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
1048 		/* start first batch */
1049 		start_vm_batch(0, 0, NULL);
1050 
1051 		}
1052 
1053 	return (0);
1054 }
1055 
1056 void
1057 vmd_shutdown(void)
1058 {
1059 	struct vmd_vm *vm, *vm_next;
1060 
1061 	log_debug("%s: performing shutdown", __func__);
1062 
1063 	TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) {
1064 		vm_remove(vm, __func__);
1065 	}
1066 
1067 	proc_kill(&env->vmd_ps);
1068 	free(env);
1069 
1070 	log_warnx("parent terminating");
1071 	exit(0);
1072 }
1073 
1074 struct vmd_vm *
1075 vm_getbyvmid(uint32_t vmid)
1076 {
1077 	struct vmd_vm	*vm;
1078 
1079 	if (vmid == 0)
1080 		return (NULL);
1081 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1082 		if (vm->vm_vmid == vmid)
1083 			return (vm);
1084 	}
1085 
1086 	return (NULL);
1087 }
1088 
1089 struct vmd_vm *
1090 vm_getbyid(uint32_t id)
1091 {
1092 	struct vmd_vm	*vm;
1093 
1094 	if (id == 0)
1095 		return (NULL);
1096 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1097 		if (vm->vm_params.vmc_params.vcp_id == id)
1098 			return (vm);
1099 	}
1100 
1101 	return (NULL);
1102 }
1103 
1104 uint32_t
1105 vm_id2vmid(uint32_t id, struct vmd_vm *vm)
1106 {
1107 	if (vm == NULL && (vm = vm_getbyid(id)) == NULL)
1108 		return (0);
1109 	DPRINTF("%s: vmm id %u is vmid %u", __func__,
1110 	    id, vm->vm_vmid);
1111 	return (vm->vm_vmid);
1112 }
1113 
1114 uint32_t
1115 vm_vmid2id(uint32_t vmid, struct vmd_vm *vm)
1116 {
1117 	if (vm == NULL && (vm = vm_getbyvmid(vmid)) == NULL)
1118 		return (0);
1119 	DPRINTF("%s: vmid %u is vmm id %u", __func__,
1120 	    vmid, vm->vm_params.vmc_params.vcp_id);
1121 	return (vm->vm_params.vmc_params.vcp_id);
1122 }
1123 
1124 struct vmd_vm *
1125 vm_getbyname(const char *name)
1126 {
1127 	struct vmd_vm	*vm;
1128 
1129 	if (name == NULL)
1130 		return (NULL);
1131 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1132 		if (strcmp(vm->vm_params.vmc_params.vcp_name, name) == 0)
1133 			return (vm);
1134 	}
1135 
1136 	return (NULL);
1137 }
1138 
1139 struct vmd_vm *
1140 vm_getbypid(pid_t pid)
1141 {
1142 	struct vmd_vm	*vm;
1143 
1144 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1145 		if (vm->vm_pid == pid)
1146 			return (vm);
1147 	}
1148 
1149 	return (NULL);
1150 }
1151 
/*
 * vm_stop
 *
 * Releases all runtime resources held by a vm: its imsg channel, disk,
 * nic, kernel and cdrom file descriptors, and (optionally) its tty.
 * The vm structure itself stays allocated and listed; use vm_remove()
 * to drop it entirely.
 *
 * Parameters:
 *  vm: the vm to stop (NULL is a no-op)
 *  keeptty: when non-zero, leave the tty open (e.g. across a restart)
 *  caller: name of the calling function, for the debug log
 */
void
vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
{
	struct privsep	*ps = &env->vmd_ps;
	unsigned int	 i, j;

	if (vm == NULL)
		return;

	log_debug("%s: %s %s stopping vm %d%s",
	    __func__, ps->ps_title[privsep_process], caller,
	    vm->vm_vmid, keeptty ? ", keeping tty open" : "");

	vm->vm_state &= ~(VM_STATE_RUNNING | VM_STATE_SHUTDOWN);

	/* Return the vm's resources to the owner's accounting and drop
	 * one reference on the tracked user. */
	user_inc(&vm->vm_params.vmc_params, vm->vm_user, 0);
	user_put(vm->vm_user);

	/* Close the imsg channel to the vm process, if one was open. */
	if (vm->vm_iev.ibuf.fd != -1) {
		event_del(&vm->vm_iev.ev);
		close(vm->vm_iev.ibuf.fd);
	}
	/* Close every disk image fd, including qcow2 base images. */
	for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) {
		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
			if (vm->vm_disks[i][j] != -1) {
				close(vm->vm_disks[i][j]);
				vm->vm_disks[i][j] = -1;
			}
		}
	}
	/* Close interface fds and free the per-interface name strings. */
	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) {
		if (vm->vm_ifs[i].vif_fd != -1) {
			close(vm->vm_ifs[i].vif_fd);
			vm->vm_ifs[i].vif_fd = -1;
		}
		free(vm->vm_ifs[i].vif_name);
		free(vm->vm_ifs[i].vif_switch);
		free(vm->vm_ifs[i].vif_group);
		vm->vm_ifs[i].vif_name = NULL;
		vm->vm_ifs[i].vif_switch = NULL;
		vm->vm_ifs[i].vif_group = NULL;
	}
	if (vm->vm_kernel != -1) {
		close(vm->vm_kernel);
		vm->vm_kernel = -1;
	}
	if (vm->vm_cdrom != -1) {
		close(vm->vm_cdrom);
		vm->vm_cdrom = -1;
	}
	if (!keeptty) {
		vm_closetty(vm);
		vm->vm_uid = 0;
	}
}
1207 
1208 void
1209 vm_remove(struct vmd_vm *vm, const char *caller)
1210 {
1211 	struct privsep	*ps = &env->vmd_ps;
1212 
1213 	if (vm == NULL)
1214 		return;
1215 
1216 	log_debug("%s: %s %s removing vm %d from running config",
1217 	    __func__, ps->ps_title[privsep_process], caller,
1218 	    vm->vm_vmid);
1219 
1220 	TAILQ_REMOVE(env->vmd_vms, vm, vm_entry);
1221 
1222 	user_put(vm->vm_user);
1223 	vm_stop(vm, 0, caller);
1224 	free(vm);
1225 }
1226 
1227 int
1228 vm_claimid(const char *name, int uid, uint32_t *id)
1229 {
1230 	struct name2id *n2i = NULL;
1231 
1232 	TAILQ_FOREACH(n2i, env->vmd_known, entry)
1233 		if (strcmp(n2i->name, name) == 0 && n2i->uid == uid)
1234 			goto out;
1235 
1236 	if (++env->vmd_nvm == 0) {
1237 		log_warnx("too many vms");
1238 		return -1;
1239 	}
1240 	if ((n2i = calloc(1, sizeof(struct name2id))) == NULL) {
1241 		log_warnx("could not alloc vm name");
1242 		return -1;
1243 	}
1244 	n2i->id = env->vmd_nvm;
1245 	n2i->uid = uid;
1246 	if (strlcpy(n2i->name, name, sizeof(n2i->name)) >= sizeof(n2i->name)) {
1247 		log_warnx("vm name too long");
1248 		free(n2i);
1249 		return -1;
1250 	}
1251 	TAILQ_INSERT_TAIL(env->vmd_known, n2i, entry);
1252 
1253 out:
1254 	*id = n2i->id;
1255 	return 0;
1256 }
1257 
1258 int
1259 vm_register(struct privsep *ps, struct vmop_create_params *vmc,
1260     struct vmd_vm **ret_vm, uint32_t id, uid_t uid)
1261 {
1262 	struct vmd_vm		*vm = NULL, *vm_parent = NULL;
1263 	struct vm_create_params	*vcp = &vmc->vmc_params;
1264 	struct vmop_owner	*vmo = NULL;
1265 	struct vmd_user		*usr = NULL;
1266 	uint32_t		 nid, rng;
1267 	unsigned int		 i, j;
1268 	struct vmd_switch	*sw;
1269 	char			*s;
1270 	int			 ret = 0;
1271 
1272 	/* Check if this is an instance of another VM */
1273 	if ((ret = vm_instance(ps, &vm_parent, vmc, uid)) != 0) {
1274 		errno = ret; /* XXX might set invalid errno */
1275 		return (-1);
1276 	}
1277 
1278 	errno = 0;
1279 	*ret_vm = NULL;
1280 
1281 	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
1282 	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
1283 		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
1284 		    uid) != 0) {
1285 			errno = EPERM;
1286 			goto fail;
1287 		}
1288 		*ret_vm = vm;
1289 		errno = EALREADY;
1290 		goto fail;
1291 	}
1292 
1293 	if (vm_parent != NULL)
1294 		vmo = &vm_parent->vm_params.vmc_insowner;
1295 
1296 	/* non-root users can only start existing VMs or instances */
1297 	if (vm_checkperm(NULL, vmo, uid) != 0) {
1298 		log_warnx("permission denied");
1299 		errno = EPERM;
1300 		goto fail;
1301 	}
1302 	if (vmc->vmc_flags == 0) {
1303 		log_warnx("invalid configuration, no devices");
1304 		errno = VMD_DISK_MISSING;
1305 		goto fail;
1306 	}
1307 	if (vcp->vcp_ncpus == 0)
1308 		vcp->vcp_ncpus = 1;
1309 	if (vcp->vcp_memranges[0].vmr_size == 0)
1310 		vcp->vcp_memranges[0].vmr_size = VM_DEFAULT_MEMORY;
1311 	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) {
1312 		log_warnx("invalid number of CPUs");
1313 		goto fail;
1314 	} else if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) {
1315 		log_warnx("invalid number of disks");
1316 		goto fail;
1317 	} else if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) {
1318 		log_warnx("invalid number of interfaces");
1319 		goto fail;
1320 	} else if (strlen(vcp->vcp_kernel) == 0 &&
1321 	    vcp->vcp_ndisks == 0 && strlen(vcp->vcp_cdrom) == 0) {
1322 		log_warnx("no kernel or disk/cdrom specified");
1323 		goto fail;
1324 	} else if (strlen(vcp->vcp_name) == 0) {
1325 		log_warnx("invalid VM name");
1326 		goto fail;
1327 	} else if (*vcp->vcp_name == '-' || *vcp->vcp_name == '.' ||
1328 	    *vcp->vcp_name == '_') {
1329 		log_warnx("invalid VM name");
1330 		goto fail;
1331 	} else {
1332 		for (s = vcp->vcp_name; *s != '\0'; ++s) {
1333 			if (!(isalnum(*s) || *s == '.' || *s == '-' ||
1334 			    *s == '_')) {
1335 				log_warnx("invalid VM name");
1336 				goto fail;
1337 			}
1338 		}
1339 	}
1340 
1341 	/* track active users */
1342 	if (uid != 0 && env->vmd_users != NULL &&
1343 	    (usr = user_get(uid)) == NULL) {
1344 		log_warnx("could not add user");
1345 		goto fail;
1346 	}
1347 
1348 	if ((vm = calloc(1, sizeof(*vm))) == NULL)
1349 		goto fail;
1350 
1351 	memcpy(&vm->vm_params, vmc, sizeof(vm->vm_params));
1352 	vmc = &vm->vm_params;
1353 	vcp = &vmc->vmc_params;
1354 	vm->vm_pid = -1;
1355 	vm->vm_tty = -1;
1356 	vm->vm_receive_fd = -1;
1357 	vm->vm_state &= ~VM_STATE_PAUSED;
1358 	vm->vm_user = usr;
1359 
1360 	for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
1361 		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
1362 			vm->vm_disks[i][j] = -1;
1363 	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
1364 		vm->vm_ifs[i].vif_fd = -1;
1365 	for (i = 0; i < vcp->vcp_nnics; i++) {
1366 		if ((sw = switch_getbyname(vmc->vmc_ifswitch[i])) != NULL) {
1367 			/* inherit per-interface flags from the switch */
1368 			vmc->vmc_ifflags[i] |= (sw->sw_flags & VMIFF_OPTMASK);
1369 		}
1370 
1371 		/*
1372 		 * If the MAC address is zero, always randomize it in vmd(8)
1373 		 * because we cannot rely on the guest OS to do the right
1374 		 * thing like OpenBSD does.  Based on ether_fakeaddr()
1375 		 * from the kernel, incremented by one to differentiate
1376 		 * the source.
1377 		 */
1378 		if (memcmp(zero_mac, &vcp->vcp_macs[i], ETHER_ADDR_LEN) == 0) {
1379 			rng = arc4random();
1380 			vcp->vcp_macs[i][0] = 0xfe;
1381 			vcp->vcp_macs[i][1] = 0xe1;
1382 			vcp->vcp_macs[i][2] = 0xba + 1;
1383 			vcp->vcp_macs[i][3] = 0xd0 | ((i + 1) & 0xf);
1384 			vcp->vcp_macs[i][4] = rng;
1385 			vcp->vcp_macs[i][5] = rng >> 8;
1386 		}
1387 	}
1388 	vm->vm_kernel = -1;
1389 	vm->vm_cdrom = -1;
1390 	vm->vm_iev.ibuf.fd = -1;
1391 
1392 	/*
1393 	 * Assign a new internal Id if not specified and we succeed in
1394 	 * claiming a new Id.
1395 	 */
1396 	if (id != 0)
1397 		vm->vm_vmid = id;
1398 	else if (vm_claimid(vcp->vcp_name, uid, &nid) == -1)
1399 		goto fail;
1400 	else
1401 		vm->vm_vmid = nid;
1402 
1403 	log_debug("%s: registering vm %d", __func__, vm->vm_vmid);
1404 	TAILQ_INSERT_TAIL(env->vmd_vms, vm, vm_entry);
1405 
1406 	*ret_vm = vm;
1407 	return (0);
1408  fail:
1409 	if (errno == 0)
1410 		errno = EINVAL;
1411 	return (-1);
1412 }
1413 
1414 int
1415 vm_instance(struct privsep *ps, struct vmd_vm **vm_parent,
1416     struct vmop_create_params *vmc, uid_t uid)
1417 {
1418 	char			*name;
1419 	struct vm_create_params	*vcp = &vmc->vmc_params;
1420 	struct vmop_create_params *vmcp;
1421 	struct vm_create_params	*vcpp;
1422 	struct vmd_vm		*vm = NULL;
1423 	unsigned int		 i, j;
1424 	uint32_t		 id;
1425 
1426 	/* return without error if the parent is NULL (nothing to inherit) */
1427 	if ((vmc->vmc_flags & VMOP_CREATE_INSTANCE) == 0 ||
1428 	    vmc->vmc_instance[0] == '\0')
1429 		return (0);
1430 
1431 	if ((*vm_parent = vm_getbyname(vmc->vmc_instance)) == NULL) {
1432 		return (VMD_PARENT_INVALID);
1433 	}
1434 
1435 	vmcp = &(*vm_parent)->vm_params;
1436 	vcpp = &vmcp->vmc_params;
1437 
1438 	/* Are we allowed to create an instance from this VM? */
1439 	if (vm_checkperm(NULL, &vmcp->vmc_insowner, uid) != 0) {
1440 		log_warnx("vm \"%s\" no permission to create vm instance",
1441 		    vcpp->vcp_name);
1442 		return (ENAMETOOLONG);
1443 	}
1444 
1445 	id = vcp->vcp_id;
1446 	name = vcp->vcp_name;
1447 
1448 	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
1449 	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
1450 		return (EPROCLIM);
1451 	}
1452 
1453 	/* CPU */
1454 	if (vcp->vcp_ncpus == 0)
1455 		vcp->vcp_ncpus = vcpp->vcp_ncpus;
1456 	if (vm_checkinsflag(vmcp, VMOP_CREATE_CPU, uid) != 0 &&
1457 	    vcp->vcp_ncpus != vcpp->vcp_ncpus) {
1458 		log_warnx("vm \"%s\" no permission to set cpus", name);
1459 		return (EPERM);
1460 	}
1461 
1462 	/* memory */
1463 	if (vcp->vcp_memranges[0].vmr_size == 0)
1464 		vcp->vcp_memranges[0].vmr_size =
1465 		    vcpp->vcp_memranges[0].vmr_size;
1466 	if (vm_checkinsflag(vmcp, VMOP_CREATE_MEMORY, uid) != 0 &&
1467 	    vcp->vcp_memranges[0].vmr_size !=
1468 	    vcpp->vcp_memranges[0].vmr_size) {
1469 		log_warnx("vm \"%s\" no permission to set memory", name);
1470 		return (EPERM);
1471 	}
1472 
1473 	/* disks cannot be inherited */
1474 	if (vm_checkinsflag(vmcp, VMOP_CREATE_DISK, uid) != 0 &&
1475 	    vcp->vcp_ndisks) {
1476 		log_warnx("vm \"%s\" no permission to set disks", name);
1477 		return (EPERM);
1478 	}
1479 	for (i = 0; i < vcp->vcp_ndisks; i++) {
1480 		/* Check if this disk is already used in the parent */
1481 		for (j = 0; j < vcpp->vcp_ndisks; j++) {
1482 			if (strcmp(vcp->vcp_disks[i],
1483 			    vcpp->vcp_disks[j]) == 0) {
1484 				log_warnx("vm \"%s\" disk %s cannot be reused",
1485 				    name, vcp->vcp_disks[i]);
1486 				return (EBUSY);
1487 			}
1488 		}
1489 		vmc->vmc_checkaccess |= VMOP_CREATE_DISK;
1490 	}
1491 
1492 	/* interfaces */
1493 	if (vcp->vcp_nnics > 0 &&
1494 	    vm_checkinsflag(vmcp, VMOP_CREATE_NETWORK, uid) != 0 &&
1495 	    vcp->vcp_nnics != vcpp->vcp_nnics) {
1496 		log_warnx("vm \"%s\" no permission to set interfaces", name);
1497 		return (EPERM);
1498 	}
1499 	for (i = 0; i < vcpp->vcp_nnics; i++) {
1500 		/* Interface got overwritten */
1501 		if (i < vcp->vcp_nnics)
1502 			continue;
1503 
1504 		/* Copy interface from parent */
1505 		vmc->vmc_ifflags[i] = vmcp->vmc_ifflags[i];
1506 		(void)strlcpy(vmc->vmc_ifnames[i], vmcp->vmc_ifnames[i],
1507 		    sizeof(vmc->vmc_ifnames[i]));
1508 		(void)strlcpy(vmc->vmc_ifswitch[i], vmcp->vmc_ifswitch[i],
1509 		    sizeof(vmc->vmc_ifswitch[i]));
1510 		(void)strlcpy(vmc->vmc_ifgroup[i], vmcp->vmc_ifgroup[i],
1511 		    sizeof(vmc->vmc_ifgroup[i]));
1512 		memcpy(vcp->vcp_macs[i], vcpp->vcp_macs[i],
1513 		    sizeof(vcp->vcp_macs[i]));
1514 		vmc->vmc_ifrdomain[i] = vmcp->vmc_ifrdomain[i];
1515 		vcp->vcp_nnics++;
1516 	}
1517 	for (i = 0; i < vcp->vcp_nnics; i++) {
1518 		for (j = 0; j < vcpp->vcp_nnics; j++) {
1519 			if (memcmp(zero_mac, vcp->vcp_macs[i],
1520 			    sizeof(vcp->vcp_macs[i])) != 0 &&
1521 			    memcmp(vcpp->vcp_macs[i], vcp->vcp_macs[i],
1522 			    sizeof(vcp->vcp_macs[i])) != 0) {
1523 				log_warnx("vm \"%s\" lladdr cannot be reused",
1524 				    name);
1525 				return (EBUSY);
1526 			}
1527 			if (strlen(vmc->vmc_ifnames[i]) &&
1528 			    strcmp(vmc->vmc_ifnames[i],
1529 			    vmcp->vmc_ifnames[j]) == 0) {
1530 				log_warnx("vm \"%s\" %s cannot be reused",
1531 				    vmc->vmc_ifnames[i], name);
1532 				return (EBUSY);
1533 			}
1534 		}
1535 	}
1536 
1537 	/* kernel */
1538 	if (strlen(vcp->vcp_kernel) > 0) {
1539 		if (vm_checkinsflag(vmcp, VMOP_CREATE_KERNEL, uid) != 0) {
1540 			log_warnx("vm \"%s\" no permission to set boot image",
1541 			    name);
1542 			return (EPERM);
1543 		}
1544 		vmc->vmc_checkaccess |= VMOP_CREATE_KERNEL;
1545 	} else if (strlcpy(vcp->vcp_kernel, vcpp->vcp_kernel,
1546 	    sizeof(vcp->vcp_kernel)) >= sizeof(vcp->vcp_kernel)) {
1547 		log_warnx("vm \"%s\" kernel name too long", name);
1548 		return (EINVAL);
1549 	}
1550 
1551 	/* cdrom */
1552 	if (strlen(vcp->vcp_cdrom) > 0) {
1553 		if (vm_checkinsflag(vmcp, VMOP_CREATE_CDROM, uid) != 0) {
1554 			log_warnx("vm \"%s\" no permission to set cdrom", name);
1555 			return (EPERM);
1556 		}
1557 		vmc->vmc_checkaccess |= VMOP_CREATE_CDROM;
1558 	} else if (strlcpy(vcp->vcp_cdrom, vcpp->vcp_cdrom,
1559 	    sizeof(vcp->vcp_cdrom)) >= sizeof(vcp->vcp_cdrom)) {
1560 		log_warnx("vm \"%s\" cdrom name too long", name);
1561 		return (EINVAL);
1562 	}
1563 
1564 	/* user */
1565 	if (vmc->vmc_owner.uid == 0)
1566 		vmc->vmc_owner.uid = vmcp->vmc_owner.uid;
1567 	else if (vmc->vmc_owner.uid != uid &&
1568 	    vmc->vmc_owner.uid != vmcp->vmc_owner.uid) {
1569 		log_warnx("vm \"%s\" user mismatch", name);
1570 		return (EPERM);
1571 	}
1572 
1573 	/* group */
1574 	if (vmc->vmc_owner.gid == 0)
1575 		vmc->vmc_owner.gid = vmcp->vmc_owner.gid;
1576 	else if (vmc->vmc_owner.gid != vmcp->vmc_owner.gid) {
1577 		log_warnx("vm \"%s\" group mismatch", name);
1578 		return (EPERM);
1579 	}
1580 
1581 	/* child instances */
1582 	if (vmc->vmc_insflags) {
1583 		log_warnx("vm \"%s\" cannot change instance permissions", name);
1584 		return (EPERM);
1585 	}
1586 	if (vmcp->vmc_insflags & VMOP_CREATE_INSTANCE) {
1587 		vmc->vmc_insowner.gid = vmcp->vmc_insowner.gid;
1588 		vmc->vmc_insowner.uid = vmcp->vmc_insowner.gid;
1589 		vmc->vmc_insflags = vmcp->vmc_insflags;
1590 	} else {
1591 		vmc->vmc_insowner.gid = 0;
1592 		vmc->vmc_insowner.uid = 0;
1593 		vmc->vmc_insflags = 0;
1594 	}
1595 
1596 	/* finished, remove instance flags */
1597 	vmc->vmc_flags &= ~VMOP_CREATE_INSTANCE;
1598 
1599 	return (0);
1600 }
1601 
1602 /*
1603  * vm_checkperm
1604  *
1605  * Checks if the user represented by the 'uid' parameter is allowed to
1606  * manipulate the VM described by the 'vm' parameter (or connect to said VM's
1607  * console.)
1608  *
1609  * Parameters:
1610  *  vm: the VM whose permission is to be checked
1611  *  vmo: the required uid/gid to be checked
1612  *  uid: the user ID of the user making the request
1613  *
1614  * Return values:
1615  *   0: the permission should be granted
1616  *  -1: the permission check failed (also returned if vm == null)
1617  */
1618 int
1619 vm_checkperm(struct vmd_vm *vm, struct vmop_owner *vmo, uid_t uid)
1620 {
1621 	struct group	*gr;
1622 	struct passwd	*pw;
1623 	char		**grmem;
1624 
1625 	/* root has no restrictions */
1626 	if (uid == 0)
1627 		return (0);
1628 
1629 	if (vmo == NULL)
1630 		return (-1);
1631 
1632 	/* check user */
1633 	if (vm == NULL) {
1634 		if  (vmo->uid == uid)
1635 			return (0);
1636 	} else {
1637 		/*
1638 		 * check user of running vm (the owner of a running vm can
1639 		 * be different to (or more specific than) the configured owner.
1640 		 */
1641 		if (((vm->vm_state & VM_STATE_RUNNING) && vm->vm_uid == uid) ||
1642 		    (!(vm->vm_state & VM_STATE_RUNNING) && vmo->uid == uid))
1643 			return (0);
1644 	}
1645 
1646 	/* check groups */
1647 	if (vmo->gid != -1) {
1648 		if ((pw = getpwuid(uid)) == NULL)
1649 			return (-1);
1650 		if (pw->pw_gid == vmo->gid)
1651 			return (0);
1652 		if ((gr = getgrgid(vmo->gid)) != NULL) {
1653 			for (grmem = gr->gr_mem; *grmem; grmem++)
1654 				if (strcmp(*grmem, pw->pw_name) == 0)
1655 					return (0);
1656 		}
1657 	}
1658 
1659 	return (-1);
1660 }
1661 
1662 /*
1663  * vm_checkinsflag
1664  *
 * Checks whether the non-root user is allowed to set an instance option.
1666  *
1667  * Parameters:
1668  *  vmc: the VM create parameters
1669  *  flag: the flag to be checked
1670  *  uid: the user ID of the user making the request
1671  *
1672  * Return values:
1673  *   0: the permission should be granted
1674  *  -1: the permission check failed (also returned if vm == null)
1675  */
1676 int
1677 vm_checkinsflag(struct vmop_create_params *vmc, unsigned int flag, uid_t uid)
1678 {
1679 	/* root has no restrictions */
1680 	if (uid == 0)
1681 		return (0);
1682 
1683 	if ((vmc->vmc_insflags & flag) == 0)
1684 		return (-1);
1685 
1686 	return (0);
1687 }
1688 
1689 /*
1690  * vm_checkaccess
1691  *
1692  * Checks if the user represented by the 'uid' parameter is allowed to
1693  * access the file described by the 'path' parameter.
1694  *
1695  * Parameters:
1696  *  fd: the file descriptor of the opened file
1697  *  uflag: check if the userid has access to the file
1698  *  uid: the user ID of the user making the request
1699  *  amode: the access flags of R_OK and W_OK
1700  *
1701  * Return values:
1702  *   0: the permission should be granted
1703  *  -1: the permission check failed
1704  */
int
vm_checkaccess(int fd, unsigned int uflag, uid_t uid, int amode)
{
	struct stat	 st;
	struct passwd	*pw;
	struct group	*gr;
	char		**member;
	mode_t		 need;

	if (fd == -1)
		return (-1);

	/* Only stat-able regular files are acceptable. */
	if (fstat(fd, &st) == -1 || !S_ISREG(st.st_mode))
		return (-1);

	/* root has no restrictions; uflag == 0 disables the check */
	if (uid == 0 || uflag == 0)
		return (0);

	/* world permissions */
	need = (amode & W_OK ? S_IWOTH : 0) | (amode & R_OK ? S_IROTH : 0);
	if ((st.st_mode & need) == need)
		return (0);

	/* owner permissions */
	need = (amode & W_OK ? S_IWUSR : 0) | (amode & R_OK ? S_IRUSR : 0);
	if (uid == st.st_uid && (st.st_mode & need) == need)
		return (0);

	/* group permissions: primary gid or supplementary membership */
	need = (amode & W_OK ? S_IWGRP : 0) | (amode & R_OK ? S_IRGRP : 0);
	if ((st.st_mode & need) != need)
		return (-1);
	if ((pw = getpwuid(uid)) == NULL)
		return (-1);
	if (pw->pw_gid == st.st_gid)
		return (0);
	if ((gr = getgrgid(st.st_gid)) != NULL) {
		for (member = gr->gr_mem; *member != NULL; member++)
			if (strcmp(*member, pw->pw_name) == 0)
				return (0);
	}

	return (-1);
}
1756 
/*
 * vm_opentty
 *
 * Allocates a pty pair for the vm via the pre-opened PTM fd, keeps the
 * controller side open in vm->vm_tty and hands ownership/permissions of
 * the tty device to the vm's owner (loosely based on sshpty.c).
 *
 * Return values:
 *   0: success, vm_tty and vm_ttyname are set
 *  -1: failure, any partially acquired tty state is released
 */
int
vm_opentty(struct vmd_vm *vm)
{
	struct ptmget		 ptm;
	struct stat		 st;
	struct group		*gr;
	uid_t			 uid;
	gid_t			 gid;
	mode_t			 mode;
	int			 on;

	/*
	 * Open tty with pre-opened PTM fd
	 */
	if ((ioctl(env->vmd_ptmfd, PTMGET, &ptm) == -1))
		return (-1);

	/*
	 * We use user ioctl(2) mode to pass break commands.
	 */
	on = 1;
	if (ioctl(ptm.cfd, TIOCUCNTL, &on) == -1)
		fatal("could not enable user ioctl mode");

	/* Keep the controller side, the vm process opens the tty itself. */
	vm->vm_tty = ptm.cfd;
	close(ptm.sfd);
	if ((vm->vm_ttyname = strdup(ptm.sn)) == NULL)
		goto fail;

	uid = vm->vm_uid;
	gid = vm->vm_params.vmc_owner.gid;

	/* Pick tty group and mode: owner group > "tty" group > root-only. */
	if (vm->vm_params.vmc_owner.gid != -1) {
		mode = 0660;
	} else if ((gr = getgrnam("tty")) != NULL) {
		gid = gr->gr_gid;
		mode = 0620;
	} else {
		mode = 0600;
		gid = 0;
	}

	log_debug("%s: vm %s tty %s uid %d gid %d mode %o",
	    __func__, vm->vm_params.vmc_params.vcp_name,
	    vm->vm_ttyname, uid, gid, mode);

	/*
	 * Change ownership and mode of the tty as required.
	 * Loosely based on the implementation of sshpty.c
	 */
	if (stat(vm->vm_ttyname, &st) == -1)
		goto fail;

	if (st.st_uid != uid || st.st_gid != gid) {
		if (chown(vm->vm_ttyname, uid, gid) == -1) {
			log_warn("chown %s %d %d failed, uid %d",
			    vm->vm_ttyname, uid, gid, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	if ((st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO)) != mode) {
		if (chmod(vm->vm_ttyname, mode) == -1) {
			log_warn("chmod %s %o failed, uid %d",
			    vm->vm_ttyname, mode, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	return (0);
 fail:
	vm_closetty(vm);
	return (-1);
}
1839 
1840 void
1841 vm_closetty(struct vmd_vm *vm)
1842 {
1843 	if (vm->vm_tty != -1) {
1844 		/* Release and close the tty */
1845 		if (fchown(vm->vm_tty, 0, 0) == -1)
1846 			log_warn("chown %s 0 0 failed", vm->vm_ttyname);
1847 		if (fchmod(vm->vm_tty, 0666) == -1)
1848 			log_warn("chmod %s 0666 failed", vm->vm_ttyname);
1849 		close(vm->vm_tty);
1850 		vm->vm_tty = -1;
1851 	}
1852 	free(vm->vm_ttyname);
1853 	vm->vm_ttyname = NULL;
1854 }
1855 
1856 void
1857 switch_remove(struct vmd_switch *vsw)
1858 {
1859 	if (vsw == NULL)
1860 		return;
1861 
1862 	TAILQ_REMOVE(env->vmd_switches, vsw, sw_entry);
1863 
1864 	free(vsw->sw_group);
1865 	free(vsw->sw_name);
1866 	free(vsw);
1867 }
1868 
1869 struct vmd_switch *
1870 switch_getbyname(const char *name)
1871 {
1872 	struct vmd_switch	*vsw;
1873 
1874 	if (name == NULL)
1875 		return (NULL);
1876 	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1877 		if (strcmp(vsw->sw_name, name) == 0)
1878 			return (vsw);
1879 	}
1880 
1881 	return (NULL);
1882 }
1883 
1884 struct vmd_user *
1885 user_get(uid_t uid)
1886 {
1887 	struct vmd_user		*usr;
1888 
1889 	if (uid == 0)
1890 		return (NULL);
1891 
1892 	/* first try to find an existing user */
1893 	TAILQ_FOREACH(usr, env->vmd_users, usr_entry) {
1894 		if (usr->usr_id.uid == uid)
1895 			goto done;
1896 	}
1897 
1898 	if ((usr = calloc(1, sizeof(*usr))) == NULL) {
1899 		log_warn("could not allocate user");
1900 		return (NULL);
1901 	}
1902 
1903 	usr->usr_id.uid = uid;
1904 	usr->usr_id.gid = -1;
1905 	TAILQ_INSERT_TAIL(env->vmd_users, usr, usr_entry);
1906 
1907  done:
1908 	DPRINTF("%s: uid %d #%d +",
1909 	    __func__, usr->usr_id.uid, usr->usr_refcnt + 1);
1910 	usr->usr_refcnt++;
1911 
1912 	return (usr);
1913 }
1914 
1915 void
1916 user_put(struct vmd_user *usr)
1917 {
1918 	if (usr == NULL)
1919 		return;
1920 
1921 	DPRINTF("%s: uid %d #%d -",
1922 	    __func__, usr->usr_id.uid, usr->usr_refcnt - 1);
1923 
1924 	if (--usr->usr_refcnt > 0)
1925 		return;
1926 
1927 	TAILQ_REMOVE(env->vmd_users, usr, usr_entry);
1928 	free(usr);
1929 }
1930 
1931 void
1932 user_inc(struct vm_create_params *vcp, struct vmd_user *usr, int inc)
1933 {
1934 	char	 mem[FMT_SCALED_STRSIZE];
1935 
1936 	if (usr == NULL)
1937 		return;
1938 
1939 	/* increment or decrement counters */
1940 	inc = inc ? 1 : -1;
1941 
1942 	usr->usr_maxcpu += vcp->vcp_ncpus * inc;
1943 	usr->usr_maxmem += vcp->vcp_memranges[0].vmr_size * inc;
1944 	usr->usr_maxifs += vcp->vcp_nnics * inc;
1945 
1946 	if (log_getverbose() > 1) {
1947 		(void)fmt_scaled(usr->usr_maxmem * 1024 * 1024, mem);
1948 		log_debug("%s: %c uid %d ref %d cpu %llu mem %s ifs %llu",
1949 		    __func__, inc == 1 ? '+' : '-',
1950 		    usr->usr_id.uid, usr->usr_refcnt,
1951 		    usr->usr_maxcpu, mem, usr->usr_maxifs);
1952 	}
1953 }
1954 
1955 int
1956 user_checklimit(struct vmd_user *usr, struct vm_create_params *vcp)
1957 {
1958 	const char	*limit = "";
1959 
1960 	/* XXX make the limits configurable */
1961 	if (usr->usr_maxcpu > VM_DEFAULT_USER_MAXCPU) {
1962 		limit = "cpu ";
1963 		goto fail;
1964 	}
1965 	if (usr->usr_maxmem > VM_DEFAULT_USER_MAXMEM) {
1966 		limit = "memory ";
1967 		goto fail;
1968 	}
1969 	if (usr->usr_maxifs > VM_DEFAULT_USER_MAXIFS) {
1970 		limit = "interface ";
1971 		goto fail;
1972 	}
1973 
1974 	return (0);
1975 
1976  fail:
1977 	log_warnx("%s: user %d %slimit reached", vcp->vcp_name,
1978 	    usr->usr_id.uid, limit);
1979 	return (-1);
1980 }
1981 
/*
 * get_string
 *
 * Returns a NUL-terminated copy of the leading printable characters of
 * 'ptr' (at most 'len' bytes); the copy stops at the first non-printable
 * byte.  The caller must free the result.
 *
 * Return values:
 *  a heap-allocated string on success, NULL on allocation failure
 */
char *
get_string(uint8_t *ptr, size_t len)
{
	size_t	 i;
	char	*s;

	for (i = 0; i < len; i++)
		if (!isprint(ptr[i]))
			break;

	/*
	 * Copy by hand instead of strndup(ptr, i): strndup() takes a
	 * const char * and passing a uint8_t * is an incompatible
	 * pointer type; the explicit copy also avoids the non-ISO
	 * dependency.  Behavior is identical.
	 */
	if ((s = malloc(i + 1)) == NULL)
		return (NULL);
	memcpy(s, ptr, i);
	s[i] = '\0';

	return (s);
}
1993 
uint32_t
prefixlen2mask(uint8_t prefixlen)
{
	uint32_t	 mask;

	/* a zero-length prefix matches everything: empty mask */
	if (prefixlen == 0)
		return (0);

	/* clamp to the IPv4 address width */
	if (prefixlen > 32)
		prefixlen = 32;

	mask = 0xffffffff << (32 - prefixlen);
	return (htonl(mask));
}
2005 
void
prefixlen2mask6(uint8_t prefixlen, struct in6_addr *mask)
{
	struct in6_addr	 tmp;
	int		 bytes, rem;

	/* clamp to the IPv6 address width */
	if (prefixlen > 128)
		prefixlen = 128;

	memset(&tmp, 0, sizeof(tmp));
	bytes = prefixlen / 8;
	rem = prefixlen % 8;
	memset(tmp.s6_addr, 0xff, bytes);
	if (rem)
		tmp.s6_addr[bytes] = 0xff00 >> rem;

	memcpy(mask, &tmp, sizeof(tmp));
}
2024 
2025 void
2026 getmonotime(struct timeval *tv)
2027 {
2028 	struct timespec	 ts;
2029 
2030 	if (clock_gettime(CLOCK_MONOTONIC, &ts))
2031 		fatal("clock_gettime");
2032 
2033 	TIMESPEC_TO_TIMEVAL(tv, &ts);
2034 }
2035