xref: /openbsd/usr.sbin/vmd/vmd.c (revision 771fbea0)
1 /*	$OpenBSD: vmd.c,v 1.125 2021/05/05 21:33:11 dv Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/param.h>	/* nitems */
20 #include <sys/queue.h>
21 #include <sys/wait.h>
22 #include <sys/cdefs.h>
23 #include <sys/stat.h>
24 #include <sys/sysctl.h>
25 #include <sys/tty.h>
26 #include <sys/ttycom.h>
27 #include <sys/ioctl.h>
28 
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <termios.h>
33 #include <errno.h>
34 #include <event.h>
35 #include <fcntl.h>
36 #include <pwd.h>
37 #include <signal.h>
38 #include <syslog.h>
39 #include <unistd.h>
40 #include <util.h>
41 #include <ctype.h>
42 #include <pwd.h>
43 #include <grp.h>
44 
45 #include <machine/specialreg.h>
46 #include <machine/vmmvar.h>
47 
48 #include "proc.h"
49 #include "atomicio.h"
50 #include "vmd.h"
51 
52 __dead void usage(void);
53 
54 int	 main(int, char **);
55 int	 vmd_configure(void);
56 void	 vmd_sighdlr(int sig, short event, void *arg);
57 void	 vmd_shutdown(void);
58 int	 vmd_control_run(void);
59 int	 vmd_dispatch_control(int, struct privsep_proc *, struct imsg *);
60 int	 vmd_dispatch_vmm(int, struct privsep_proc *, struct imsg *);
61 int	 vmd_dispatch_priv(int, struct privsep_proc *, struct imsg *);
62 int	 vmd_check_vmh(struct vm_dump_header *);
63 
64 int	 vm_instance(struct privsep *, struct vmd_vm **,
65 	    struct vmop_create_params *, uid_t);
66 int	 vm_checkinsflag(struct vmop_create_params *, unsigned int, uid_t);
67 int	 vm_claimid(const char *, int, uint32_t *);
68 void	 start_vm_batch(int, short, void*);
69 
70 struct vmd	*env;
71 
/*
 * Child-process table consumed by proc_init()/proc_getid().
 * Each entry names a privsep child, its id, its imsg dispatch
 * callback and (optionally) a shutdown hook.
 */
static struct privsep_proc procs[] = {
	/* Keep "priv" on top as procs[0] */
	{ "priv",	PROC_PRIV,	vmd_dispatch_priv, priv },
	{ "control",	PROC_CONTROL,	vmd_dispatch_control, control },
	{ "vmm",	PROC_VMM,	vmd_dispatch_vmm, vmm, vmm_shutdown },
};
78 
79 enum privsep_procid privsep_process;
80 
81 struct event staggered_start_timer;
82 
83 /* For the privileged process */
84 static struct privsep_proc *proc_priv = &procs[0];
85 static struct passwd proc_privpw;
86 static const uint8_t zero_mac[ETHER_ADDR_LEN];
87 
/*
 * vmd_dispatch_control
 *
 * imsg callback for requests forwarded by the control process on
 * behalf of vmctl(8) clients.  Requests are either handled directly,
 * relayed to PROC_VMM, or answered with an error.  A response imsg
 * (selected via "cmd") is composed back to PROC_CONTROL at the end.
 *
 * Returns 0 on success, -1 on an unrecoverable imsg error.
 */
int
vmd_dispatch_control(int fd, struct privsep_proc *p, struct imsg *imsg)
{
	struct privsep			*ps = p->p_ps;
	int				 res = 0, ret = 0, cmd = 0, verbose;
	unsigned int			 v = 0, flags;
	struct vmop_create_params	 vmc;
	struct vmop_id			 vid;
	struct vmop_result		 vmr;
	struct vm_dump_header		 vmh;
	struct vmd_vm			*vm = NULL;
	char				*str = NULL;
	uint32_t			 id = 0;
	struct control_sock		*rcs;

	switch (imsg->hdr.type) {
	case IMSG_VMDOP_START_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vmc);
		memcpy(&vmc, imsg->data, sizeof(vmc));
		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
		if (vmc.vmc_flags == 0) {
			/* start an existing VM with pre-configured options */
			if (!(ret == -1 && errno == EALREADY &&
			    !(vm->vm_state & VM_STATE_RUNNING))) {
				res = errno;
				cmd = IMSG_VMDOP_START_VM_RESPONSE;
			}
		} else if (ret != 0) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		/* res == 0 means the VM registered; hand it to vmm */
		if (res == 0 &&
		    config_setvm(ps, vm,
		    imsg->hdr.peerid, vm->vm_params.vmc_owner.uid) == -1) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		break;
	case IMSG_VMDOP_WAIT_VM_REQUEST:
	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		flags = vid.vid_flags;
		cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;

		if ((id = vid.vid_id) == 0) {
			/* Lookup vm (id) by name */
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				break;
			} else if ((vm->vm_state & VM_STATE_SHUTDOWN) &&
			    (flags & VMOP_FORCE) == 0) {
				/* already shutting down; need -f to repeat */
				res = EALREADY;
				break;
			} else if (!(vm->vm_state & VM_STATE_RUNNING)) {
				res = EINVAL;
				break;
			}
			id = vm->vm_vmid;
		} else if ((vm = vm_getbyvmid(id)) == NULL) {
			res = ENOENT;
			break;
		}
		/* only the owner (or root) may terminate/wait */
		if (vm_checkperm(vm, &vm->vm_params.vmc_owner, vid.vid_uid)) {
			res = EPERM;
			break;
		}

		/* Only relay TERMINATION requests, not WAIT requests */
		if (imsg->hdr.type == IMSG_VMDOP_TERMINATE_VM_REQUEST) {
			memset(&vid, 0, sizeof(vid));
			vid.vid_id = id;
			vid.vid_flags = flags;

			if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
				imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1)
				return (-1);
		}
		break;
	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
		break;
	case IMSG_VMDOP_LOAD:
		IMSG_SIZE_CHECK(imsg, str); /* at least one byte for path */
		/*
		 * NOTE(review): get_string() may return NULL on failure,
		 * in which case the reload below silently falls back to
		 * the default config file — confirm that is intended.
		 */
		str = get_string((uint8_t *)imsg->data,
		    IMSG_DATA_SIZE(imsg));
		/* FALLTHROUGH: reload using the path extracted above */
	case IMSG_VMDOP_RELOAD:
		if (vmd_reload(0, str) == -1)
			cmd = IMSG_CTL_FAIL;
		else
			cmd = IMSG_CTL_OK;
		free(str);
		break;
	case IMSG_CTL_RESET:
		IMSG_SIZE_CHECK(imsg, &v);
		memcpy(&v, imsg->data, sizeof(v));
		if (vmd_reload(v, NULL) == -1)
			cmd = IMSG_CTL_FAIL;
		else
			cmd = IMSG_CTL_OK;
		break;
	case IMSG_CTL_VERBOSE:
		IMSG_SIZE_CHECK(imsg, &verbose);
		memcpy(&verbose, imsg->data, sizeof(verbose));
		log_setverbose(verbose);

		/* propagate the new verbosity to the other children */
		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
		proc_forward_imsg(ps, imsg, PROC_PRIV, -1);
		cmd = IMSG_CTL_OK;
		break;
	case IMSG_VMDOP_PAUSE_VM:
	case IMSG_VMDOP_UNPAUSE_VM:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		/*
		 * NOTE(review): the non-zero-id branch uses vm_getbyid()
		 * (kernel id) while the terminate/send handlers above use
		 * vm_getbyvmid() (vmd id) — confirm vid_id here is really
		 * a kernel id and not a vmd vmid.
		 */
		if (vid.vid_id == 0) {
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
				    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
				    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
				break;
			} else {
				vid.vid_id = vm->vm_vmid;
			}
		} else if ((vm = vm_getbyid(vid.vid_id)) == NULL) {
			res = ENOENT;
			cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
			    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
			    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
			break;
		}
		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
		    vid.vid_uid) != 0) {
			res = EPERM;
			cmd = imsg->hdr.type == IMSG_VMDOP_PAUSE_VM
			    ? IMSG_VMDOP_PAUSE_VM_RESPONSE
			    : IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
			break;
		}
		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
		    imsg->hdr.peerid, -1, &vid, sizeof(vid));
		break;
	case IMSG_VMDOP_SEND_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		id = vid.vid_id;
		if (vid.vid_id == 0) {
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
				/* drop the received fd on error */
				close(imsg->fd);
				break;
			} else {
				vid.vid_id = vm->vm_vmid;
			}
		} else if ((vm = vm_getbyvmid(vid.vid_id)) == NULL) {
			res = ENOENT;
			cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
			close(imsg->fd);
			break;
		}
		vmr.vmr_id = vid.vid_id;
		log_debug("%s: sending fd to vmm", __func__);
		/* fd ownership passes to vmm via the imsg */
		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
		    imsg->hdr.peerid, imsg->fd, &vid, sizeof(vid));
		break;
	case IMSG_VMDOP_RECEIVE_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		if (imsg->fd == -1) {
			log_warnx("%s: invalid fd", __func__);
			return (-1);
		}
		/* first comes the dump header ... */
		if (atomicio(read, imsg->fd, &vmh, sizeof(vmh)) !=
		    sizeof(vmh)) {
			log_warnx("%s: error reading vmh from received vm",
			    __func__);
			res = EIO;
			close(imsg->fd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}

		/* ... which must match this host's CPU/format */
		if (vmd_check_vmh(&vmh)) {
			res = ENOENT;
			close(imsg->fd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}
		/* ... then the create parameters of the sent VM */
		if (atomicio(read, imsg->fd, &vmc, sizeof(vmc)) !=
		    sizeof(vmc)) {
			log_warnx("%s: error reading vmc from received vm",
			    __func__);
			res = EIO;
			close(imsg->fd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}
		/* rename to the locally requested name, get a fresh id */
		strlcpy(vmc.vmc_params.vcp_name, vid.vid_name,
		    sizeof(vmc.vmc_params.vcp_name));
		vmc.vmc_params.vcp_id = 0;

		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
		if (ret != 0) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			close(imsg->fd);
		} else {
			vm->vm_state |= VM_STATE_RECEIVED;
			config_setvm(ps, vm, imsg->hdr.peerid,
			    vmc.vmc_owner.uid);
			log_debug("%s: sending fd to vmm", __func__);
			proc_compose_imsg(ps, PROC_VMM, -1,
			    IMSG_VMDOP_RECEIVE_VM_END, vm->vm_vmid, imsg->fd,
			    NULL, 0);
		}
		break;
	case IMSG_VMDOP_DONE:
		/* config is fully loaded; open the control sockets */
		control_reset(&ps->ps_csock);
		TAILQ_FOREACH(rcs, &ps->ps_rcsocks, cs_entry)
			control_reset(rcs);
		cmd = 0;
		break;
	default:
		return (-1);
	}

	/* compose the response selected above, if any */
	switch (cmd) {
	case 0:
		break;
	case IMSG_VMDOP_START_VM_RESPONSE:
	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
		memset(&vmr, 0, sizeof(vmr));
		vmr.vmr_result = res;
		vmr.vmr_id = id;
		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
		    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
			return (-1);
		break;
	default:
		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
		    imsg->hdr.peerid, -1, &res, sizeof(res)) == -1)
			return (-1);
		break;
	}

	return (0);
}
336 
337 int
338 vmd_dispatch_vmm(int fd, struct privsep_proc *p, struct imsg *imsg)
339 {
340 	struct vmop_result	 vmr;
341 	struct privsep		*ps = p->p_ps;
342 	int			 res = 0;
343 	struct vmd_vm		*vm;
344 	struct vm_create_params	*vcp;
345 	struct vmop_info_result	 vir;
346 
347 	switch (imsg->hdr.type) {
348 	case IMSG_VMDOP_PAUSE_VM_RESPONSE:
349 		IMSG_SIZE_CHECK(imsg, &vmr);
350 		memcpy(&vmr, imsg->data, sizeof(vmr));
351 		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
352 			break;
353 		proc_compose_imsg(ps, PROC_CONTROL, -1,
354 		    imsg->hdr.type, imsg->hdr.peerid, -1,
355 		    imsg->data, sizeof(imsg->data));
356 		log_info("%s: paused vm %d successfully",
357 		    vm->vm_params.vmc_params.vcp_name,
358 		    vm->vm_vmid);
359 		vm->vm_state |= VM_STATE_PAUSED;
360 		break;
361 	case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
362 		IMSG_SIZE_CHECK(imsg, &vmr);
363 		memcpy(&vmr, imsg->data, sizeof(vmr));
364 		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
365 			break;
366 		proc_compose_imsg(ps, PROC_CONTROL, -1,
367 		    imsg->hdr.type, imsg->hdr.peerid, -1,
368 		    imsg->data, sizeof(imsg->data));
369 		log_info("%s: unpaused vm %d successfully.",
370 		    vm->vm_params.vmc_params.vcp_name,
371 		    vm->vm_vmid);
372 		vm->vm_state &= ~VM_STATE_PAUSED;
373 		break;
374 	case IMSG_VMDOP_START_VM_RESPONSE:
375 		IMSG_SIZE_CHECK(imsg, &vmr);
376 		memcpy(&vmr, imsg->data, sizeof(vmr));
377 		if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL)
378 			break;
379 		vm->vm_pid = vmr.vmr_pid;
380 		vcp = &vm->vm_params.vmc_params;
381 		vcp->vcp_id = vmr.vmr_id;
382 
383 		/*
384 		 * If the peerid is not -1, forward the response back to the
385 		 * the control socket.  If it is -1, the request originated
386 		 * from the parent, not the control socket.
387 		 */
388 		if (vm->vm_peerid != (uint32_t)-1) {
389 			(void)strlcpy(vmr.vmr_ttyname, vm->vm_ttyname,
390 			    sizeof(vmr.vmr_ttyname));
391 			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
392 			    imsg->hdr.type, vm->vm_peerid, -1,
393 			    &vmr, sizeof(vmr)) == -1) {
394 				errno = vmr.vmr_result;
395 				log_warn("%s: failed to foward vm result",
396 				    vcp->vcp_name);
397 				vm_remove(vm, __func__);
398 				return (-1);
399 			}
400 		}
401 
402 		if (vmr.vmr_result) {
403 			errno = vmr.vmr_result;
404 			log_warn("%s: failed to start vm", vcp->vcp_name);
405 			vm_remove(vm, __func__);
406 			break;
407 		}
408 
409 		/* Now configure all the interfaces */
410 		if (vm_priv_ifconfig(ps, vm) == -1) {
411 			log_warn("%s: failed to configure vm", vcp->vcp_name);
412 			vm_remove(vm, __func__);
413 			break;
414 		}
415 
416 		log_info("%s: started vm %d successfully, tty %s",
417 		    vcp->vcp_name, vm->vm_vmid, vm->vm_ttyname);
418 		break;
419 	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
420 		IMSG_SIZE_CHECK(imsg, &vmr);
421 		memcpy(&vmr, imsg->data, sizeof(vmr));
422 
423 		if (vmr.vmr_result) {
424 			DPRINTF("%s: forwarding TERMINATE VM for vm id %d",
425 			    __func__, vmr.vmr_id);
426 			proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
427 		} else {
428 			if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
429 				break;
430 			/* Mark VM as shutting down */
431 			vm->vm_state |= VM_STATE_SHUTDOWN;
432 		}
433 		break;
434 	case IMSG_VMDOP_SEND_VM_RESPONSE:
435 		IMSG_SIZE_CHECK(imsg, &vmr);
436 		memcpy(&vmr, imsg->data, sizeof(vmr));
437 		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
438 			break;
439 		if (!vmr.vmr_result) {
440 			log_info("%s: sent vm %d successfully.",
441 			    vm->vm_params.vmc_params.vcp_name,
442 			    vm->vm_vmid);
443 			if (vm->vm_from_config)
444 				vm_stop(vm, 0, __func__);
445 			else
446 				vm_remove(vm, __func__);
447 		}
448 
449 		/* Send a response if a control client is waiting for it */
450 		if (imsg->hdr.peerid != (uint32_t)-1) {
451 			/* the error is meaningless for deferred responses */
452 			vmr.vmr_result = 0;
453 
454 			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
455 			    IMSG_VMDOP_SEND_VM_RESPONSE,
456 			    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
457 				return (-1);
458 		}
459 		break;
460 	case IMSG_VMDOP_TERMINATE_VM_EVENT:
461 		IMSG_SIZE_CHECK(imsg, &vmr);
462 		memcpy(&vmr, imsg->data, sizeof(vmr));
463 		DPRINTF("%s: handling TERMINATE_EVENT for vm id %d ret %d",
464 		    __func__, vmr.vmr_id, vmr.vmr_result);
465 		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) {
466 			log_debug("%s: vm %d is no longer available",
467 			    __func__, vmr.vmr_id);
468 			break;
469 		}
470 		if (vmr.vmr_result != EAGAIN ||
471 		    vm->vm_params.vmc_bootdevice) {
472 			if (vm->vm_from_config)
473 				vm_stop(vm, 0, __func__);
474 			else
475 				vm_remove(vm, __func__);
476 		} else {
477 			/* Stop VM instance but keep the tty open */
478 			vm_stop(vm, 1, __func__);
479 			config_setvm(ps, vm, (uint32_t)-1, vm->vm_uid);
480 		}
481 
482 		/* The error is meaningless for deferred responses */
483 		vmr.vmr_result = 0;
484 
485 		if (proc_compose_imsg(ps, PROC_CONTROL, -1,
486 			IMSG_VMDOP_TERMINATE_VM_EVENT,
487 			imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
488 			return (-1);
489 		break;
490 	case IMSG_VMDOP_GET_INFO_VM_DATA:
491 		IMSG_SIZE_CHECK(imsg, &vir);
492 		memcpy(&vir, imsg->data, sizeof(vir));
493 		if ((vm = vm_getbyvmid(vir.vir_info.vir_id)) != NULL) {
494 			memset(vir.vir_ttyname, 0, sizeof(vir.vir_ttyname));
495 			if (vm->vm_ttyname != NULL)
496 				strlcpy(vir.vir_ttyname, vm->vm_ttyname,
497 				    sizeof(vir.vir_ttyname));
498 			log_debug("%s: running vm: %d, vm_state: 0x%x",
499 			    __func__, vm->vm_vmid, vm->vm_state);
500 			vir.vir_state = vm->vm_state;
501 			/* get the user id who started the vm */
502 			vir.vir_uid = vm->vm_uid;
503 			vir.vir_gid = vm->vm_params.vmc_owner.gid;
504 		}
505 		if (proc_compose_imsg(ps, PROC_CONTROL, -1, imsg->hdr.type,
506 		    imsg->hdr.peerid, -1, &vir, sizeof(vir)) == -1) {
507 			log_debug("%s: GET_INFO_VM failed for vm %d, removing",
508 			    __func__, vm->vm_vmid);
509 			vm_remove(vm, __func__);
510 			return (-1);
511 		}
512 		break;
513 	case IMSG_VMDOP_GET_INFO_VM_END_DATA:
514 		/*
515 		 * PROC_VMM has responded with the *running* VMs, now we
516 		 * append the others. These use the special value 0 for their
517 		 * kernel id to indicate that they are not running.
518 		 */
519 		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
520 			if (!(vm->vm_state & VM_STATE_RUNNING)) {
521 				memset(&vir, 0, sizeof(vir));
522 				vir.vir_info.vir_id = vm->vm_vmid;
523 				strlcpy(vir.vir_info.vir_name,
524 				    vm->vm_params.vmc_params.vcp_name,
525 				    VMM_MAX_NAME_LEN);
526 				vir.vir_info.vir_memory_size =
527 				    vm->vm_params.vmc_params.
528 				    vcp_memranges[0].vmr_size;
529 				vir.vir_info.vir_ncpus =
530 				    vm->vm_params.vmc_params.vcp_ncpus;
531 				/* get the configured user id for this vm */
532 				vir.vir_uid = vm->vm_params.vmc_owner.uid;
533 				vir.vir_gid = vm->vm_params.vmc_owner.gid;
534 				log_debug("%s: vm: %d, vm_state: 0x%x",
535 				    __func__, vm->vm_vmid, vm->vm_state);
536 				vir.vir_state = vm->vm_state;
537 				if (proc_compose_imsg(ps, PROC_CONTROL, -1,
538 				    IMSG_VMDOP_GET_INFO_VM_DATA,
539 				    imsg->hdr.peerid, -1, &vir,
540 				    sizeof(vir)) == -1) {
541 					log_debug("%s: GET_INFO_VM_END failed",
542 					    __func__);
543 					vm_remove(vm, __func__);
544 					return (-1);
545 				}
546 			}
547 		}
548 		IMSG_SIZE_CHECK(imsg, &res);
549 		proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
550 		break;
551 	default:
552 		return (-1);
553 	}
554 
555 	return (0);
556 }
557 
558 int
559 vmd_dispatch_priv(int fd, struct privsep_proc *p, struct imsg *imsg)
560 {
561 	struct vmop_addr_result	 var;
562 
563 	switch (imsg->hdr.type) {
564 	case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
565 		IMSG_SIZE_CHECK(imsg, &var);
566 		memcpy(&var, imsg->data, sizeof(var));
567 		proc_forward_imsg(p->p_ps, imsg, PROC_VMM, -1);
568 		break;
569 	default:
570 		return (-1);
571 	}
572 
573 	return (0);
574 }
575 
576 int
577 vmd_check_vmh(struct vm_dump_header *vmh)
578 {
579 	int i;
580 	unsigned int code, leaf;
581 	unsigned int a, b, c, d;
582 
583 	if (strncmp(vmh->vmh_signature, VM_DUMP_SIGNATURE, strlen(VM_DUMP_SIGNATURE)) != 0) {
584 		log_warnx("%s: incompatible dump signature", __func__);
585 		return (-1);
586 	}
587 
588 	if (vmh->vmh_version != VM_DUMP_VERSION) {
589 		log_warnx("%s: incompatible dump version", __func__);
590 		return (-1);
591 	}
592 
593 	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
594 		code = vmh->vmh_cpuids[i].code;
595 		leaf = vmh->vmh_cpuids[i].leaf;
596 		if (leaf != 0x00) {
597 			log_debug("%s: invalid leaf 0x%x for code 0x%x",
598 			    __func__, leaf, code);
599 			return (-1);
600 		}
601 
602 		switch (code) {
603 		case 0x00:
604 			CPUID_LEAF(code, leaf, a, b, c, d);
605 			if (vmh->vmh_cpuids[i].a > a) {
606 				log_debug("%s: incompatible cpuid level",
607 				    __func__);
608 				return (-1);
609 			}
610 			if (!(vmh->vmh_cpuids[i].b == b &&
611 			    vmh->vmh_cpuids[i].c == c &&
612 			    vmh->vmh_cpuids[i].d == d)) {
613 				log_debug("%s: incompatible cpu brand",
614 				    __func__);
615 				return (-1);
616 			}
617 			break;
618 
619 		case 0x01:
620 			CPUID_LEAF(code, leaf, a, b, c, d);
621 			if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) !=
622 			    (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) {
623 				log_debug("%s: incompatible cpu features "
624 				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
625 				    code, leaf);
626 				return (-1);
627 			}
628 			if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) !=
629 			    (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) {
630 				log_debug("%s: incompatible cpu features "
631 				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
632 				    code, leaf);
633 				return (-1);
634 			}
635 			break;
636 
637 		case 0x07:
638 			CPUID_LEAF(code, leaf, a, b, c, d);
639 			if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) !=
640 			    (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) {
641 				log_debug("%s: incompatible cpu features "
642 				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
643 				    code, leaf);
644 				return (-1);
645 			}
646 			if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) !=
647 			    (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) {
648 				log_debug("%s: incompatible cpu features "
649 				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
650 				    code, leaf);
651 				return (-1);
652 			}
653 			break;
654 
655 		case 0x0d:
656 			CPUID_LEAF(code, leaf, a, b, c, d);
657 			if (vmh->vmh_cpuids[i].b > b) {
658 				log_debug("%s: incompatible cpu: insufficient "
659 				    "max save area for enabled XCR0 features",
660 				    __func__);
661 				return (-1);
662 			}
663 			if (vmh->vmh_cpuids[i].c > c) {
664 				log_debug("%s: incompatible cpu: insufficient "
665 				    "max save area for supported XCR0 features",
666 				    __func__);
667 				return (-1);
668 			}
669 			break;
670 
671 		case 0x80000001:
672 			CPUID_LEAF(code, leaf, a, b, c, d);
673 			if ((vmh->vmh_cpuids[i].a & a) !=
674 			    vmh->vmh_cpuids[i].a) {
675 				log_debug("%s: incompatible cpu features "
676 				    "code: 0x%x leaf: 0x%x  reg: a", __func__,
677 				    code, leaf);
678 				return (-1);
679 			}
680 			if ((vmh->vmh_cpuids[i].c & c) !=
681 			    vmh->vmh_cpuids[i].c) {
682 				log_debug("%s: incompatible cpu features "
683 				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
684 				    code, leaf);
685 				return (-1);
686 			}
687 			if ((vmh->vmh_cpuids[i].d & d) !=
688 			    vmh->vmh_cpuids[i].d) {
689 				log_debug("%s: incompatible cpu features "
690 				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
691 				    code, leaf);
692 				return (-1);
693 			}
694 			break;
695 
696 		default:
697 			log_debug("%s: unknown code 0x%x", __func__, code);
698 			return (-1);
699 		}
700 	}
701 
702 	return (0);
703 }
704 
705 void
706 vmd_sighdlr(int sig, short event, void *arg)
707 {
708 	if (privsep_process != PROC_PARENT)
709 		return;
710 	log_debug("%s: handling signal", __func__);
711 
712 	switch (sig) {
713 	case SIGHUP:
714 		log_info("%s: reload requested with SIGHUP", __func__);
715 
716 		/*
717 		 * This is safe because libevent uses async signal handlers
718 		 * that run in the event loop and not in signal context.
719 		 */
720 		(void)vmd_reload(0, NULL);
721 		break;
722 	case SIGPIPE:
723 		log_info("%s: ignoring SIGPIPE", __func__);
724 		break;
725 	case SIGUSR1:
726 		log_info("%s: ignoring SIGUSR1", __func__);
727 		break;
728 	case SIGTERM:
729 	case SIGINT:
730 		vmd_shutdown();
731 		break;
732 	default:
733 		fatalx("unexpected signal");
734 	}
735 }
736 
737 __dead void
738 usage(void)
739 {
740 	extern char *__progname;
741 	fprintf(stderr, "usage: %s [-dnv] [-D macro=value] [-f file]\n",
742 	    __progname);
743 	exit(1);
744 }
745 
/*
 * main
 *
 * Parse options, fork the privsep children via proc_init() (children
 * never return from it), daemonize, install signal handlers and enter
 * the libevent loop.  The startup order below is significant:
 * proc_init() must run before daemon(), and vmd_configure() (which
 * pledges and parses the config) runs after the children are connected.
 */
int
main(int argc, char **argv)
{
	struct privsep		*ps;
	int			 ch;
	const char		*conffile = VMD_CONF;
	enum privsep_procid	 proc_id = PROC_PARENT;
	int			 proc_instance = 0;
	const char		*errp, *title = NULL;
	int			 argc0 = argc;	/* proc_init() re-execs with the original argv */

	log_init(0, LOG_DAEMON);

	if ((env = calloc(1, sizeof(*env))) == NULL)
		fatal("calloc: env");

	while ((ch = getopt(argc, argv, "D:P:I:df:vn")) != -1) {
		switch (ch) {
		case 'D':
			/* define a config-file macro on the command line */
			if (cmdline_symset(optarg) < 0)
				log_warnx("could not parse macro definition %s",
				    optarg);
			break;
		case 'd':
			env->vmd_debug = 2;
			break;
		case 'f':
			conffile = optarg;
			break;
		case 'v':
			env->vmd_verbose++;
			break;
		case 'n':
			/* config check only */
			env->vmd_noaction = 1;
			break;
		case 'P':
			/* internal: run as the named child process */
			title = optarg;
			proc_id = proc_getid(procs, nitems(procs), title);
			if (proc_id == PROC_MAX)
				fatalx("invalid process name");
			break;
		case 'I':
			/* internal: child instance number */
			proc_instance = strtonum(optarg, 0,
			    PROC_MAX_INSTANCES, &errp);
			if (errp)
				fatalx("invalid process instance");
			break;
		default:
			usage();
		}
	}

	argc -= optind;
	if (argc > 0)
		usage();

	if (env->vmd_noaction && !env->vmd_debug)
		env->vmd_debug = 1;

	log_init(env->vmd_debug, LOG_DAEMON);
	log_setverbose(env->vmd_verbose);

	/* check for root privileges */
	if (env->vmd_noaction == 0) {
		if (geteuid())
			fatalx("need root privileges");
	}

	ps = &env->vmd_ps;
	ps->ps_env = env;
	env->vmd_fd = -1;

	if (config_init(env) == -1)
		fatal("failed to initialize configuration");

	if ((ps->ps_pw = getpwnam(VMD_USER)) == NULL)
		fatal("unknown user %s", VMD_USER);

	/* First proc runs as root without pledge but in default chroot */
	proc_priv->p_pw = &proc_privpw; /* initialized to all 0 */
	proc_priv->p_chroot = ps->ps_pw->pw_dir; /* from VMD_USER */

	/* Open /dev/vmm */
	if (env->vmd_noaction == 0) {
		env->vmd_fd = open(VMM_NODE, O_RDWR);
		if (env->vmd_fd == -1)
			fatal("%s", VMM_NODE);
	}

	/* Configure the control socket */
	ps->ps_csock.cs_name = SOCKET_NAME;
	TAILQ_INIT(&ps->ps_rcsocks);

	/* Configuration will be parsed after forking the children */
	env->vmd_conffile = conffile;

	if (env->vmd_noaction)
		ps->ps_noaction = 1;
	ps->ps_instance = proc_instance;
	if (title != NULL)
		ps->ps_title[proc_id] = title;

	/* only the parent returns */
	proc_init(ps, procs, nitems(procs), env->vmd_debug, argc0, argv,
	    proc_id);

	log_procinit("parent");
	if (!env->vmd_debug && daemon(0, 0) == -1)
		fatal("can't daemonize");

	if (ps->ps_noaction == 0)
		log_info("startup");

	event_init();

	signal_set(&ps->ps_evsigint, SIGINT, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigterm, SIGTERM, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsighup, SIGHUP, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigpipe, SIGPIPE, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigusr1, SIGUSR1, vmd_sighdlr, ps);

	signal_add(&ps->ps_evsigint, NULL);
	signal_add(&ps->ps_evsigterm, NULL);
	signal_add(&ps->ps_evsighup, NULL);
	signal_add(&ps->ps_evsigpipe, NULL);
	signal_add(&ps->ps_evsigusr1, NULL);

	if (!env->vmd_noaction)
		proc_connect(ps);

	if (vmd_configure() == -1)
		fatalx("configuration failed");

	event_dispatch();

	log_debug("parent exiting");

	return (0);
}
885 
886 void
887 start_vm_batch(int fd, short type, void *args)
888 {
889 	int		i = 0;
890 	struct vmd_vm	*vm;
891 
892 	log_debug("%s: starting batch of %d vms", __func__,
893 	    env->vmd_cfg.parallelism);
894 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
895 		if (!(vm->vm_state & VM_STATE_WAITING)) {
896 			log_debug("%s: not starting vm %s (disabled)",
897 			    __func__,
898 			    vm->vm_params.vmc_params.vcp_name);
899 			continue;
900 		}
901 		i++;
902 		if (i > env->vmd_cfg.parallelism) {
903 			evtimer_add(&staggered_start_timer,
904 			    &env->vmd_cfg.delay);
905 			break;
906 		}
907 		vm->vm_state &= ~VM_STATE_WAITING;
908 		config_setvm(&env->vmd_ps, vm, -1, vm->vm_params.vmc_owner.uid);
909 	}
910 	log_debug("%s: done starting vms", __func__);
911 }
912 
/*
 * vmd_configure
 *
 * Parent-side startup configuration: open the pty master device,
 * pledge, parse the config file, push the global config to the
 * children, create the configured switches and kick off the first
 * staggered batch of VM starts.  Note the ordering: pledge() must
 * precede parse_config(), and the ptm fd is opened before pledge
 * drops the ability to open it.
 *
 * Returns 0 on success, -1 on failure (fatal errors exit directly).
 */
int
vmd_configure(void)
{
	int			ncpus;
	struct vmd_switch	*vsw;
	int ncpu_mib[] = {CTL_HW, HW_NCPUONLINE};
	size_t ncpus_sz = sizeof(ncpus);

	if ((env->vmd_ptmfd = open(PATH_PTMDEV, O_RDWR|O_CLOEXEC)) == -1)
		fatal("open %s", PATH_PTMDEV);

	/*
	 * pledge in the parent process:
	 * stdio - for malloc and basic I/O including events.
	 * rpath - for reload to open and read the configuration files.
	 * wpath - for opening disk images and tap devices.
	 * tty - for openpty and TIOCUCNTL.
	 * proc - run kill to terminate its children safely.
	 * sendfd - for disks, interfaces and other fds.
	 * recvfd - for send and receive.
	 * getpw - lookup user or group id by name.
	 * chown, fattr - change tty ownership
	 * flock - locking disk files
	 */
	if (pledge("stdio rpath wpath proc tty recvfd sendfd getpw"
	    " chown fattr flock", NULL) == -1)
		fatal("pledge");

	if (parse_config(env->vmd_conffile) == -1) {
		/* bad config at startup is fatal for the whole daemon */
		proc_kill(&env->vmd_ps);
		exit(1);
	}

	if (env->vmd_noaction) {
		/* -n: config check only, tear everything down again */
		fprintf(stderr, "configuration OK\n");
		proc_kill(&env->vmd_ps);
		exit(0);
	}

	/* Send shared global configuration to all children */
	if (config_setconfig(env) == -1)
		return (-1);

	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
		if (vsw->sw_running)
			continue;
		if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
			log_warn("%s: failed to create switch %s",
			    __func__, vsw->sw_name);
			switch_remove(vsw);
			return (-1);
		}
	}

	/* default staggering: one batch per online CPU, fixed delay */
	if (!(env->vmd_cfg.cfg_flags & VMD_CFG_STAGGERED_START)) {
		env->vmd_cfg.delay.tv_sec = VMD_DEFAULT_STAGGERED_START_DELAY;
		if (sysctl(ncpu_mib, NELEM(ncpu_mib), &ncpus, &ncpus_sz, NULL, 0) == -1)
			ncpus = 1;
		env->vmd_cfg.parallelism = ncpus;
		log_debug("%s: setting staggered start configuration to "
		    "parallelism: %d and delay: %lld",
		    __func__, ncpus, (long long) env->vmd_cfg.delay.tv_sec);
	}

	log_debug("%s: starting vms in staggered fashion", __func__);
	evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
	/* start first batch */
	start_vm_batch(0, 0, NULL);

	return (0);
}
984 
985 int
986 vmd_reload(unsigned int reset, const char *filename)
987 {
988 	struct vmd_vm		*vm, *next_vm;
989 	struct vmd_switch	*vsw;
990 	int			 reload = 0;
991 
992 	/* Switch back to the default config file */
993 	if (filename == NULL || *filename == '\0') {
994 		filename = env->vmd_conffile;
995 		reload = 1;
996 	}
997 
998 	log_debug("%s: level %d config file %s", __func__, reset, filename);
999 
1000 	if (reset) {
1001 		/* Purge the configuration */
1002 		config_purge(env, reset);
1003 		config_setreset(env, reset);
1004 	} else {
1005 		/*
1006 		 * Load or reload the configuration.
1007 		 *
1008 		 * Reloading removes all non-running VMs before processing the
1009 		 * config file, whereas loading only adds to the existing list
1010 		 * of VMs.
1011 		 */
1012 
1013 		if (reload) {
1014 			TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry,
1015 			    next_vm) {
1016 				if (!(vm->vm_state & VM_STATE_RUNNING)) {
1017 					DPRINTF("%s: calling vm_remove",
1018 					    __func__);
1019 					vm_remove(vm, __func__);
1020 				}
1021 			}
1022 		}
1023 
1024 		if (parse_config(filename) == -1) {
1025 			log_debug("%s: failed to load config file %s",
1026 			    __func__, filename);
1027 			return (-1);
1028 		}
1029 
1030 		if (reload) {
1031 			/* Update shared global configuration in all children */
1032 			if (config_setconfig(env) == -1)
1033 				return (-1);
1034 		}
1035 
1036 		TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1037 			if (vsw->sw_running)
1038 				continue;
1039 			if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
1040 				log_warn("%s: failed to create switch %s",
1041 				    __func__, vsw->sw_name);
1042 				switch_remove(vsw);
1043 				return (-1);
1044 			}
1045 		}
1046 
1047 		log_debug("%s: starting vms in staggered fashion", __func__);
1048 		evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
1049 		/* start first batch */
1050 		start_vm_batch(0, 0, NULL);
1051 
1052 		}
1053 
1054 	return (0);
1055 }
1056 
1057 void
1058 vmd_shutdown(void)
1059 {
1060 	struct vmd_vm *vm, *vm_next;
1061 
1062 	log_debug("%s: performing shutdown", __func__);
1063 
1064 	TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) {
1065 		vm_remove(vm, __func__);
1066 	}
1067 
1068 	proc_kill(&env->vmd_ps);
1069 	free(env);
1070 
1071 	log_warnx("parent terminating");
1072 	exit(0);
1073 }
1074 
1075 struct vmd_vm *
1076 vm_getbyvmid(uint32_t vmid)
1077 {
1078 	struct vmd_vm	*vm;
1079 
1080 	if (vmid == 0)
1081 		return (NULL);
1082 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1083 		if (vm->vm_vmid == vmid)
1084 			return (vm);
1085 	}
1086 
1087 	return (NULL);
1088 }
1089 
1090 struct vmd_vm *
1091 vm_getbyid(uint32_t id)
1092 {
1093 	struct vmd_vm	*vm;
1094 
1095 	if (id == 0)
1096 		return (NULL);
1097 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1098 		if (vm->vm_params.vmc_params.vcp_id == id)
1099 			return (vm);
1100 	}
1101 
1102 	return (NULL);
1103 }
1104 
1105 uint32_t
1106 vm_id2vmid(uint32_t id, struct vmd_vm *vm)
1107 {
1108 	if (vm == NULL && (vm = vm_getbyid(id)) == NULL)
1109 		return (0);
1110 	DPRINTF("%s: vmm id %u is vmid %u", __func__,
1111 	    id, vm->vm_vmid);
1112 	return (vm->vm_vmid);
1113 }
1114 
1115 uint32_t
1116 vm_vmid2id(uint32_t vmid, struct vmd_vm *vm)
1117 {
1118 	if (vm == NULL && (vm = vm_getbyvmid(vmid)) == NULL)
1119 		return (0);
1120 	DPRINTF("%s: vmid %u is vmm id %u", __func__,
1121 	    vmid, vm->vm_params.vmc_params.vcp_id);
1122 	return (vm->vm_params.vmc_params.vcp_id);
1123 }
1124 
1125 struct vmd_vm *
1126 vm_getbyname(const char *name)
1127 {
1128 	struct vmd_vm	*vm;
1129 
1130 	if (name == NULL)
1131 		return (NULL);
1132 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1133 		if (strcmp(vm->vm_params.vmc_params.vcp_name, name) == 0)
1134 			return (vm);
1135 	}
1136 
1137 	return (NULL);
1138 }
1139 
1140 struct vmd_vm *
1141 vm_getbypid(pid_t pid)
1142 {
1143 	struct vmd_vm	*vm;
1144 
1145 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1146 		if (vm->vm_pid == pid)
1147 			return (vm);
1148 	}
1149 
1150 	return (NULL);
1151 }
1152 
/*
 * vm_stop
 *
 * Tears down the runtime state of a VM: clears its running flags,
 * releases the per-user accounting reference, and closes the imsg
 * channel, disk, nic, kernel and cdrom descriptors.  The vmd_vm
 * structure itself is not freed; see vm_remove() for that.
 *
 * Parameters:
 *  vm: the VM to stop (NULL is a no-op)
 *  keeptty: when non-zero, leave the console tty open
 *  caller: name of the calling function, used for logging only
 */
void
vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
{
	struct privsep	*ps = &env->vmd_ps;
	unsigned int	 i, j;

	if (vm == NULL)
		return;

	log_debug("%s: %s %s stopping vm %d%s",
	    __func__, ps->ps_title[privsep_process], caller,
	    vm->vm_vmid, keeptty ? ", keeping tty open" : "");

	vm->vm_state &= ~(VM_STATE_RUNNING | VM_STATE_SHUTDOWN);

	/* Drop this VM's resource counters and its user reference. */
	user_inc(&vm->vm_params.vmc_params, vm->vm_user, 0);
	user_put(vm->vm_user);

	/* Close the imsg channel to the vm process, if one was set up. */
	if (vm->vm_iev.ibuf.fd != -1) {
		event_del(&vm->vm_iev.ev);
		close(vm->vm_iev.ibuf.fd);
	}
	/* Close every disk image, including base images. */
	for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) {
		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
			if (vm->vm_disks[i][j] != -1) {
				close(vm->vm_disks[i][j]);
				vm->vm_disks[i][j] = -1;
			}
		}
	}
	/* Close the nic descriptors and free their name strings. */
	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) {
		if (vm->vm_ifs[i].vif_fd != -1) {
			close(vm->vm_ifs[i].vif_fd);
			vm->vm_ifs[i].vif_fd = -1;
		}
		free(vm->vm_ifs[i].vif_name);
		free(vm->vm_ifs[i].vif_switch);
		free(vm->vm_ifs[i].vif_group);
		vm->vm_ifs[i].vif_name = NULL;
		vm->vm_ifs[i].vif_switch = NULL;
		vm->vm_ifs[i].vif_group = NULL;
	}
	if (vm->vm_kernel != -1) {
		close(vm->vm_kernel);
		vm->vm_kernel = -1;
	}
	if (vm->vm_cdrom != -1) {
		close(vm->vm_cdrom);
		vm->vm_cdrom = -1;
	}
	if (!keeptty) {
		vm_closetty(vm);
		vm->vm_uid = 0;
	}
}
1208 
/*
 * vm_remove
 *
 * Unlinks a VM from the running configuration, stops it and frees the
 * structure.  NULL is a no-op.
 *
 * NOTE(review): user_put() is called here AND inside vm_stop(), so the
 * user refcount drops twice per removal — presumably balancing two
 * separate user_get() references; confirm against the callers before
 * changing.
 */
void
vm_remove(struct vmd_vm *vm, const char *caller)
{
	struct privsep	*ps = &env->vmd_ps;

	if (vm == NULL)
		return;

	log_debug("%s: %s %s removing vm %d from running config",
	    __func__, ps->ps_title[privsep_process], caller,
	    vm->vm_vmid);

	TAILQ_REMOVE(env->vmd_vms, vm, vm_entry);

	user_put(vm->vm_user);
	vm_stop(vm, 0, caller);
	free(vm);
}
1227 
/*
 * vm_claimid
 *
 * Returns a stable unique id for the (name, uid) pair, remembering the
 * assignment in env->vmd_known so that a VM keeps the same id when it
 * is stopped and started again.
 *
 * Parameters:
 *  name: the VM name (lookup key together with uid)
 *  uid: owner uid
 *  id: output, receives the claimed id
 *
 * Return values:
 *   0: success, *id is valid
 *  -1: id space exhausted, allocation failure, or name too long
 */
int
vm_claimid(const char *name, int uid, uint32_t *id)
{
	struct name2id *n2i = NULL;

	/* Reuse a previously claimed id for this name/uid pair. */
	TAILQ_FOREACH(n2i, env->vmd_known, entry)
		if (strcmp(n2i->name, name) == 0 && n2i->uid == uid)
			goto out;

	/* Ids are handed out sequentially; wrap to 0 means exhaustion. */
	if (++env->vmd_nvm == 0) {
		log_warnx("too many vms");
		return -1;
	}
	/*
	 * NOTE(review): on the failure paths below the already
	 * incremented vmd_nvm id is never given back — the id is
	 * leaked.  Harmless for a 32-bit counter, but worth confirming.
	 */
	if ((n2i = calloc(1, sizeof(struct name2id))) == NULL) {
		log_warnx("could not alloc vm name");
		return -1;
	}
	n2i->id = env->vmd_nvm;
	n2i->uid = uid;
	if (strlcpy(n2i->name, name, sizeof(n2i->name)) >= sizeof(n2i->name)) {
		log_warnx("vm name too long");
		free(n2i);
		return -1;
	}
	TAILQ_INSERT_TAIL(env->vmd_known, n2i, entry);

out:
	*id = n2i->id;
	return 0;
}
1258 
1259 int
1260 vm_register(struct privsep *ps, struct vmop_create_params *vmc,
1261     struct vmd_vm **ret_vm, uint32_t id, uid_t uid)
1262 {
1263 	struct vmd_vm		*vm = NULL, *vm_parent = NULL;
1264 	struct vm_create_params	*vcp = &vmc->vmc_params;
1265 	struct vmop_owner	*vmo = NULL;
1266 	struct vmd_user		*usr = NULL;
1267 	uint32_t		 nid, rng;
1268 	unsigned int		 i, j;
1269 	struct vmd_switch	*sw;
1270 	char			*s;
1271 	int			 ret = 0;
1272 
1273 	/* Check if this is an instance of another VM */
1274 	if ((ret = vm_instance(ps, &vm_parent, vmc, uid)) != 0) {
1275 		errno = ret; /* XXX might set invalid errno */
1276 		return (-1);
1277 	}
1278 
1279 	errno = 0;
1280 	*ret_vm = NULL;
1281 
1282 	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
1283 	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
1284 		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
1285 		    uid) != 0) {
1286 			errno = EPERM;
1287 			goto fail;
1288 		}
1289 		*ret_vm = vm;
1290 		errno = EALREADY;
1291 		goto fail;
1292 	}
1293 
1294 	if (vm_parent != NULL)
1295 		vmo = &vm_parent->vm_params.vmc_insowner;
1296 
1297 	/* non-root users can only start existing VMs or instances */
1298 	if (vm_checkperm(NULL, vmo, uid) != 0) {
1299 		log_warnx("permission denied");
1300 		errno = EPERM;
1301 		goto fail;
1302 	}
1303 	if (vmc->vmc_flags == 0) {
1304 		log_warnx("invalid configuration, no devices");
1305 		errno = VMD_DISK_MISSING;
1306 		goto fail;
1307 	}
1308 	if (vcp->vcp_ncpus == 0)
1309 		vcp->vcp_ncpus = 1;
1310 	if (vcp->vcp_memranges[0].vmr_size == 0)
1311 		vcp->vcp_memranges[0].vmr_size = VM_DEFAULT_MEMORY;
1312 	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) {
1313 		log_warnx("invalid number of CPUs");
1314 		goto fail;
1315 	} else if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) {
1316 		log_warnx("invalid number of disks");
1317 		goto fail;
1318 	} else if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) {
1319 		log_warnx("invalid number of interfaces");
1320 		goto fail;
1321 	} else if (strlen(vcp->vcp_kernel) == 0 &&
1322 	    vcp->vcp_ndisks == 0 && strlen(vcp->vcp_cdrom) == 0) {
1323 		log_warnx("no kernel or disk/cdrom specified");
1324 		goto fail;
1325 	} else if (strlen(vcp->vcp_name) == 0) {
1326 		log_warnx("invalid VM name");
1327 		goto fail;
1328 	} else if (*vcp->vcp_name == '-' || *vcp->vcp_name == '.' ||
1329 	    *vcp->vcp_name == '_') {
1330 		log_warnx("invalid VM name");
1331 		goto fail;
1332 	} else {
1333 		for (s = vcp->vcp_name; *s != '\0'; ++s) {
1334 			if (!(isalnum(*s) || *s == '.' || *s == '-' ||
1335 			    *s == '_')) {
1336 				log_warnx("invalid VM name");
1337 				goto fail;
1338 			}
1339 		}
1340 	}
1341 
1342 	/* track active users */
1343 	if (uid != 0 && env->vmd_users != NULL &&
1344 	    (usr = user_get(uid)) == NULL) {
1345 		log_warnx("could not add user");
1346 		goto fail;
1347 	}
1348 
1349 	if ((vm = calloc(1, sizeof(*vm))) == NULL)
1350 		goto fail;
1351 
1352 	memcpy(&vm->vm_params, vmc, sizeof(vm->vm_params));
1353 	vmc = &vm->vm_params;
1354 	vcp = &vmc->vmc_params;
1355 	vm->vm_pid = -1;
1356 	vm->vm_tty = -1;
1357 	vm->vm_receive_fd = -1;
1358 	vm->vm_state &= ~VM_STATE_PAUSED;
1359 	vm->vm_user = usr;
1360 
1361 	for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
1362 		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
1363 			vm->vm_disks[i][j] = -1;
1364 	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
1365 		vm->vm_ifs[i].vif_fd = -1;
1366 	for (i = 0; i < vcp->vcp_nnics; i++) {
1367 		if ((sw = switch_getbyname(vmc->vmc_ifswitch[i])) != NULL) {
1368 			/* inherit per-interface flags from the switch */
1369 			vmc->vmc_ifflags[i] |= (sw->sw_flags & VMIFF_OPTMASK);
1370 		}
1371 
1372 		/*
1373 		 * If the MAC address is zero, always randomize it in vmd(8)
1374 		 * because we cannot rely on the guest OS to do the right
1375 		 * thing like OpenBSD does.  Based on ether_fakeaddr()
1376 		 * from the kernel, incremented by one to differentiate
1377 		 * the source.
1378 		 */
1379 		if (memcmp(zero_mac, &vcp->vcp_macs[i], ETHER_ADDR_LEN) == 0) {
1380 			rng = arc4random();
1381 			vcp->vcp_macs[i][0] = 0xfe;
1382 			vcp->vcp_macs[i][1] = 0xe1;
1383 			vcp->vcp_macs[i][2] = 0xba + 1;
1384 			vcp->vcp_macs[i][3] = 0xd0 | ((i + 1) & 0xf);
1385 			vcp->vcp_macs[i][4] = rng;
1386 			vcp->vcp_macs[i][5] = rng >> 8;
1387 		}
1388 	}
1389 	vm->vm_kernel = -1;
1390 	vm->vm_cdrom = -1;
1391 	vm->vm_iev.ibuf.fd = -1;
1392 
1393 	/*
1394 	 * Assign a new internal Id if not specified and we succeed in
1395 	 * claiming a new Id.
1396 	 */
1397 	if (id != 0)
1398 		vm->vm_vmid = id;
1399 	else if (vm_claimid(vcp->vcp_name, uid, &nid) == -1)
1400 		goto fail;
1401 	else
1402 		vm->vm_vmid = nid;
1403 
1404 	log_debug("%s: registering vm %d", __func__, vm->vm_vmid);
1405 	TAILQ_INSERT_TAIL(env->vmd_vms, vm, vm_entry);
1406 
1407 	*ret_vm = vm;
1408 	return (0);
1409  fail:
1410 	if (errno == 0)
1411 		errno = EINVAL;
1412 	return (-1);
1413 }
1414 
1415 int
1416 vm_instance(struct privsep *ps, struct vmd_vm **vm_parent,
1417     struct vmop_create_params *vmc, uid_t uid)
1418 {
1419 	char			*name;
1420 	struct vm_create_params	*vcp = &vmc->vmc_params;
1421 	struct vmop_create_params *vmcp;
1422 	struct vm_create_params	*vcpp;
1423 	struct vmd_vm		*vm = NULL;
1424 	unsigned int		 i, j;
1425 	uint32_t		 id;
1426 
1427 	/* return without error if the parent is NULL (nothing to inherit) */
1428 	if ((vmc->vmc_flags & VMOP_CREATE_INSTANCE) == 0 ||
1429 	    vmc->vmc_instance[0] == '\0')
1430 		return (0);
1431 
1432 	if ((*vm_parent = vm_getbyname(vmc->vmc_instance)) == NULL) {
1433 		return (VMD_PARENT_INVALID);
1434 	}
1435 
1436 	vmcp = &(*vm_parent)->vm_params;
1437 	vcpp = &vmcp->vmc_params;
1438 
1439 	/* Are we allowed to create an instance from this VM? */
1440 	if (vm_checkperm(NULL, &vmcp->vmc_insowner, uid) != 0) {
1441 		log_warnx("vm \"%s\" no permission to create vm instance",
1442 		    vcpp->vcp_name);
1443 		return (ENAMETOOLONG);
1444 	}
1445 
1446 	id = vcp->vcp_id;
1447 	name = vcp->vcp_name;
1448 
1449 	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
1450 	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
1451 		return (EPROCLIM);
1452 	}
1453 
1454 	/* CPU */
1455 	if (vcp->vcp_ncpus == 0)
1456 		vcp->vcp_ncpus = vcpp->vcp_ncpus;
1457 	if (vm_checkinsflag(vmcp, VMOP_CREATE_CPU, uid) != 0 &&
1458 	    vcp->vcp_ncpus != vcpp->vcp_ncpus) {
1459 		log_warnx("vm \"%s\" no permission to set cpus", name);
1460 		return (EPERM);
1461 	}
1462 
1463 	/* memory */
1464 	if (vcp->vcp_memranges[0].vmr_size == 0)
1465 		vcp->vcp_memranges[0].vmr_size =
1466 		    vcpp->vcp_memranges[0].vmr_size;
1467 	if (vm_checkinsflag(vmcp, VMOP_CREATE_MEMORY, uid) != 0 &&
1468 	    vcp->vcp_memranges[0].vmr_size !=
1469 	    vcpp->vcp_memranges[0].vmr_size) {
1470 		log_warnx("vm \"%s\" no permission to set memory", name);
1471 		return (EPERM);
1472 	}
1473 
1474 	/* disks cannot be inherited */
1475 	if (vm_checkinsflag(vmcp, VMOP_CREATE_DISK, uid) != 0 &&
1476 	    vcp->vcp_ndisks) {
1477 		log_warnx("vm \"%s\" no permission to set disks", name);
1478 		return (EPERM);
1479 	}
1480 	for (i = 0; i < vcp->vcp_ndisks; i++) {
1481 		/* Check if this disk is already used in the parent */
1482 		for (j = 0; j < vcpp->vcp_ndisks; j++) {
1483 			if (strcmp(vcp->vcp_disks[i],
1484 			    vcpp->vcp_disks[j]) == 0) {
1485 				log_warnx("vm \"%s\" disk %s cannot be reused",
1486 				    name, vcp->vcp_disks[i]);
1487 				return (EBUSY);
1488 			}
1489 		}
1490 		vmc->vmc_checkaccess |= VMOP_CREATE_DISK;
1491 	}
1492 
1493 	/* interfaces */
1494 	if (vcp->vcp_nnics > 0 &&
1495 	    vm_checkinsflag(vmcp, VMOP_CREATE_NETWORK, uid) != 0 &&
1496 	    vcp->vcp_nnics != vcpp->vcp_nnics) {
1497 		log_warnx("vm \"%s\" no permission to set interfaces", name);
1498 		return (EPERM);
1499 	}
1500 	for (i = 0; i < vcpp->vcp_nnics; i++) {
1501 		/* Interface got overwritten */
1502 		if (i < vcp->vcp_nnics)
1503 			continue;
1504 
1505 		/* Copy interface from parent */
1506 		vmc->vmc_ifflags[i] = vmcp->vmc_ifflags[i];
1507 		(void)strlcpy(vmc->vmc_ifnames[i], vmcp->vmc_ifnames[i],
1508 		    sizeof(vmc->vmc_ifnames[i]));
1509 		(void)strlcpy(vmc->vmc_ifswitch[i], vmcp->vmc_ifswitch[i],
1510 		    sizeof(vmc->vmc_ifswitch[i]));
1511 		(void)strlcpy(vmc->vmc_ifgroup[i], vmcp->vmc_ifgroup[i],
1512 		    sizeof(vmc->vmc_ifgroup[i]));
1513 		memcpy(vcp->vcp_macs[i], vcpp->vcp_macs[i],
1514 		    sizeof(vcp->vcp_macs[i]));
1515 		vmc->vmc_ifrdomain[i] = vmcp->vmc_ifrdomain[i];
1516 		vcp->vcp_nnics++;
1517 	}
1518 	for (i = 0; i < vcp->vcp_nnics; i++) {
1519 		for (j = 0; j < vcpp->vcp_nnics; j++) {
1520 			if (memcmp(zero_mac, vcp->vcp_macs[i],
1521 			    sizeof(vcp->vcp_macs[i])) != 0 &&
1522 			    memcmp(vcpp->vcp_macs[i], vcp->vcp_macs[i],
1523 			    sizeof(vcp->vcp_macs[i])) != 0) {
1524 				log_warnx("vm \"%s\" lladdr cannot be reused",
1525 				    name);
1526 				return (EBUSY);
1527 			}
1528 			if (strlen(vmc->vmc_ifnames[i]) &&
1529 			    strcmp(vmc->vmc_ifnames[i],
1530 			    vmcp->vmc_ifnames[j]) == 0) {
1531 				log_warnx("vm \"%s\" %s cannot be reused",
1532 				    vmc->vmc_ifnames[i], name);
1533 				return (EBUSY);
1534 			}
1535 		}
1536 	}
1537 
1538 	/* kernel */
1539 	if (strlen(vcp->vcp_kernel) > 0) {
1540 		if (vm_checkinsflag(vmcp, VMOP_CREATE_KERNEL, uid) != 0) {
1541 			log_warnx("vm \"%s\" no permission to set boot image",
1542 			    name);
1543 			return (EPERM);
1544 		}
1545 		vmc->vmc_checkaccess |= VMOP_CREATE_KERNEL;
1546 	} else if (strlcpy(vcp->vcp_kernel, vcpp->vcp_kernel,
1547 	    sizeof(vcp->vcp_kernel)) >= sizeof(vcp->vcp_kernel)) {
1548 		log_warnx("vm \"%s\" kernel name too long", name);
1549 		return (EINVAL);
1550 	}
1551 
1552 	/* cdrom */
1553 	if (strlen(vcp->vcp_cdrom) > 0) {
1554 		if (vm_checkinsflag(vmcp, VMOP_CREATE_CDROM, uid) != 0) {
1555 			log_warnx("vm \"%s\" no permission to set cdrom", name);
1556 			return (EPERM);
1557 		}
1558 		vmc->vmc_checkaccess |= VMOP_CREATE_CDROM;
1559 	} else if (strlcpy(vcp->vcp_cdrom, vcpp->vcp_cdrom,
1560 	    sizeof(vcp->vcp_cdrom)) >= sizeof(vcp->vcp_cdrom)) {
1561 		log_warnx("vm \"%s\" cdrom name too long", name);
1562 		return (EINVAL);
1563 	}
1564 
1565 	/* user */
1566 	if (vmc->vmc_owner.uid == 0)
1567 		vmc->vmc_owner.uid = vmcp->vmc_owner.uid;
1568 	else if (vmc->vmc_owner.uid != uid &&
1569 	    vmc->vmc_owner.uid != vmcp->vmc_owner.uid) {
1570 		log_warnx("vm \"%s\" user mismatch", name);
1571 		return (EPERM);
1572 	}
1573 
1574 	/* group */
1575 	if (vmc->vmc_owner.gid == 0)
1576 		vmc->vmc_owner.gid = vmcp->vmc_owner.gid;
1577 	else if (vmc->vmc_owner.gid != vmcp->vmc_owner.gid) {
1578 		log_warnx("vm \"%s\" group mismatch", name);
1579 		return (EPERM);
1580 	}
1581 
1582 	/* child instances */
1583 	if (vmc->vmc_insflags) {
1584 		log_warnx("vm \"%s\" cannot change instance permissions", name);
1585 		return (EPERM);
1586 	}
1587 	if (vmcp->vmc_insflags & VMOP_CREATE_INSTANCE) {
1588 		vmc->vmc_insowner.gid = vmcp->vmc_insowner.gid;
1589 		vmc->vmc_insowner.uid = vmcp->vmc_insowner.gid;
1590 		vmc->vmc_insflags = vmcp->vmc_insflags;
1591 	} else {
1592 		vmc->vmc_insowner.gid = 0;
1593 		vmc->vmc_insowner.uid = 0;
1594 		vmc->vmc_insflags = 0;
1595 	}
1596 
1597 	/* finished, remove instance flags */
1598 	vmc->vmc_flags &= ~VMOP_CREATE_INSTANCE;
1599 
1600 	return (0);
1601 }
1602 
1603 /*
1604  * vm_checkperm
1605  *
1606  * Checks if the user represented by the 'uid' parameter is allowed to
1607  * manipulate the VM described by the 'vm' parameter (or connect to said VM's
1608  * console.)
1609  *
1610  * Parameters:
1611  *  vm: the VM whose permission is to be checked
1612  *  vmo: the required uid/gid to be checked
1613  *  uid: the user ID of the user making the request
1614  *
1615  * Return values:
1616  *   0: the permission should be granted
1617  *  -1: the permission check failed (also returned if vm == null)
1618  */
1619 int
1620 vm_checkperm(struct vmd_vm *vm, struct vmop_owner *vmo, uid_t uid)
1621 {
1622 	struct group	*gr;
1623 	struct passwd	*pw;
1624 	char		**grmem;
1625 
1626 	/* root has no restrictions */
1627 	if (uid == 0)
1628 		return (0);
1629 
1630 	if (vmo == NULL)
1631 		return (-1);
1632 
1633 	/* check user */
1634 	if (vm == NULL) {
1635 		if  (vmo->uid == uid)
1636 			return (0);
1637 	} else {
1638 		/*
1639 		 * check user of running vm (the owner of a running vm can
1640 		 * be different to (or more specific than) the configured owner.
1641 		 */
1642 		if (((vm->vm_state & VM_STATE_RUNNING) && vm->vm_uid == uid) ||
1643 		    (!(vm->vm_state & VM_STATE_RUNNING) && vmo->uid == uid))
1644 			return (0);
1645 	}
1646 
1647 	/* check groups */
1648 	if (vmo->gid != -1) {
1649 		if ((pw = getpwuid(uid)) == NULL)
1650 			return (-1);
1651 		if (pw->pw_gid == vmo->gid)
1652 			return (0);
1653 		if ((gr = getgrgid(vmo->gid)) != NULL) {
1654 			for (grmem = gr->gr_mem; *grmem; grmem++)
1655 				if (strcmp(*grmem, pw->pw_name) == 0)
1656 					return (0);
1657 		}
1658 	}
1659 
1660 	return (-1);
1661 }
1662 
1663 /*
1664  * vm_checkinsflag
1665  *
1666  * Checks wheter the non-root user is allowed to set an instance option.
1667  *
1668  * Parameters:
1669  *  vmc: the VM create parameters
1670  *  flag: the flag to be checked
1671  *  uid: the user ID of the user making the request
1672  *
1673  * Return values:
1674  *   0: the permission should be granted
1675  *  -1: the permission check failed (also returned if vm == null)
1676  */
1677 int
1678 vm_checkinsflag(struct vmop_create_params *vmc, unsigned int flag, uid_t uid)
1679 {
1680 	/* root has no restrictions */
1681 	if (uid == 0)
1682 		return (0);
1683 
1684 	if ((vmc->vmc_insflags & flag) == 0)
1685 		return (-1);
1686 
1687 	return (0);
1688 }
1689 
1690 /*
1691  * vm_checkaccess
1692  *
1693  * Checks if the user represented by the 'uid' parameter is allowed to
1694  * access the file described by the 'path' parameter.
1695  *
1696  * Parameters:
1697  *  fd: the file descriptor of the opened file
1698  *  uflag: check if the userid has access to the file
1699  *  uid: the user ID of the user making the request
1700  *  amode: the access flags of R_OK and W_OK
1701  *
1702  * Return values:
1703  *   0: the permission should be granted
1704  *  -1: the permission check failed
1705  */
1706 int
1707 vm_checkaccess(int fd, unsigned int uflag, uid_t uid, int amode)
1708 {
1709 	struct group	*gr;
1710 	struct passwd	*pw;
1711 	char		**grmem;
1712 	struct stat	 st;
1713 	mode_t		 mode;
1714 
1715 	if (fd == -1)
1716 		return (-1);
1717 
1718 	/*
1719 	 * File has to be accessible and a regular file
1720 	 */
1721 	if (fstat(fd, &st) == -1 || !S_ISREG(st.st_mode))
1722 		return (-1);
1723 
1724 	/* root has no restrictions */
1725 	if (uid == 0 || uflag == 0)
1726 		return (0);
1727 
1728 	/* check other */
1729 	mode = amode & W_OK ? S_IWOTH : 0;
1730 	mode |= amode & R_OK ? S_IROTH : 0;
1731 	if ((st.st_mode & mode) == mode)
1732 		return (0);
1733 
1734 	/* check user */
1735 	mode = amode & W_OK ? S_IWUSR : 0;
1736 	mode |= amode & R_OK ? S_IRUSR : 0;
1737 	if (uid == st.st_uid && (st.st_mode & mode) == mode)
1738 		return (0);
1739 
1740 	/* check groups */
1741 	mode = amode & W_OK ? S_IWGRP : 0;
1742 	mode |= amode & R_OK ? S_IRGRP : 0;
1743 	if ((st.st_mode & mode) != mode)
1744 		return (-1);
1745 	if ((pw = getpwuid(uid)) == NULL)
1746 		return (-1);
1747 	if (pw->pw_gid == st.st_gid)
1748 		return (0);
1749 	if ((gr = getgrgid(st.st_gid)) != NULL) {
1750 		for (grmem = gr->gr_mem; *grmem; grmem++)
1751 			if (strcmp(*grmem, pw->pw_name) == 0)
1752 				return (0);
1753 	}
1754 
1755 	return (-1);
1756 }
1757 
/*
 * vm_opentty
 *
 * Allocates a console tty for the VM from the pre-opened PTM device,
 * stores its controller fd and name in the VM, and adjusts ownership
 * and mode of the slave side for the VM owner.
 *
 * Parameters:
 *  vm: the VM that receives the console tty
 *
 * Return values:
 *   0: success, vm_tty and vm_ttyname are set
 *  -1: failure; any partially set up tty state is released
 */
int
vm_opentty(struct vmd_vm *vm)
{
	struct ptmget		 ptm;
	struct stat		 st;
	struct group		*gr;
	uid_t			 uid;
	gid_t			 gid;
	mode_t			 mode;
	int			 on;

	/*
	 * Open tty with pre-opened PTM fd
	 */
	if ((ioctl(env->vmd_ptmfd, PTMGET, &ptm) == -1))
		return (-1);

	/*
	 * We use user ioctl(2) mode to pass break commands.
	 */
	on = 1;
	if (ioctl(ptm.cfd, TIOCUCNTL, &on) == -1)
		fatal("could not enable user ioctl mode");

	/* Keep the controller fd; the slave is reopened by name later. */
	vm->vm_tty = ptm.cfd;
	close(ptm.sfd);
	if ((vm->vm_ttyname = strdup(ptm.sn)) == NULL)
		goto fail;

	uid = vm->vm_uid;
	gid = vm->vm_params.vmc_owner.gid;

	/* Pick the tty group and mode depending on the configured owner. */
	if (vm->vm_params.vmc_owner.gid != -1) {
		mode = 0660;
	} else if ((gr = getgrnam("tty")) != NULL) {
		gid = gr->gr_gid;
		mode = 0620;
	} else {
		mode = 0600;
		gid = 0;
	}

	log_debug("%s: vm %s tty %s uid %d gid %d mode %o",
	    __func__, vm->vm_params.vmc_params.vcp_name,
	    vm->vm_ttyname, uid, gid, mode);

	/*
	 * Change ownership and mode of the tty as required.
	 * Loosely based on the implementation of sshpty.c
	 */
	if (stat(vm->vm_ttyname, &st) == -1)
		goto fail;

	if (st.st_uid != uid || st.st_gid != gid) {
		if (chown(vm->vm_ttyname, uid, gid) == -1) {
			log_warn("chown %s %d %d failed, uid %d",
			    vm->vm_ttyname, uid, gid, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	if ((st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO)) != mode) {
		if (chmod(vm->vm_ttyname, mode) == -1) {
			log_warn("chmod %s %o failed, uid %d",
			    vm->vm_ttyname, mode, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	return (0);
 fail:
	vm_closetty(vm);
	return (-1);
}
1840 
1841 void
1842 vm_closetty(struct vmd_vm *vm)
1843 {
1844 	if (vm->vm_tty != -1) {
1845 		/* Release and close the tty */
1846 		if (fchown(vm->vm_tty, 0, 0) == -1)
1847 			log_warn("chown %s 0 0 failed", vm->vm_ttyname);
1848 		if (fchmod(vm->vm_tty, 0666) == -1)
1849 			log_warn("chmod %s 0666 failed", vm->vm_ttyname);
1850 		close(vm->vm_tty);
1851 		vm->vm_tty = -1;
1852 	}
1853 	free(vm->vm_ttyname);
1854 	vm->vm_ttyname = NULL;
1855 }
1856 
1857 void
1858 switch_remove(struct vmd_switch *vsw)
1859 {
1860 	if (vsw == NULL)
1861 		return;
1862 
1863 	TAILQ_REMOVE(env->vmd_switches, vsw, sw_entry);
1864 
1865 	free(vsw->sw_group);
1866 	free(vsw->sw_name);
1867 	free(vsw);
1868 }
1869 
1870 struct vmd_switch *
1871 switch_getbyname(const char *name)
1872 {
1873 	struct vmd_switch	*vsw;
1874 
1875 	if (name == NULL)
1876 		return (NULL);
1877 	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1878 		if (strcmp(vsw->sw_name, name) == 0)
1879 			return (vsw);
1880 	}
1881 
1882 	return (NULL);
1883 }
1884 
1885 struct vmd_user *
1886 user_get(uid_t uid)
1887 {
1888 	struct vmd_user		*usr;
1889 
1890 	if (uid == 0)
1891 		return (NULL);
1892 
1893 	/* first try to find an existing user */
1894 	TAILQ_FOREACH(usr, env->vmd_users, usr_entry) {
1895 		if (usr->usr_id.uid == uid)
1896 			goto done;
1897 	}
1898 
1899 	if ((usr = calloc(1, sizeof(*usr))) == NULL) {
1900 		log_warn("could not allocate user");
1901 		return (NULL);
1902 	}
1903 
1904 	usr->usr_id.uid = uid;
1905 	usr->usr_id.gid = -1;
1906 	TAILQ_INSERT_TAIL(env->vmd_users, usr, usr_entry);
1907 
1908  done:
1909 	DPRINTF("%s: uid %d #%d +",
1910 	    __func__, usr->usr_id.uid, usr->usr_refcnt + 1);
1911 	usr->usr_refcnt++;
1912 
1913 	return (usr);
1914 }
1915 
1916 void
1917 user_put(struct vmd_user *usr)
1918 {
1919 	if (usr == NULL)
1920 		return;
1921 
1922 	DPRINTF("%s: uid %d #%d -",
1923 	    __func__, usr->usr_id.uid, usr->usr_refcnt - 1);
1924 
1925 	if (--usr->usr_refcnt > 0)
1926 		return;
1927 
1928 	TAILQ_REMOVE(env->vmd_users, usr, usr_entry);
1929 	free(usr);
1930 }
1931 
1932 void
1933 user_inc(struct vm_create_params *vcp, struct vmd_user *usr, int inc)
1934 {
1935 	char	 mem[FMT_SCALED_STRSIZE];
1936 
1937 	if (usr == NULL)
1938 		return;
1939 
1940 	/* increment or decrement counters */
1941 	inc = inc ? 1 : -1;
1942 
1943 	usr->usr_maxcpu += vcp->vcp_ncpus * inc;
1944 	usr->usr_maxmem += vcp->vcp_memranges[0].vmr_size * inc;
1945 	usr->usr_maxifs += vcp->vcp_nnics * inc;
1946 
1947 	if (log_getverbose() > 1) {
1948 		(void)fmt_scaled(usr->usr_maxmem * 1024 * 1024, mem);
1949 		log_debug("%s: %c uid %d ref %d cpu %llu mem %s ifs %llu",
1950 		    __func__, inc == 1 ? '+' : '-',
1951 		    usr->usr_id.uid, usr->usr_refcnt,
1952 		    usr->usr_maxcpu, mem, usr->usr_maxifs);
1953 	}
1954 }
1955 
1956 int
1957 user_checklimit(struct vmd_user *usr, struct vm_create_params *vcp)
1958 {
1959 	const char	*limit = "";
1960 
1961 	/* XXX make the limits configurable */
1962 	if (usr->usr_maxcpu > VM_DEFAULT_USER_MAXCPU) {
1963 		limit = "cpu ";
1964 		goto fail;
1965 	}
1966 	if (usr->usr_maxmem > VM_DEFAULT_USER_MAXMEM) {
1967 		limit = "memory ";
1968 		goto fail;
1969 	}
1970 	if (usr->usr_maxifs > VM_DEFAULT_USER_MAXIFS) {
1971 		limit = "interface ";
1972 		goto fail;
1973 	}
1974 
1975 	return (0);
1976 
1977  fail:
1978 	log_warnx("%s: user %d %slimit reached", vcp->vcp_name,
1979 	    usr->usr_id.uid, limit);
1980 	return (-1);
1981 }
1982 
1983 char *
1984 get_string(uint8_t *ptr, size_t len)
1985 {
1986 	size_t	 i;
1987 
1988 	for (i = 0; i < len; i++)
1989 		if (!isprint(ptr[i]))
1990 			break;
1991 
1992 	return strndup(ptr, i);
1993 }
1994 
/*
 * prefixlen2mask
 *
 * Converts an IPv4 prefix length into a netmask in network byte order.
 * Lengths above 32 are clamped to 32; 0 yields an empty mask.
 */
uint32_t
prefixlen2mask(uint8_t prefixlen)
{
	if (prefixlen == 0)
		return (0);
	if (prefixlen > 32)
		prefixlen = 32;

	/* Set the top 'prefixlen' bits, then convert to network order. */
	return (htonl(0xffffffff << (32 - prefixlen)));
}
2006 
/*
 * prefixlen2mask6
 *
 * Converts an IPv6 prefix length into a netmask.  Lengths above 128
 * are clamped to 128.  The result is written to 'mask'.
 */
void
prefixlen2mask6(uint8_t prefixlen, struct in6_addr *mask)
{
	struct in6_addr	 tmp;
	int		 nbytes, nbits, k;

	if (prefixlen > 128)
		prefixlen = 128;

	memset(&tmp, 0, sizeof(tmp));

	/* All-ones for every complete leading byte of the prefix. */
	nbytes = prefixlen / 8;
	for (k = 0; k < nbytes; k++)
		tmp.s6_addr[k] = 0xff;

	/* A trailing partial byte gets its top 'nbits' bits set. */
	nbits = prefixlen % 8;
	if (nbits != 0)
		tmp.s6_addr[nbytes] = 0xff00 >> nbits;

	memcpy(mask, &tmp, sizeof(tmp));
}
2025 
2026 void
2027 getmonotime(struct timeval *tv)
2028 {
2029 	struct timespec	 ts;
2030 
2031 	if (clock_gettime(CLOCK_MONOTONIC, &ts))
2032 		fatal("clock_gettime");
2033 
2034 	TIMESPEC_TO_TIMEVAL(tv, &ts);
2035 }
2036