xref: /openbsd/usr.sbin/vmd/vmd.c (revision 905646f0)
1 /*	$OpenBSD: vmd.c,v 1.119 2020/09/23 19:18:18 martijn Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/param.h>	/* nitems */
20 #include <sys/queue.h>
21 #include <sys/wait.h>
22 #include <sys/cdefs.h>
23 #include <sys/stat.h>
24 #include <sys/sysctl.h>
25 #include <sys/tty.h>
26 #include <sys/ttycom.h>
27 #include <sys/ioctl.h>
28 
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <termios.h>
33 #include <errno.h>
34 #include <event.h>
35 #include <fcntl.h>
36 #include <pwd.h>
37 #include <signal.h>
38 #include <syslog.h>
39 #include <unistd.h>
40 #include <util.h>
41 #include <ctype.h>
42 #include <pwd.h>
43 #include <grp.h>
44 
45 #include <machine/specialreg.h>
46 #include <machine/vmmvar.h>
47 
48 #include "proc.h"
49 #include "atomicio.h"
50 #include "vmd.h"
51 
52 __dead void usage(void);
53 
54 int	 main(int, char **);
55 int	 vmd_configure(void);
56 void	 vmd_sighdlr(int sig, short event, void *arg);
57 void	 vmd_shutdown(void);
58 int	 vmd_control_run(void);
59 int	 vmd_dispatch_control(int, struct privsep_proc *, struct imsg *);
60 int	 vmd_dispatch_vmm(int, struct privsep_proc *, struct imsg *);
61 int	 vmd_check_vmh(struct vm_dump_header *);
62 
63 int	 vm_instance(struct privsep *, struct vmd_vm **,
64 	    struct vmop_create_params *, uid_t);
65 int	 vm_checkinsflag(struct vmop_create_params *, unsigned int, uid_t);
66 int	 vm_claimid(const char *, int, uint32_t *);
67 void	 start_vm_batch(int, short, void*);
68 
69 struct vmd	*env;
70 
71 static struct privsep_proc procs[] = {
72 	/* Keep "priv" on top as procs[0] */
73 	{ "priv",	PROC_PRIV,	NULL, priv },
74 	{ "control",	PROC_CONTROL,	vmd_dispatch_control, control },
75 	{ "vmm",	PROC_VMM,	vmd_dispatch_vmm, vmm, vmm_shutdown },
76 };
77 
78 struct event staggered_start_timer;
79 
80 /* For the privileged process */
81 static struct privsep_proc *proc_priv = &procs[0];
82 static struct passwd proc_privpw;
83 static const uint8_t zero_mac[ETHER_ADDR_LEN];
84 
/*
 * Dispatch one imsg received from the control process.
 *
 * "res" accumulates an errno-style result and "cmd" the response type;
 * the second switch at the bottom sends the response back to the
 * control client (cmd == 0 means no response is produced here).
 * Returns 0 on success, -1 if the message could not be handled and the
 * imsg channel should be torn down.
 */
int
vmd_dispatch_control(int fd, struct privsep_proc *p, struct imsg *imsg)
{
	struct privsep			*ps = p->p_ps;
	int				 res = 0, ret = 0, cmd = 0, verbose;
	unsigned int			 v = 0, flags;
	struct vmop_create_params	 vmc;
	struct vmop_id			 vid;
	struct vmop_result		 vmr;
	struct vm_dump_header		 vmh;
	struct vmd_vm			*vm = NULL;
	char				*str = NULL;
	uint32_t			 id = 0;
	struct control_sock		*rcs;

	switch (imsg->hdr.type) {
	case IMSG_VMDOP_START_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vmc);
		memcpy(&vmc, imsg->data, sizeof(vmc));
		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
		if (vmc.vmc_flags == 0) {
			/* start an existing VM with pre-configured options */
			if (!(ret == -1 && errno == EALREADY &&
			    !(vm->vm_state & VM_STATE_RUNNING))) {
				res = errno;
				cmd = IMSG_VMDOP_START_VM_RESPONSE;
			}
		} else if (ret != 0) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		/* Hand the (new or existing) VM to the vmm process. */
		if (res == 0 &&
		    config_setvm(ps, vm,
		    imsg->hdr.peerid, vm->vm_params.vmc_owner.uid) == -1) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
		}
		break;
	case IMSG_VMDOP_WAIT_VM_REQUEST:
	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		flags = vid.vid_flags;

		if ((id = vid.vid_id) == 0) {
			/* Lookup vm (id) by name */
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
				break;
			} else if ((vm->vm_state & VM_STATE_SHUTDOWN) &&
			    (flags & VMOP_FORCE) == 0) {
				/* Already shutting down; only -f overrides. */
				res = EALREADY;
				cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
				break;
			} else if (!(vm->vm_state & VM_STATE_RUNNING)) {
				res = EINVAL;
				cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
				break;
			}
			id = vm->vm_vmid;
		} else if ((vm = vm_getbyvmid(id)) == NULL) {
			res = ENOENT;
			cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
			break;
		}
		/* Only the owner (or root) may terminate a VM. */
		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
		    vid.vid_uid) != 0) {
			res = EPERM;
			cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
			break;
		}

		/* Re-send a minimal, sanitized vid to the vmm process. */
		memset(&vid, 0, sizeof(vid));
		vid.vid_id = id;
		vid.vid_flags = flags;
		if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
		    imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1)
			return (-1);
		break;
	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
		break;
	case IMSG_VMDOP_LOAD:
		IMSG_SIZE_CHECK(imsg, str); /* at least one byte for path */
		/*
		 * NOTE(review): get_string() can return NULL (allocation
		 * failure); vmd_reload(0, NULL) then falls back to the
		 * default config file — confirm this is the intent.
		 */
		str = get_string((uint8_t *)imsg->data,
		    IMSG_DATA_SIZE(imsg));
		/* FALLTHROUGH */
	case IMSG_VMDOP_RELOAD:
		/* str is NULL for a plain RELOAD (use default config). */
		if (vmd_reload(0, str) == -1)
			cmd = IMSG_CTL_FAIL;
		else
			cmd = IMSG_CTL_OK;
		free(str);
		break;
	case IMSG_CTL_RESET:
		IMSG_SIZE_CHECK(imsg, &v);
		memcpy(&v, imsg->data, sizeof(v));
		if (vmd_reload(v, NULL) == -1)
			cmd = IMSG_CTL_FAIL;
		else
			cmd = IMSG_CTL_OK;
		break;
	case IMSG_CTL_VERBOSE:
		IMSG_SIZE_CHECK(imsg, &verbose);
		memcpy(&verbose, imsg->data, sizeof(verbose));
		log_setverbose(verbose);

		/* Propagate the new log level to the child processes. */
		proc_forward_imsg(ps, imsg, PROC_VMM, -1);
		proc_forward_imsg(ps, imsg, PROC_PRIV, -1);
		cmd = IMSG_CTL_OK;
		break;
	case IMSG_VMDOP_PAUSE_VM:
	case IMSG_VMDOP_UNPAUSE_VM:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		if (vid.vid_id == 0) {
			/* Lookup vm (id) by name */
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
				break;
			} else {
				vid.vid_id = vm->vm_vmid;
			}
		} else if ((vm = vm_getbyid(vid.vid_id)) == NULL) {
			res = ENOENT;
			cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
			break;
		}
		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
		    vid.vid_uid) != 0) {
			res = EPERM;
			cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
			break;
		}
		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
		    imsg->hdr.peerid, -1, &vid, sizeof(vid));
		break;
	case IMSG_VMDOP_SEND_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		id = vid.vid_id;
		if (vid.vid_id == 0) {
			if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
				res = ENOENT;
				cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
				/* Drop the received dump fd on error. */
				close(imsg->fd);
				break;
			} else {
				vid.vid_id = vm->vm_vmid;
			}
		} else if ((vm = vm_getbyvmid(vid.vid_id)) == NULL) {
			res = ENOENT;
			cmd = IMSG_VMDOP_SEND_VM_RESPONSE;
			close(imsg->fd);
			break;
		}
		vmr.vmr_id = vid.vid_id;
		log_debug("%s: sending fd to vmm", __func__);
		/* fd ownership passes to the vmm process with this imsg. */
		proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
		    imsg->hdr.peerid, imsg->fd, &vid, sizeof(vid));
		break;
	case IMSG_VMDOP_RECEIVE_VM_REQUEST:
		IMSG_SIZE_CHECK(imsg, &vid);
		memcpy(&vid, imsg->data, sizeof(vid));
		if (imsg->fd == -1) {
			log_warnx("%s: invalid fd", __func__);
			return (-1);
		}
		/* The dump file starts with a header, then the create params. */
		if (atomicio(read, imsg->fd, &vmh, sizeof(vmh)) !=
		    sizeof(vmh)) {
			log_warnx("%s: error reading vmh from received vm",
			    __func__);
			res = EIO;
			close(imsg->fd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}

		/* Refuse dumps from an incompatible version or CPU. */
		if (vmd_check_vmh(&vmh)) {
			res = ENOENT;
			close(imsg->fd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}
		if (atomicio(read, imsg->fd, &vmc, sizeof(vmc)) !=
		    sizeof(vmc)) {
			log_warnx("%s: error reading vmc from received vm",
			    __func__);
			res = EIO;
			close(imsg->fd);
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			break;
		}
		/* Register under the name requested by the client. */
		strlcpy(vmc.vmc_params.vcp_name, vid.vid_name,
		    sizeof(vmc.vmc_params.vcp_name));
		vmc.vmc_params.vcp_id = 0;

		ret = vm_register(ps, &vmc, &vm, 0, vmc.vmc_owner.uid);
		if (ret != 0) {
			res = errno;
			cmd = IMSG_VMDOP_START_VM_RESPONSE;
			close(imsg->fd);
		} else {
			vm->vm_state |= VM_STATE_RECEIVED;
			config_setvm(ps, vm, imsg->hdr.peerid,
			    vmc.vmc_owner.uid);
			log_debug("%s: sending fd to vmm", __func__);
			/* vmm reads the remaining VM state from this fd. */
			proc_compose_imsg(ps, PROC_VMM, -1,
			    IMSG_VMDOP_RECEIVE_VM_END, vm->vm_vmid, imsg->fd,
			    NULL, 0);
		}
		break;
	case IMSG_VMDOP_DONE:
		control_reset(&ps->ps_csock);
		TAILQ_FOREACH(rcs, &ps->ps_rcsocks, cs_entry)
			control_reset(rcs);
		cmd = 0;
		break;
	default:
		return (-1);
	}

	/* Send the collected response, if any, back to the control client. */
	switch (cmd) {
	case 0:
		break;
	case IMSG_VMDOP_START_VM_RESPONSE:
	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
		memset(&vmr, 0, sizeof(vmr));
		vmr.vmr_result = res;
		vmr.vmr_id = id;
		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
		    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
			return (-1);
		break;
	default:
		if (proc_compose_imsg(ps, PROC_CONTROL, -1, cmd,
		    imsg->hdr.peerid, -1, &res, sizeof(res)) == -1)
			return (-1);
		break;
	}

	return (0);
}
328 
329 int
330 vmd_dispatch_vmm(int fd, struct privsep_proc *p, struct imsg *imsg)
331 {
332 	struct vmop_result	 vmr;
333 	struct privsep		*ps = p->p_ps;
334 	int			 res = 0;
335 	struct vmd_vm		*vm;
336 	struct vm_create_params	*vcp;
337 	struct vmop_info_result	 vir;
338 
339 	switch (imsg->hdr.type) {
340 	case IMSG_VMDOP_PAUSE_VM_RESPONSE:
341 		IMSG_SIZE_CHECK(imsg, &vmr);
342 		memcpy(&vmr, imsg->data, sizeof(vmr));
343 		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
344 			break;
345 		proc_compose_imsg(ps, PROC_CONTROL, -1,
346 		    imsg->hdr.type, imsg->hdr.peerid, -1,
347 		    imsg->data, sizeof(imsg->data));
348 		log_info("%s: paused vm %d successfully",
349 		    vm->vm_params.vmc_params.vcp_name,
350 		    vm->vm_vmid);
351 		vm->vm_state |= VM_STATE_PAUSED;
352 		break;
353 	case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
354 		IMSG_SIZE_CHECK(imsg, &vmr);
355 		memcpy(&vmr, imsg->data, sizeof(vmr));
356 		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
357 			break;
358 		proc_compose_imsg(ps, PROC_CONTROL, -1,
359 		    imsg->hdr.type, imsg->hdr.peerid, -1,
360 		    imsg->data, sizeof(imsg->data));
361 		log_info("%s: unpaused vm %d successfully.",
362 		    vm->vm_params.vmc_params.vcp_name,
363 		    vm->vm_vmid);
364 		vm->vm_state &= ~VM_STATE_PAUSED;
365 		break;
366 	case IMSG_VMDOP_START_VM_RESPONSE:
367 		IMSG_SIZE_CHECK(imsg, &vmr);
368 		memcpy(&vmr, imsg->data, sizeof(vmr));
369 		if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL)
370 			break;
371 		vm->vm_pid = vmr.vmr_pid;
372 		vcp = &vm->vm_params.vmc_params;
373 		vcp->vcp_id = vmr.vmr_id;
374 
375 		/*
376 		 * If the peerid is not -1, forward the response back to the
377 		 * the control socket.  If it is -1, the request originated
378 		 * from the parent, not the control socket.
379 		 */
380 		if (vm->vm_peerid != (uint32_t)-1) {
381 			(void)strlcpy(vmr.vmr_ttyname, vm->vm_ttyname,
382 			    sizeof(vmr.vmr_ttyname));
383 			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
384 			    imsg->hdr.type, vm->vm_peerid, -1,
385 			    &vmr, sizeof(vmr)) == -1) {
386 				errno = vmr.vmr_result;
387 				log_warn("%s: failed to foward vm result",
388 				    vcp->vcp_name);
389 				vm_remove(vm, __func__);
390 				return (-1);
391 			}
392 		}
393 
394 		if (vmr.vmr_result) {
395 			errno = vmr.vmr_result;
396 			log_warn("%s: failed to start vm", vcp->vcp_name);
397 			vm_remove(vm, __func__);
398 			break;
399 		}
400 
401 		/* Now configure all the interfaces */
402 		if (vm_priv_ifconfig(ps, vm) == -1) {
403 			log_warn("%s: failed to configure vm", vcp->vcp_name);
404 			vm_remove(vm, __func__);
405 			break;
406 		}
407 
408 		log_info("%s: started vm %d successfully, tty %s",
409 		    vcp->vcp_name, vm->vm_vmid, vm->vm_ttyname);
410 		break;
411 	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
412 		IMSG_SIZE_CHECK(imsg, &vmr);
413 		memcpy(&vmr, imsg->data, sizeof(vmr));
414 		DPRINTF("%s: forwarding TERMINATE VM for vm id %d",
415 		    __func__, vmr.vmr_id);
416 		proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
417 		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
418 			break;
419 		if (vmr.vmr_result == 0) {
420 			/* Mark VM as shutting down */
421 			vm->vm_state |= VM_STATE_SHUTDOWN;
422 		}
423 		break;
424 	case IMSG_VMDOP_SEND_VM_RESPONSE:
425 		IMSG_SIZE_CHECK(imsg, &vmr);
426 		memcpy(&vmr, imsg->data, sizeof(vmr));
427 		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL)
428 			break;
429 		if (!vmr.vmr_result) {
430 			log_info("%s: sent vm %d successfully.",
431 			    vm->vm_params.vmc_params.vcp_name,
432 			    vm->vm_vmid);
433 			if (vm->vm_from_config)
434 				vm_stop(vm, 0, __func__);
435 			else
436 				vm_remove(vm, __func__);
437 		}
438 
439 		/* Send a response if a control client is waiting for it */
440 		if (imsg->hdr.peerid != (uint32_t)-1) {
441 			/* the error is meaningless for deferred responses */
442 			vmr.vmr_result = 0;
443 
444 			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
445 			    IMSG_VMDOP_SEND_VM_RESPONSE,
446 			    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
447 				return (-1);
448 		}
449 		break;
450 	case IMSG_VMDOP_TERMINATE_VM_EVENT:
451 		IMSG_SIZE_CHECK(imsg, &vmr);
452 		memcpy(&vmr, imsg->data, sizeof(vmr));
453 		DPRINTF("%s: handling TERMINATE_EVENT for vm id %d ret %d",
454 		    __func__, vmr.vmr_id, vmr.vmr_result);
455 		if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) {
456 			log_debug("%s: vm %d is no longer available",
457 			    __func__, vmr.vmr_id);
458 			break;
459 		}
460 		if (vmr.vmr_result != EAGAIN ||
461 		    vm->vm_params.vmc_bootdevice) {
462 			if (vm->vm_from_config)
463 				vm_stop(vm, 0, __func__);
464 			else
465 				vm_remove(vm, __func__);
466 		} else {
467 			/* Stop VM instance but keep the tty open */
468 			vm_stop(vm, 1, __func__);
469 			config_setvm(ps, vm, (uint32_t)-1, vm->vm_uid);
470 		}
471 
472 		/* Send a response if a control client is waiting for it */
473 		if (imsg->hdr.peerid != (uint32_t)-1) {
474 			/* the error is meaningless for deferred responses */
475 			vmr.vmr_result = 0;
476 
477 			if (proc_compose_imsg(ps, PROC_CONTROL, -1,
478 			    IMSG_VMDOP_TERMINATE_VM_RESPONSE,
479 			    imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
480 				return (-1);
481 		}
482 		break;
483 	case IMSG_VMDOP_GET_INFO_VM_DATA:
484 		IMSG_SIZE_CHECK(imsg, &vir);
485 		memcpy(&vir, imsg->data, sizeof(vir));
486 		if ((vm = vm_getbyvmid(vir.vir_info.vir_id)) != NULL) {
487 			memset(vir.vir_ttyname, 0, sizeof(vir.vir_ttyname));
488 			if (vm->vm_ttyname != NULL)
489 				strlcpy(vir.vir_ttyname, vm->vm_ttyname,
490 				    sizeof(vir.vir_ttyname));
491 			log_debug("%s: running vm: %d, vm_state: 0x%x",
492 			    __func__, vm->vm_vmid, vm->vm_state);
493 			vir.vir_state = vm->vm_state;
494 			/* get the user id who started the vm */
495 			vir.vir_uid = vm->vm_uid;
496 			vir.vir_gid = vm->vm_params.vmc_owner.gid;
497 		}
498 		if (proc_compose_imsg(ps, PROC_CONTROL, -1, imsg->hdr.type,
499 		    imsg->hdr.peerid, -1, &vir, sizeof(vir)) == -1) {
500 			log_debug("%s: GET_INFO_VM failed for vm %d, removing",
501 			    __func__, vm->vm_vmid);
502 			vm_remove(vm, __func__);
503 			return (-1);
504 		}
505 		break;
506 	case IMSG_VMDOP_GET_INFO_VM_END_DATA:
507 		/*
508 		 * PROC_VMM has responded with the *running* VMs, now we
509 		 * append the others. These use the special value 0 for their
510 		 * kernel id to indicate that they are not running.
511 		 */
512 		TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
513 			if (!(vm->vm_state & VM_STATE_RUNNING)) {
514 				memset(&vir, 0, sizeof(vir));
515 				vir.vir_info.vir_id = vm->vm_vmid;
516 				strlcpy(vir.vir_info.vir_name,
517 				    vm->vm_params.vmc_params.vcp_name,
518 				    VMM_MAX_NAME_LEN);
519 				vir.vir_info.vir_memory_size =
520 				    vm->vm_params.vmc_params.
521 				    vcp_memranges[0].vmr_size;
522 				vir.vir_info.vir_ncpus =
523 				    vm->vm_params.vmc_params.vcp_ncpus;
524 				/* get the configured user id for this vm */
525 				vir.vir_uid = vm->vm_params.vmc_owner.uid;
526 				vir.vir_gid = vm->vm_params.vmc_owner.gid;
527 				log_debug("%s: vm: %d, vm_state: 0x%x",
528 				    __func__, vm->vm_vmid, vm->vm_state);
529 				vir.vir_state = vm->vm_state;
530 				if (proc_compose_imsg(ps, PROC_CONTROL, -1,
531 				    IMSG_VMDOP_GET_INFO_VM_DATA,
532 				    imsg->hdr.peerid, -1, &vir,
533 				    sizeof(vir)) == -1) {
534 					log_debug("%s: GET_INFO_VM_END failed",
535 					    __func__);
536 					vm_remove(vm, __func__);
537 					return (-1);
538 				}
539 			}
540 		}
541 		IMSG_SIZE_CHECK(imsg, &res);
542 		proc_forward_imsg(ps, imsg, PROC_CONTROL, -1);
543 		break;
544 	default:
545 		return (-1);
546 	}
547 
548 	return (0);
549 }
550 
551 int
552 vmd_check_vmh(struct vm_dump_header *vmh)
553 {
554 	int i;
555 	unsigned int code, leaf;
556 	unsigned int a, b, c, d;
557 
558 	if (strncmp(vmh->vmh_signature, VM_DUMP_SIGNATURE, strlen(VM_DUMP_SIGNATURE)) != 0) {
559 		log_warnx("%s: incompatible dump signature", __func__);
560 		return (-1);
561 	}
562 
563 	if (vmh->vmh_version != VM_DUMP_VERSION) {
564 		log_warnx("%s: incompatible dump version", __func__);
565 		return (-1);
566 	}
567 
568 	for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
569 		code = vmh->vmh_cpuids[i].code;
570 		leaf = vmh->vmh_cpuids[i].leaf;
571 		if (leaf != 0x00) {
572 			log_debug("%s: invalid leaf 0x%x for code 0x%x",
573 			    __func__, leaf, code);
574 			return (-1);
575 		}
576 
577 		switch (code) {
578 		case 0x00:
579 			CPUID_LEAF(code, leaf, a, b, c, d);
580 			if (vmh->vmh_cpuids[i].a > a) {
581 				log_debug("%s: incompatible cpuid level",
582 				    __func__);
583 				return (-1);
584 			}
585 			if (!(vmh->vmh_cpuids[i].b == b &&
586 			    vmh->vmh_cpuids[i].c == c &&
587 			    vmh->vmh_cpuids[i].d == d)) {
588 				log_debug("%s: incompatible cpu brand",
589 				    __func__);
590 				return (-1);
591 			}
592 			break;
593 
594 		case 0x01:
595 			CPUID_LEAF(code, leaf, a, b, c, d);
596 			if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) !=
597 			    (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) {
598 				log_debug("%s: incompatible cpu features "
599 				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
600 				    code, leaf);
601 				return (-1);
602 			}
603 			if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) !=
604 			    (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) {
605 				log_debug("%s: incompatible cpu features "
606 				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
607 				    code, leaf);
608 				return (-1);
609 			}
610 			break;
611 
612 		case 0x07:
613 			CPUID_LEAF(code, leaf, a, b, c, d);
614 			if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) !=
615 			    (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) {
616 				log_debug("%s: incompatible cpu features "
617 				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
618 				    code, leaf);
619 				return (-1);
620 			}
621 			if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) !=
622 			    (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) {
623 				log_debug("%s: incompatible cpu features "
624 				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
625 				    code, leaf);
626 				return (-1);
627 			}
628 			break;
629 
630 		case 0x0d:
631 			CPUID_LEAF(code, leaf, a, b, c, d);
632 			if (vmh->vmh_cpuids[i].b > b) {
633 				log_debug("%s: incompatible cpu: insufficient "
634 				    "max save area for enabled XCR0 features",
635 				    __func__);
636 				return (-1);
637 			}
638 			if (vmh->vmh_cpuids[i].c > c) {
639 				log_debug("%s: incompatible cpu: insufficient "
640 				    "max save area for supported XCR0 features",
641 				    __func__);
642 				return (-1);
643 			}
644 			break;
645 
646 		case 0x80000001:
647 			CPUID_LEAF(code, leaf, a, b, c, d);
648 			if ((vmh->vmh_cpuids[i].a & a) !=
649 			    vmh->vmh_cpuids[i].a) {
650 				log_debug("%s: incompatible cpu features "
651 				    "code: 0x%x leaf: 0x%x  reg: a", __func__,
652 				    code, leaf);
653 				return (-1);
654 			}
655 			if ((vmh->vmh_cpuids[i].c & c) !=
656 			    vmh->vmh_cpuids[i].c) {
657 				log_debug("%s: incompatible cpu features "
658 				    "code: 0x%x leaf: 0x%x  reg: c", __func__,
659 				    code, leaf);
660 				return (-1);
661 			}
662 			if ((vmh->vmh_cpuids[i].d & d) !=
663 			    vmh->vmh_cpuids[i].d) {
664 				log_debug("%s: incompatible cpu features "
665 				    "code: 0x%x leaf: 0x%x  reg: d", __func__,
666 				    code, leaf);
667 				return (-1);
668 			}
669 			break;
670 
671 		default:
672 			log_debug("%s: unknown code 0x%x", __func__, code);
673 			return (-1);
674 		}
675 	}
676 
677 	return (0);
678 }
679 
680 void
681 vmd_sighdlr(int sig, short event, void *arg)
682 {
683 	if (privsep_process != PROC_PARENT)
684 		return;
685 	log_debug("%s: handling signal", __func__);
686 
687 	switch (sig) {
688 	case SIGHUP:
689 		log_info("%s: reload requested with SIGHUP", __func__);
690 
691 		/*
692 		 * This is safe because libevent uses async signal handlers
693 		 * that run in the event loop and not in signal context.
694 		 */
695 		(void)vmd_reload(0, NULL);
696 		break;
697 	case SIGPIPE:
698 		log_info("%s: ignoring SIGPIPE", __func__);
699 		break;
700 	case SIGUSR1:
701 		log_info("%s: ignoring SIGUSR1", __func__);
702 		break;
703 	case SIGTERM:
704 	case SIGINT:
705 		vmd_shutdown();
706 		break;
707 	default:
708 		fatalx("unexpected signal");
709 	}
710 }
711 
712 __dead void
713 usage(void)
714 {
715 	extern char *__progname;
716 	fprintf(stderr, "usage: %s [-dnv] [-D macro=value] [-f file]\n",
717 	    __progname);
718 	exit(1);
719 }
720 
/*
 * vmd entry point: parse options, fork the privsep children ("priv",
 * "control", "vmm"), install signal handlers and run the event loop.
 * Only the parent process returns from proc_init() and reaches the
 * event loop below; children exec their own main loops.
 */
int
main(int argc, char **argv)
{
	struct privsep		*ps;
	int			 ch;
	const char		*conffile = VMD_CONF;
	enum privsep_procid	 proc_id = PROC_PARENT;
	int			 proc_instance = 0;
	const char		*errp, *title = NULL;
	int			 argc0 = argc;	/* full argv is re-passed to children */

	log_init(0, LOG_DAEMON);

	if ((env = calloc(1, sizeof(*env))) == NULL)
		fatal("calloc: env");

	while ((ch = getopt(argc, argv, "D:P:I:df:vn")) != -1) {
		switch (ch) {
		case 'D':
			/* Define a config-file macro on the command line. */
			if (cmdline_symset(optarg) < 0)
				log_warnx("could not parse macro definition %s",
				    optarg);
			break;
		case 'd':
			env->vmd_debug = 2;
			break;
		case 'f':
			conffile = optarg;
			break;
		case 'v':
			env->vmd_verbose++;
			break;
		case 'n':
			/* Config test mode: parse and exit. */
			env->vmd_noaction = 1;
			break;
		case 'P':
			/* Internal: re-exec as a specific child process. */
			title = optarg;
			proc_id = proc_getid(procs, nitems(procs), title);
			if (proc_id == PROC_MAX)
				fatalx("invalid process name");
			break;
		case 'I':
			/* Internal: child process instance number. */
			proc_instance = strtonum(optarg, 0,
			    PROC_MAX_INSTANCES, &errp);
			if (errp)
				fatalx("invalid process instance");
			break;
		default:
			usage();
		}
	}

	argc -= optind;
	if (argc > 0)
		usage();

	if (env->vmd_noaction && !env->vmd_debug)
		env->vmd_debug = 1;

	/* check for root privileges */
	if (env->vmd_noaction == 0) {
		if (geteuid())
			fatalx("need root privileges");
	}

	ps = &env->vmd_ps;
	ps->ps_env = env;
	env->vmd_fd = -1;

	if (config_init(env) == -1)
		fatal("failed to initialize configuration");

	if ((ps->ps_pw = getpwnam(VMD_USER)) == NULL)
		fatal("unknown user %s", VMD_USER);

	/* First proc runs as root without pledge but in default chroot */
	proc_priv->p_pw = &proc_privpw; /* initialized to all 0 */
	proc_priv->p_chroot = ps->ps_pw->pw_dir; /* from VMD_USER */

	/* Open /dev/vmm */
	if (env->vmd_noaction == 0) {
		env->vmd_fd = open(VMM_NODE, O_RDWR);
		if (env->vmd_fd == -1)
			fatal("%s", VMM_NODE);
	}

	/* Configure the control socket */
	ps->ps_csock.cs_name = SOCKET_NAME;
	TAILQ_INIT(&ps->ps_rcsocks);

	/* Configuration will be parsed after forking the children */
	env->vmd_conffile = conffile;

	log_init(env->vmd_debug, LOG_DAEMON);
	log_setverbose(env->vmd_verbose);

	if (env->vmd_noaction)
		ps->ps_noaction = 1;
	ps->ps_instance = proc_instance;
	if (title != NULL)
		ps->ps_title[proc_id] = title;

	/* only the parent returns */
	proc_init(ps, procs, nitems(procs), env->vmd_debug, argc0, argv,
	    proc_id);

	log_procinit("parent");
	if (!env->vmd_debug && daemon(0, 0) == -1)
		fatal("can't daemonize");

	if (ps->ps_noaction == 0)
		log_info("startup");

	event_init();

	signal_set(&ps->ps_evsigint, SIGINT, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigterm, SIGTERM, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsighup, SIGHUP, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigpipe, SIGPIPE, vmd_sighdlr, ps);
	signal_set(&ps->ps_evsigusr1, SIGUSR1, vmd_sighdlr, ps);

	signal_add(&ps->ps_evsigint, NULL);
	signal_add(&ps->ps_evsigterm, NULL);
	signal_add(&ps->ps_evsighup, NULL);
	signal_add(&ps->ps_evsigpipe, NULL);
	signal_add(&ps->ps_evsigusr1, NULL);

	/* Connect the parent to the children's imsg channels. */
	if (!env->vmd_noaction)
		proc_connect(ps);

	if (vmd_configure() == -1)
		fatalx("configuration failed");

	event_dispatch();

	log_debug("parent exiting");

	return (0);
}
860 
861 void
862 start_vm_batch(int fd, short type, void *args)
863 {
864 	int		i = 0;
865 	struct vmd_vm	*vm;
866 
867 	log_debug("%s: starting batch of %d vms", __func__,
868 	    env->vmd_cfg.parallelism);
869 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
870 		if (!(vm->vm_state & VM_STATE_WAITING)) {
871 			log_debug("%s: not starting vm %s (disabled)",
872 			    __func__,
873 			    vm->vm_params.vmc_params.vcp_name);
874 			continue;
875 		}
876 		i++;
877 		if (i > env->vmd_cfg.parallelism) {
878 			evtimer_add(&staggered_start_timer,
879 			    &env->vmd_cfg.delay);
880 			break;
881 		}
882 		vm->vm_state &= ~VM_STATE_WAITING;
883 		config_setvm(&env->vmd_ps, vm, -1, vm->vm_params.vmc_owner.uid);
884 	}
885 	log_debug("%s: done starting vms", __func__);
886 }
887 
/*
 * Post-fork configuration of the parent process: open the pty master
 * device, drop to the pledge set below, parse the config file, push
 * the global config to the children, bring up virtual switches and
 * kick off the staggered VM start.  Returns 0 on success, -1 on error.
 */
int
vmd_configure(void)
{
	int			ncpus;
	struct vmd_switch	*vsw;
	int ncpu_mib[] = {CTL_HW, HW_NCPUONLINE};
	size_t ncpus_sz = sizeof(ncpus);

	/* Needed later to allocate VM consoles via openpty(). */
	if ((env->vmd_ptmfd = open(PATH_PTMDEV, O_RDWR|O_CLOEXEC)) == -1)
		fatal("open %s", PATH_PTMDEV);

	/*
	 * pledge in the parent process:
	 * stdio - for malloc and basic I/O including events.
	 * rpath - for reload to open and read the configuration files.
	 * wpath - for opening disk images and tap devices.
	 * tty - for openpty and TIOCUCNTL.
	 * proc - run kill to terminate its children safely.
	 * sendfd - for disks, interfaces and other fds.
	 * recvfd - for send and receive.
	 * getpw - lookup user or group id by name.
	 * chown, fattr - change tty ownership
	 * flock - locking disk files
	 */
	if (pledge("stdio rpath wpath proc tty recvfd sendfd getpw"
	    " chown fattr flock", NULL) == -1)
		fatal("pledge");

	if (parse_config(env->vmd_conffile) == -1) {
		proc_kill(&env->vmd_ps);
		exit(1);
	}

	/* -n: config check only, tear everything down again. */
	if (env->vmd_noaction) {
		fprintf(stderr, "configuration OK\n");
		proc_kill(&env->vmd_ps);
		exit(0);
	}

	/* Send shared global configuration to all children */
	if (config_setconfig(env) == -1)
		return (-1);

	/* Create all configured switches that are not up yet. */
	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
		if (vsw->sw_running)
			continue;
		if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
			log_warn("%s: failed to create switch %s",
			    __func__, vsw->sw_name);
			switch_remove(vsw);
			return (-1);
		}
	}

	/*
	 * Without an explicit "staggered start" in the config, default to
	 * one batch per online CPU with the default delay between batches.
	 */
	if (!(env->vmd_cfg.cfg_flags & VMD_CFG_STAGGERED_START)) {
		env->vmd_cfg.delay.tv_sec = VMD_DEFAULT_STAGGERED_START_DELAY;
		if (sysctl(ncpu_mib, NELEM(ncpu_mib), &ncpus, &ncpus_sz, NULL, 0) == -1)
			ncpus = 1;
		env->vmd_cfg.parallelism = ncpus;
		log_debug("%s: setting staggered start configuration to "
		    "parallelism: %d and delay: %lld",
		    __func__, ncpus, (long long) env->vmd_cfg.delay.tv_sec);
	}

	log_debug("%s: starting vms in staggered fashion", __func__);
	evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
	/* start first batch */
	start_vm_batch(0, 0, NULL);

	return (0);
}
959 
960 int
961 vmd_reload(unsigned int reset, const char *filename)
962 {
963 	struct vmd_vm		*vm, *next_vm;
964 	struct vmd_switch	*vsw;
965 	int			 reload = 0;
966 
967 	/* Switch back to the default config file */
968 	if (filename == NULL || *filename == '\0') {
969 		filename = env->vmd_conffile;
970 		reload = 1;
971 	}
972 
973 	log_debug("%s: level %d config file %s", __func__, reset, filename);
974 
975 	if (reset) {
976 		/* Purge the configuration */
977 		config_purge(env, reset);
978 		config_setreset(env, reset);
979 	} else {
980 		/*
981 		 * Load or reload the configuration.
982 		 *
983 		 * Reloading removes all non-running VMs before processing the
984 		 * config file, whereas loading only adds to the existing list
985 		 * of VMs.
986 		 */
987 
988 		if (reload) {
989 			TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry,
990 			    next_vm) {
991 				if (!(vm->vm_state & VM_STATE_RUNNING)) {
992 					DPRINTF("%s: calling vm_remove",
993 					    __func__);
994 					vm_remove(vm, __func__);
995 				}
996 			}
997 		}
998 
999 		if (parse_config(filename) == -1) {
1000 			log_debug("%s: failed to load config file %s",
1001 			    __func__, filename);
1002 			return (-1);
1003 		}
1004 
1005 		if (reload) {
1006 			/* Update shared global configuration in all children */
1007 			if (config_setconfig(env) == -1)
1008 				return (-1);
1009 		}
1010 
1011 		TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1012 			if (vsw->sw_running)
1013 				continue;
1014 			if (vm_priv_brconfig(&env->vmd_ps, vsw) == -1) {
1015 				log_warn("%s: failed to create switch %s",
1016 				    __func__, vsw->sw_name);
1017 				switch_remove(vsw);
1018 				return (-1);
1019 			}
1020 		}
1021 
1022 		log_debug("%s: starting vms in staggered fashion", __func__);
1023 		evtimer_set(&staggered_start_timer, start_vm_batch, NULL);
1024 		/* start first batch */
1025 		start_vm_batch(0, 0, NULL);
1026 
1027 		}
1028 
1029 	return (0);
1030 }
1031 
1032 void
1033 vmd_shutdown(void)
1034 {
1035 	struct vmd_vm *vm, *vm_next;
1036 
1037 	log_debug("%s: performing shutdown", __func__);
1038 
1039 	TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) {
1040 		vm_remove(vm, __func__);
1041 	}
1042 
1043 	proc_kill(&env->vmd_ps);
1044 	free(env);
1045 
1046 	log_warnx("parent terminating");
1047 	exit(0);
1048 }
1049 
1050 struct vmd_vm *
1051 vm_getbyvmid(uint32_t vmid)
1052 {
1053 	struct vmd_vm	*vm;
1054 
1055 	if (vmid == 0)
1056 		return (NULL);
1057 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1058 		if (vm->vm_vmid == vmid)
1059 			return (vm);
1060 	}
1061 
1062 	return (NULL);
1063 }
1064 
1065 struct vmd_vm *
1066 vm_getbyid(uint32_t id)
1067 {
1068 	struct vmd_vm	*vm;
1069 
1070 	if (id == 0)
1071 		return (NULL);
1072 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1073 		if (vm->vm_params.vmc_params.vcp_id == id)
1074 			return (vm);
1075 	}
1076 
1077 	return (NULL);
1078 }
1079 
1080 uint32_t
1081 vm_id2vmid(uint32_t id, struct vmd_vm *vm)
1082 {
1083 	if (vm == NULL && (vm = vm_getbyid(id)) == NULL)
1084 		return (0);
1085 	DPRINTF("%s: vmm id %u is vmid %u", __func__,
1086 	    id, vm->vm_vmid);
1087 	return (vm->vm_vmid);
1088 }
1089 
1090 uint32_t
1091 vm_vmid2id(uint32_t vmid, struct vmd_vm *vm)
1092 {
1093 	if (vm == NULL && (vm = vm_getbyvmid(vmid)) == NULL)
1094 		return (0);
1095 	DPRINTF("%s: vmid %u is vmm id %u", __func__,
1096 	    vmid, vm->vm_params.vmc_params.vcp_id);
1097 	return (vm->vm_params.vmc_params.vcp_id);
1098 }
1099 
1100 struct vmd_vm *
1101 vm_getbyname(const char *name)
1102 {
1103 	struct vmd_vm	*vm;
1104 
1105 	if (name == NULL)
1106 		return (NULL);
1107 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1108 		if (strcmp(vm->vm_params.vmc_params.vcp_name, name) == 0)
1109 			return (vm);
1110 	}
1111 
1112 	return (NULL);
1113 }
1114 
1115 struct vmd_vm *
1116 vm_getbypid(pid_t pid)
1117 {
1118 	struct vmd_vm	*vm;
1119 
1120 	TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
1121 		if (vm->vm_pid == pid)
1122 			return (vm);
1123 	}
1124 
1125 	return (NULL);
1126 }
1127 
/*
 * vm_stop
 *
 * Stops a VM and releases the resources it holds in this process:
 * clears the running/shutdown state, drops the per-user accounting and
 * a user reference, and closes the imsg channel, disk, network
 * interface, kernel and cdrom file descriptors.  Unless 'keeptty' is
 * set, the VM's tty is closed and its owner uid cleared as well.
 *
 * Parameters:
 *  vm: the VM to stop; NULL is a no-op
 *  keeptty: if non-zero, keep the tty open (e.g. across a restart)
 *  caller: name of the calling function, used for debug logging
 */
void
vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
{
	struct privsep	*ps = &env->vmd_ps;
	unsigned int	 i, j;

	if (vm == NULL)
		return;

	log_debug("%s: %s %s stopping vm %d%s",
	    __func__, ps->ps_title[privsep_process], caller,
	    vm->vm_vmid, keeptty ? ", keeping tty open" : "");

	vm->vm_state &= ~(VM_STATE_RUNNING | VM_STATE_SHUTDOWN);

	/* Subtract the VM's resources from the user and drop a reference. */
	user_inc(&vm->vm_params.vmc_params, vm->vm_user, 0);
	user_put(vm->vm_user);

	/* Close the imsg channel to the VM process, if one was set up. */
	if (vm->vm_iev.ibuf.fd != -1) {
		event_del(&vm->vm_iev.ev);
		close(vm->vm_iev.ibuf.fd);
	}
	/* Close every disk image fd, including base image layers. */
	for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) {
		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
			if (vm->vm_disks[i][j] != -1) {
				close(vm->vm_disks[i][j]);
				vm->vm_disks[i][j] = -1;
			}
		}
	}
	/* Close interface fds and release the per-interface strings. */
	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) {
		if (vm->vm_ifs[i].vif_fd != -1) {
			close(vm->vm_ifs[i].vif_fd);
			vm->vm_ifs[i].vif_fd = -1;
		}
		free(vm->vm_ifs[i].vif_name);
		free(vm->vm_ifs[i].vif_switch);
		free(vm->vm_ifs[i].vif_group);
		vm->vm_ifs[i].vif_name = NULL;
		vm->vm_ifs[i].vif_switch = NULL;
		vm->vm_ifs[i].vif_group = NULL;
	}
	if (vm->vm_kernel != -1) {
		close(vm->vm_kernel);
		vm->vm_kernel = -1;
	}
	if (vm->vm_cdrom != -1) {
		close(vm->vm_cdrom);
		vm->vm_cdrom = -1;
	}
	if (!keeptty) {
		vm_closetty(vm);
		vm->vm_uid = 0;
	}
}
1183 
/*
 * vm_remove
 *
 * Removes a VM from the running configuration: unlinks it from the
 * global VM list, drops a user reference, stops it via vm_stop() and
 * frees the structure.  NULL is a no-op.
 *
 * NOTE(review): user_put() is called both here and inside vm_stop();
 * presumably the VM holds two user references — verify against
 * user_get() call sites before changing.
 */
void
vm_remove(struct vmd_vm *vm, const char *caller)
{
	struct privsep	*ps = &env->vmd_ps;

	if (vm == NULL)
		return;

	log_debug("%s: %s %s removing vm %d from running config",
	    __func__, ps->ps_title[privsep_process], caller,
	    vm->vm_vmid);

	TAILQ_REMOVE(env->vmd_vms, vm, vm_entry);

	user_put(vm->vm_user);
	vm_stop(vm, 0, caller);
	free(vm);
}
1202 
1203 int
1204 vm_claimid(const char *name, int uid, uint32_t *id)
1205 {
1206 	struct name2id *n2i = NULL;
1207 
1208 	TAILQ_FOREACH(n2i, env->vmd_known, entry)
1209 		if (strcmp(n2i->name, name) == 0 && n2i->uid == uid)
1210 			goto out;
1211 
1212 	if (++env->vmd_nvm == 0) {
1213 		log_warnx("too many vms");
1214 		return -1;
1215 	}
1216 	if ((n2i = calloc(1, sizeof(struct name2id))) == NULL) {
1217 		log_warnx("could not alloc vm name");
1218 		return -1;
1219 	}
1220 	n2i->id = env->vmd_nvm;
1221 	n2i->uid = uid;
1222 	if (strlcpy(n2i->name, name, sizeof(n2i->name)) >= sizeof(n2i->name)) {
1223 		log_warnx("vm name too long");
1224 		free(n2i);
1225 		return -1;
1226 	}
1227 	TAILQ_INSERT_TAIL(env->vmd_known, n2i, entry);
1228 
1229 out:
1230 	*id = n2i->id;
1231 	return 0;
1232 }
1233 
1234 int
1235 vm_register(struct privsep *ps, struct vmop_create_params *vmc,
1236     struct vmd_vm **ret_vm, uint32_t id, uid_t uid)
1237 {
1238 	struct vmd_vm		*vm = NULL, *vm_parent = NULL;
1239 	struct vm_create_params	*vcp = &vmc->vmc_params;
1240 	struct vmop_owner	*vmo = NULL;
1241 	struct vmd_user		*usr = NULL;
1242 	uint32_t		 nid, rng;
1243 	unsigned int		 i, j;
1244 	struct vmd_switch	*sw;
1245 	char			*s;
1246 
1247 	/* Check if this is an instance of another VM */
1248 	if (vm_instance(ps, &vm_parent, vmc, uid) == -1)
1249 		return (-1);
1250 
1251 	errno = 0;
1252 	*ret_vm = NULL;
1253 
1254 	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
1255 	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
1256 		if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
1257 		    uid) != 0) {
1258 			errno = EPERM;
1259 			goto fail;
1260 		}
1261 		*ret_vm = vm;
1262 		errno = EALREADY;
1263 		goto fail;
1264 	}
1265 
1266 	if (vm_parent != NULL)
1267 		vmo = &vm_parent->vm_params.vmc_insowner;
1268 
1269 	/* non-root users can only start existing VMs or instances */
1270 	if (vm_checkperm(NULL, vmo, uid) != 0) {
1271 		log_warnx("permission denied");
1272 		errno = EPERM;
1273 		goto fail;
1274 	}
1275 	if (vmc->vmc_flags == 0) {
1276 		log_warnx("invalid configuration, no devices");
1277 		errno = VMD_DISK_MISSING;
1278 		goto fail;
1279 	}
1280 	if (vcp->vcp_ncpus == 0)
1281 		vcp->vcp_ncpus = 1;
1282 	if (vcp->vcp_memranges[0].vmr_size == 0)
1283 		vcp->vcp_memranges[0].vmr_size = VM_DEFAULT_MEMORY;
1284 	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) {
1285 		log_warnx("invalid number of CPUs");
1286 		goto fail;
1287 	} else if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) {
1288 		log_warnx("invalid number of disks");
1289 		goto fail;
1290 	} else if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) {
1291 		log_warnx("invalid number of interfaces");
1292 		goto fail;
1293 	} else if (strlen(vcp->vcp_kernel) == 0 &&
1294 	    vcp->vcp_ndisks == 0 && strlen(vcp->vcp_cdrom) == 0) {
1295 		log_warnx("no kernel or disk/cdrom specified");
1296 		goto fail;
1297 	} else if (strlen(vcp->vcp_name) == 0) {
1298 		log_warnx("invalid VM name");
1299 		goto fail;
1300 	} else if (*vcp->vcp_name == '-' || *vcp->vcp_name == '.' ||
1301 	    *vcp->vcp_name == '_') {
1302 		log_warnx("invalid VM name");
1303 		goto fail;
1304 	} else {
1305 		for (s = vcp->vcp_name; *s != '\0'; ++s) {
1306 			if (!(isalnum(*s) || *s == '.' || *s == '-' ||
1307 			    *s == '_')) {
1308 				log_warnx("invalid VM name");
1309 				goto fail;
1310 			}
1311 		}
1312 	}
1313 
1314 	/* track active users */
1315 	if (uid != 0 && env->vmd_users != NULL &&
1316 	    (usr = user_get(uid)) == NULL) {
1317 		log_warnx("could not add user");
1318 		goto fail;
1319 	}
1320 
1321 	if ((vm = calloc(1, sizeof(*vm))) == NULL)
1322 		goto fail;
1323 
1324 	memcpy(&vm->vm_params, vmc, sizeof(vm->vm_params));
1325 	vmc = &vm->vm_params;
1326 	vcp = &vmc->vmc_params;
1327 	vm->vm_pid = -1;
1328 	vm->vm_tty = -1;
1329 	vm->vm_receive_fd = -1;
1330 	vm->vm_state &= ~VM_STATE_PAUSED;
1331 	vm->vm_user = usr;
1332 
1333 	for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
1334 		for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
1335 			vm->vm_disks[i][j] = -1;
1336 	for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
1337 		vm->vm_ifs[i].vif_fd = -1;
1338 	for (i = 0; i < vcp->vcp_nnics; i++) {
1339 		if ((sw = switch_getbyname(vmc->vmc_ifswitch[i])) != NULL) {
1340 			/* inherit per-interface flags from the switch */
1341 			vmc->vmc_ifflags[i] |= (sw->sw_flags & VMIFF_OPTMASK);
1342 		}
1343 
1344 		/*
1345 		 * If the MAC address is zero, always randomize it in vmd(8)
1346 		 * because we cannot rely on the guest OS to do the right
1347 		 * thing like OpenBSD does.  Based on ether_fakeaddr()
1348 		 * from the kernel, incremented by one to differentiate
1349 		 * the source.
1350 		 */
1351 		if (memcmp(zero_mac, &vcp->vcp_macs[i], ETHER_ADDR_LEN) == 0) {
1352 			rng = arc4random();
1353 			vcp->vcp_macs[i][0] = 0xfe;
1354 			vcp->vcp_macs[i][1] = 0xe1;
1355 			vcp->vcp_macs[i][2] = 0xba + 1;
1356 			vcp->vcp_macs[i][3] = 0xd0 | ((i + 1) & 0xf);
1357 			vcp->vcp_macs[i][4] = rng;
1358 			vcp->vcp_macs[i][5] = rng >> 8;
1359 		}
1360 	}
1361 	vm->vm_kernel = -1;
1362 	vm->vm_cdrom = -1;
1363 	vm->vm_iev.ibuf.fd = -1;
1364 
1365 	/*
1366 	 * Assign a new internal Id if not specified and we succeed in
1367 	 * claiming a new Id.
1368 	 */
1369 	if (id != 0)
1370 		vm->vm_vmid = id;
1371 	else if (vm_claimid(vcp->vcp_name, uid, &nid) == -1)
1372 		goto fail;
1373 	else
1374 		vm->vm_vmid = nid;
1375 
1376 	log_debug("%s: registering vm %d", __func__, vm->vm_vmid);
1377 	TAILQ_INSERT_TAIL(env->vmd_vms, vm, vm_entry);
1378 
1379 	*ret_vm = vm;
1380 	return (0);
1381  fail:
1382 	if (errno == 0)
1383 		errno = EINVAL;
1384 	return (-1);
1385 }
1386 
1387 int
1388 vm_instance(struct privsep *ps, struct vmd_vm **vm_parent,
1389     struct vmop_create_params *vmc, uid_t uid)
1390 {
1391 	char			*name;
1392 	struct vm_create_params	*vcp = &vmc->vmc_params;
1393 	struct vmop_create_params *vmcp;
1394 	struct vm_create_params	*vcpp;
1395 	struct vmd_vm		*vm = NULL;
1396 	unsigned int		 i, j;
1397 	uint32_t		 id;
1398 
1399 	/* return without error if the parent is NULL (nothing to inherit) */
1400 	if ((vmc->vmc_flags & VMOP_CREATE_INSTANCE) == 0 ||
1401 	    vmc->vmc_instance[0] == '\0')
1402 		return (0);
1403 
1404 	if ((*vm_parent = vm_getbyname(vmc->vmc_instance)) == NULL) {
1405 		errno = VMD_PARENT_INVALID;
1406 		return (-1);
1407 	}
1408 
1409 	errno = 0;
1410 	vmcp = &(*vm_parent)->vm_params;
1411 	vcpp = &vmcp->vmc_params;
1412 
1413 	/* Are we allowed to create an instance from this VM? */
1414 	if (vm_checkperm(NULL, &vmcp->vmc_insowner, uid) != 0) {
1415 		log_warnx("vm \"%s\" no permission to create vm instance",
1416 		    vcpp->vcp_name);
1417 		errno = ENAMETOOLONG;
1418 		return (-1);
1419 	}
1420 
1421 	id = vcp->vcp_id;
1422 	name = vcp->vcp_name;
1423 
1424 	if ((vm = vm_getbyname(vcp->vcp_name)) != NULL ||
1425 	    (vm = vm_getbyvmid(vcp->vcp_id)) != NULL) {
1426 		errno = EPROCLIM;
1427 		return (-1);
1428 	}
1429 
1430 	/* CPU */
1431 	if (vcp->vcp_ncpus == 0)
1432 		vcp->vcp_ncpus = vcpp->vcp_ncpus;
1433 	if (vm_checkinsflag(vmcp, VMOP_CREATE_CPU, uid) != 0 &&
1434 	    vcp->vcp_ncpus != vcpp->vcp_ncpus) {
1435 		log_warnx("vm \"%s\" no permission to set cpus", name);
1436 		errno = EPERM;
1437 		return (-1);
1438 	}
1439 
1440 	/* memory */
1441 	if (vcp->vcp_memranges[0].vmr_size == 0)
1442 		vcp->vcp_memranges[0].vmr_size =
1443 		    vcpp->vcp_memranges[0].vmr_size;
1444 	if (vm_checkinsflag(vmcp, VMOP_CREATE_MEMORY, uid) != 0 &&
1445 	    vcp->vcp_memranges[0].vmr_size !=
1446 	    vcpp->vcp_memranges[0].vmr_size) {
1447 		log_warnx("vm \"%s\" no permission to set memory", name);
1448 		errno = EPERM;
1449 		return (-1);
1450 	}
1451 
1452 	/* disks cannot be inherited */
1453 	if (vm_checkinsflag(vmcp, VMOP_CREATE_DISK, uid) != 0 &&
1454 	    vcp->vcp_ndisks) {
1455 		log_warnx("vm \"%s\" no permission to set disks", name);
1456 		errno = EPERM;
1457 		return (-1);
1458 	}
1459 	for (i = 0; i < vcp->vcp_ndisks; i++) {
1460 		/* Check if this disk is already used in the parent */
1461 		for (j = 0; j < vcpp->vcp_ndisks; j++) {
1462 			if (strcmp(vcp->vcp_disks[i],
1463 			    vcpp->vcp_disks[j]) == 0) {
1464 				log_warnx("vm \"%s\" disk %s cannot be reused",
1465 				    name, vcp->vcp_disks[i]);
1466 				errno = EBUSY;
1467 				return (-1);
1468 			}
1469 		}
1470 		vmc->vmc_checkaccess |= VMOP_CREATE_DISK;
1471 	}
1472 
1473 	/* interfaces */
1474 	if (vcp->vcp_nnics > 0 &&
1475 	    vm_checkinsflag(vmcp, VMOP_CREATE_NETWORK, uid) != 0 &&
1476 	    vcp->vcp_nnics != vcpp->vcp_nnics) {
1477 		log_warnx("vm \"%s\" no permission to set interfaces", name);
1478 		errno = EPERM;
1479 		return (-1);
1480 	}
1481 	for (i = 0; i < vcpp->vcp_nnics; i++) {
1482 		/* Interface got overwritten */
1483 		if (i < vcp->vcp_nnics)
1484 			continue;
1485 
1486 		/* Copy interface from parent */
1487 		vmc->vmc_ifflags[i] = vmcp->vmc_ifflags[i];
1488 		(void)strlcpy(vmc->vmc_ifnames[i], vmcp->vmc_ifnames[i],
1489 		    sizeof(vmc->vmc_ifnames[i]));
1490 		(void)strlcpy(vmc->vmc_ifswitch[i], vmcp->vmc_ifswitch[i],
1491 		    sizeof(vmc->vmc_ifswitch[i]));
1492 		(void)strlcpy(vmc->vmc_ifgroup[i], vmcp->vmc_ifgroup[i],
1493 		    sizeof(vmc->vmc_ifgroup[i]));
1494 		memcpy(vcp->vcp_macs[i], vcpp->vcp_macs[i],
1495 		    sizeof(vcp->vcp_macs[i]));
1496 		vmc->vmc_ifrdomain[i] = vmcp->vmc_ifrdomain[i];
1497 		vcp->vcp_nnics++;
1498 	}
1499 	for (i = 0; i < vcp->vcp_nnics; i++) {
1500 		for (j = 0; j < vcpp->vcp_nnics; j++) {
1501 			if (memcmp(zero_mac, vcp->vcp_macs[i],
1502 			    sizeof(vcp->vcp_macs[i])) != 0 &&
1503 			    memcmp(vcpp->vcp_macs[i], vcp->vcp_macs[i],
1504 			    sizeof(vcp->vcp_macs[i])) != 0) {
1505 				log_warnx("vm \"%s\" lladdr cannot be reused",
1506 				    name);
1507 				errno = EBUSY;
1508 				return (-1);
1509 			}
1510 			if (strlen(vmc->vmc_ifnames[i]) &&
1511 			    strcmp(vmc->vmc_ifnames[i],
1512 			    vmcp->vmc_ifnames[j]) == 0) {
1513 				log_warnx("vm \"%s\" %s cannot be reused",
1514 				    vmc->vmc_ifnames[i], name);
1515 				errno = EBUSY;
1516 				return (-1);
1517 			}
1518 		}
1519 	}
1520 
1521 	/* kernel */
1522 	if (strlen(vcp->vcp_kernel) > 0) {
1523 		if (vm_checkinsflag(vmcp, VMOP_CREATE_KERNEL, uid) != 0) {
1524 			log_warnx("vm \"%s\" no permission to set boot image",
1525 			    name);
1526 			errno = EPERM;
1527 			return (-1);
1528 		}
1529 		vmc->vmc_checkaccess |= VMOP_CREATE_KERNEL;
1530 	} else if (strlcpy(vcp->vcp_kernel, vcpp->vcp_kernel,
1531 	    sizeof(vcp->vcp_kernel)) >= sizeof(vcp->vcp_kernel)) {
1532 		log_warnx("vm \"%s\" kernel name too long", name);
1533 		errno = EINVAL;
1534 		return (-1);
1535 	}
1536 
1537 	/* cdrom */
1538 	if (strlen(vcp->vcp_cdrom) > 0) {
1539 		if (vm_checkinsflag(vmcp, VMOP_CREATE_CDROM, uid) != 0) {
1540 			log_warnx("vm \"%s\" no permission to set cdrom", name);
1541 			errno = EPERM;
1542 			return (-1);
1543 		}
1544 		vmc->vmc_checkaccess |= VMOP_CREATE_CDROM;
1545 	} else if (strlcpy(vcp->vcp_cdrom, vcpp->vcp_cdrom,
1546 	    sizeof(vcp->vcp_cdrom)) >= sizeof(vcp->vcp_cdrom)) {
1547 		log_warnx("vm \"%s\" cdrom name too long", name);
1548 		errno = EINVAL;
1549 		return (-1);
1550 	}
1551 
1552 	/* user */
1553 	if (vmc->vmc_owner.uid == 0)
1554 		vmc->vmc_owner.uid = vmcp->vmc_owner.uid;
1555 	else if (vmc->vmc_owner.uid != uid &&
1556 	    vmc->vmc_owner.uid != vmcp->vmc_owner.uid) {
1557 		log_warnx("vm \"%s\" user mismatch", name);
1558 		errno = EPERM;
1559 		return (-1);
1560 	}
1561 
1562 	/* group */
1563 	if (vmc->vmc_owner.gid == 0)
1564 		vmc->vmc_owner.gid = vmcp->vmc_owner.gid;
1565 	else if (vmc->vmc_owner.gid != vmcp->vmc_owner.gid) {
1566 		log_warnx("vm \"%s\" group mismatch", name);
1567 		errno = EPERM;
1568 		return (-1);
1569 	}
1570 
1571 	/* child instances */
1572 	if (vmc->vmc_insflags) {
1573 		log_warnx("vm \"%s\" cannot change instance permissions", name);
1574 		errno = EPERM;
1575 		return (-1);
1576 	}
1577 	if (vmcp->vmc_insflags & VMOP_CREATE_INSTANCE) {
1578 		vmc->vmc_insowner.gid = vmcp->vmc_insowner.gid;
1579 		vmc->vmc_insowner.uid = vmcp->vmc_insowner.gid;
1580 		vmc->vmc_insflags = vmcp->vmc_insflags;
1581 	} else {
1582 		vmc->vmc_insowner.gid = 0;
1583 		vmc->vmc_insowner.uid = 0;
1584 		vmc->vmc_insflags = 0;
1585 	}
1586 
1587 	/* finished, remove instance flags */
1588 	vmc->vmc_flags &= ~VMOP_CREATE_INSTANCE;
1589 
1590 	return (0);
1591 }
1592 
1593 /*
1594  * vm_checkperm
1595  *
1596  * Checks if the user represented by the 'uid' parameter is allowed to
1597  * manipulate the VM described by the 'vm' parameter (or connect to said VM's
1598  * console.)
1599  *
1600  * Parameters:
1601  *  vm: the VM whose permission is to be checked
1602  *  vmo: the required uid/gid to be checked
1603  *  uid: the user ID of the user making the request
1604  *
1605  * Return values:
1606  *   0: the permission should be granted
1607  *  -1: the permission check failed (also returned if vm == null)
1608  */
1609 int
1610 vm_checkperm(struct vmd_vm *vm, struct vmop_owner *vmo, uid_t uid)
1611 {
1612 	struct group	*gr;
1613 	struct passwd	*pw;
1614 	char		**grmem;
1615 
1616 	/* root has no restrictions */
1617 	if (uid == 0)
1618 		return (0);
1619 
1620 	if (vmo == NULL)
1621 		return (-1);
1622 
1623 	/* check user */
1624 	if (vm == NULL) {
1625 		if  (vmo->uid == uid)
1626 			return (0);
1627 	} else {
1628 		/*
1629 		 * check user of running vm (the owner of a running vm can
1630 		 * be different to (or more specific than) the configured owner.
1631 		 */
1632 		if (((vm->vm_state & VM_STATE_RUNNING) && vm->vm_uid == uid) ||
1633 		    (!(vm->vm_state & VM_STATE_RUNNING) && vmo->uid == uid))
1634 			return (0);
1635 	}
1636 
1637 	/* check groups */
1638 	if (vmo->gid != -1) {
1639 		if ((pw = getpwuid(uid)) == NULL)
1640 			return (-1);
1641 		if (pw->pw_gid == vmo->gid)
1642 			return (0);
1643 		if ((gr = getgrgid(vmo->gid)) != NULL) {
1644 			for (grmem = gr->gr_mem; *grmem; grmem++)
1645 				if (strcmp(*grmem, pw->pw_name) == 0)
1646 					return (0);
1647 		}
1648 	}
1649 
1650 	return (-1);
1651 }
1652 
1653 /*
1654  * vm_checkinsflag
1655  *
1656  * Checks wheter the non-root user is allowed to set an instance option.
1657  *
1658  * Parameters:
1659  *  vmc: the VM create parameters
1660  *  flag: the flag to be checked
1661  *  uid: the user ID of the user making the request
1662  *
1663  * Return values:
1664  *   0: the permission should be granted
1665  *  -1: the permission check failed (also returned if vm == null)
1666  */
1667 int
1668 vm_checkinsflag(struct vmop_create_params *vmc, unsigned int flag, uid_t uid)
1669 {
1670 	/* root has no restrictions */
1671 	if (uid == 0)
1672 		return (0);
1673 
1674 	if ((vmc->vmc_insflags & flag) == 0)
1675 		return (-1);
1676 
1677 	return (0);
1678 }
1679 
/*
 * vm_checkaccess
 *
 * Checks if the user represented by the 'uid' parameter is allowed to
 * access the already-opened file behind 'fd', evaluating world, owner
 * and group permission bits in that order (like the kernel would).
 *
 * Parameters:
 *  fd: the file descriptor of the opened file
 *  uflag: check if the userid has access to the file
 *  uid: the user ID of the user making the request
 *  amode: the access flags of R_OK and W_OK
 *
 * Return values:
 *   0: the permission should be granted
 *  -1: the permission check failed
 */
int
vm_checkaccess(int fd, unsigned int uflag, uid_t uid, int amode)
{
	struct stat	 st;
	struct passwd	*pw;
	struct group	*gr;
	char		**member;
	mode_t		 needed;

	if (fd == -1)
		return (-1);

	/* Only regular files we can stat are eligible. */
	if (fstat(fd, &st) == -1)
		return (-1);
	if (!S_ISREG(st.st_mode))
		return (-1);

	/* root has no restrictions; uflag == 0 disables the check */
	if (uid == 0 || uflag == 0)
		return (0);

	/* world permissions */
	needed = 0;
	if (amode & W_OK)
		needed |= S_IWOTH;
	if (amode & R_OK)
		needed |= S_IROTH;
	if ((st.st_mode & needed) == needed)
		return (0);

	/* owner permissions */
	needed = 0;
	if (amode & W_OK)
		needed |= S_IWUSR;
	if (amode & R_OK)
		needed |= S_IRUSR;
	if (st.st_uid == uid && (st.st_mode & needed) == needed)
		return (0);

	/* group permissions, including supplementary group membership */
	needed = 0;
	if (amode & W_OK)
		needed |= S_IWGRP;
	if (amode & R_OK)
		needed |= S_IRGRP;
	if ((st.st_mode & needed) != needed)
		return (-1);
	if ((pw = getpwuid(uid)) == NULL)
		return (-1);
	if (pw->pw_gid == st.st_gid)
		return (0);
	if ((gr = getgrgid(st.st_gid)) != NULL)
		for (member = gr->gr_mem; *member != NULL; member++)
			if (strcmp(*member, pw->pw_name) == 0)
				return (0);

	return (-1);
}
1747 
/*
 * vm_opentty
 *
 * Allocates the control tty for a VM console: gets a pty pair from the
 * pre-opened PTM device, enables user ioctl(2) mode on the control side
 * (used to pass break commands), then adjusts owner, group and mode of
 * the slave tty depending on the VM owner, loosely based on sshpty.c.
 *
 * Return values:
 *   0: success; vm->vm_tty and vm->vm_ttyname are set
 *  -1: failure; partially set up tty state is released via vm_closetty()
 */
int
vm_opentty(struct vmd_vm *vm)
{
	struct ptmget		 ptm;
	struct stat		 st;
	struct group		*gr;
	uid_t			 uid;
	gid_t			 gid;
	mode_t			 mode;
	int			 on;

	/*
	 * Open tty with pre-opened PTM fd
	 */
	if ((ioctl(env->vmd_ptmfd, PTMGET, &ptm) == -1))
		return (-1);

	/*
	 * We use user ioctl(2) mode to pass break commands.
	 */
	on = 1;
	if (ioctl(ptm.cfd, TIOCUCNTL, &on) == -1)
		fatal("could not enable user ioctl mode");

	/* Keep the control fd; the slave side fd is closed here. */
	vm->vm_tty = ptm.cfd;
	close(ptm.sfd);
	if ((vm->vm_ttyname = strdup(ptm.sn)) == NULL)
		goto fail;

	uid = vm->vm_uid;
	gid = vm->vm_params.vmc_owner.gid;

	/* Pick group and mode: owner group, the "tty" group, or uid-only. */
	if (vm->vm_params.vmc_owner.gid != -1) {
		mode = 0660;
	} else if ((gr = getgrnam("tty")) != NULL) {
		gid = gr->gr_gid;
		mode = 0620;
	} else {
		mode = 0600;
		gid = 0;
	}

	log_debug("%s: vm %s tty %s uid %d gid %d mode %o",
	    __func__, vm->vm_params.vmc_params.vcp_name,
	    vm->vm_ttyname, uid, gid, mode);

	/*
	 * Change ownership and mode of the tty as required.
	 * Loosely based on the implementation of sshpty.c
	 */
	if (stat(vm->vm_ttyname, &st) == -1)
		goto fail;

	if (st.st_uid != uid || st.st_gid != gid) {
		if (chown(vm->vm_ttyname, uid, gid) == -1) {
			log_warn("chown %s %d %d failed, uid %d",
			    vm->vm_ttyname, uid, gid, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	if ((st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO)) != mode) {
		if (chmod(vm->vm_ttyname, mode) == -1) {
			log_warn("chmod %s %o failed, uid %d",
			    vm->vm_ttyname, mode, getuid());

			/* Ignore failure on read-only filesystems */
			if (!((errno == EROFS) &&
			    (st.st_uid == uid || st.st_uid == 0)))
				goto fail;
		}
	}

	return (0);
 fail:
	vm_closetty(vm);
	return (-1);
}
1830 
1831 void
1832 vm_closetty(struct vmd_vm *vm)
1833 {
1834 	if (vm->vm_tty != -1) {
1835 		/* Release and close the tty */
1836 		if (fchown(vm->vm_tty, 0, 0) == -1)
1837 			log_warn("chown %s 0 0 failed", vm->vm_ttyname);
1838 		if (fchmod(vm->vm_tty, 0666) == -1)
1839 			log_warn("chmod %s 0666 failed", vm->vm_ttyname);
1840 		close(vm->vm_tty);
1841 		vm->vm_tty = -1;
1842 	}
1843 	free(vm->vm_ttyname);
1844 	vm->vm_ttyname = NULL;
1845 }
1846 
1847 void
1848 switch_remove(struct vmd_switch *vsw)
1849 {
1850 	if (vsw == NULL)
1851 		return;
1852 
1853 	TAILQ_REMOVE(env->vmd_switches, vsw, sw_entry);
1854 
1855 	free(vsw->sw_group);
1856 	free(vsw->sw_name);
1857 	free(vsw);
1858 }
1859 
1860 struct vmd_switch *
1861 switch_getbyname(const char *name)
1862 {
1863 	struct vmd_switch	*vsw;
1864 
1865 	if (name == NULL)
1866 		return (NULL);
1867 	TAILQ_FOREACH(vsw, env->vmd_switches, sw_entry) {
1868 		if (strcmp(vsw->sw_name, name) == 0)
1869 			return (vsw);
1870 	}
1871 
1872 	return (NULL);
1873 }
1874 
1875 struct vmd_user *
1876 user_get(uid_t uid)
1877 {
1878 	struct vmd_user		*usr;
1879 
1880 	if (uid == 0)
1881 		return (NULL);
1882 
1883 	/* first try to find an existing user */
1884 	TAILQ_FOREACH(usr, env->vmd_users, usr_entry) {
1885 		if (usr->usr_id.uid == uid)
1886 			goto done;
1887 	}
1888 
1889 	if ((usr = calloc(1, sizeof(*usr))) == NULL) {
1890 		log_warn("could not allocate user");
1891 		return (NULL);
1892 	}
1893 
1894 	usr->usr_id.uid = uid;
1895 	usr->usr_id.gid = -1;
1896 	TAILQ_INSERT_TAIL(env->vmd_users, usr, usr_entry);
1897 
1898  done:
1899 	DPRINTF("%s: uid %d #%d +",
1900 	    __func__, usr->usr_id.uid, usr->usr_refcnt + 1);
1901 	usr->usr_refcnt++;
1902 
1903 	return (usr);
1904 }
1905 
1906 void
1907 user_put(struct vmd_user *usr)
1908 {
1909 	if (usr == NULL)
1910 		return;
1911 
1912 	DPRINTF("%s: uid %d #%d -",
1913 	    __func__, usr->usr_id.uid, usr->usr_refcnt - 1);
1914 
1915 	if (--usr->usr_refcnt > 0)
1916 		return;
1917 
1918 	TAILQ_REMOVE(env->vmd_users, usr, usr_entry);
1919 	free(usr);
1920 }
1921 
1922 void
1923 user_inc(struct vm_create_params *vcp, struct vmd_user *usr, int inc)
1924 {
1925 	char	 mem[FMT_SCALED_STRSIZE];
1926 
1927 	if (usr == NULL)
1928 		return;
1929 
1930 	/* increment or decrement counters */
1931 	inc = inc ? 1 : -1;
1932 
1933 	usr->usr_maxcpu += vcp->vcp_ncpus * inc;
1934 	usr->usr_maxmem += vcp->vcp_memranges[0].vmr_size * inc;
1935 	usr->usr_maxifs += vcp->vcp_nnics * inc;
1936 
1937 	if (log_getverbose() > 1) {
1938 		(void)fmt_scaled(usr->usr_maxmem * 1024 * 1024, mem);
1939 		log_debug("%s: %c uid %d ref %d cpu %llu mem %s ifs %llu",
1940 		    __func__, inc == 1 ? '+' : '-',
1941 		    usr->usr_id.uid, usr->usr_refcnt,
1942 		    usr->usr_maxcpu, mem, usr->usr_maxifs);
1943 	}
1944 }
1945 
1946 int
1947 user_checklimit(struct vmd_user *usr, struct vm_create_params *vcp)
1948 {
1949 	const char	*limit = "";
1950 
1951 	/* XXX make the limits configurable */
1952 	if (usr->usr_maxcpu > VM_DEFAULT_USER_MAXCPU) {
1953 		limit = "cpu ";
1954 		goto fail;
1955 	}
1956 	if (usr->usr_maxmem > VM_DEFAULT_USER_MAXMEM) {
1957 		limit = "memory ";
1958 		goto fail;
1959 	}
1960 	if (usr->usr_maxifs > VM_DEFAULT_USER_MAXIFS) {
1961 		limit = "interface ";
1962 		goto fail;
1963 	}
1964 
1965 	return (0);
1966 
1967  fail:
1968 	log_warnx("%s: user %d %slimit reached", vcp->vcp_name,
1969 	    usr->usr_id.uid, limit);
1970 	return (-1);
1971 }
1972 
/*
 * get_string
 *
 * Returns a NUL-terminated heap copy of the leading printable bytes of
 * 'ptr' (at most 'len' bytes); the scan stops at the first
 * non-printable byte.  The caller must free the result.  Returns NULL
 * on allocation failure.
 */
char *
get_string(uint8_t *ptr, size_t len)
{
	size_t	 i;
	char	*s;

	for (i = 0; i < len; i++)
		if (!isprint(ptr[i]))
			break;

	/*
	 * Copy explicitly instead of strndup(ptr, i): ptr is uint8_t *,
	 * not char *, and no NUL can occur before index i anyway.
	 */
	if ((s = malloc(i + 1)) == NULL)
		return (NULL);
	memcpy(s, ptr, i);
	s[i] = '\0';

	return (s);
}
1984 
/*
 * Convert an IPv4 prefix length into a netmask in network byte order.
 * Lengths above 32 are clamped to 32; 0 yields an empty mask.
 */
uint32_t
prefixlen2mask(uint8_t prefixlen)
{
	uint32_t hostbits;

	if (prefixlen == 0)
		return (0);
	if (prefixlen > 32)
		prefixlen = 32;

	/* prefixlen is in [1, 32] here, so the shift is at most 31 */
	hostbits = 32 - prefixlen;
	return (htonl(0xffffffffU << hostbits));
}
1996 
/*
 * Convert an IPv6 prefix length into a netmask.  Lengths above 128 are
 * clamped to 128.  The result is built in a local copy and then copied
 * into *mask.
 */
void
prefixlen2mask6(uint8_t prefixlen, struct in6_addr *mask)
{
	struct in6_addr	 tmp;
	int		 bytes, bits;

	if (prefixlen > 128)
		prefixlen = 128;

	memset(&tmp, 0, sizeof(tmp));

	bytes = prefixlen / 8;
	bits = prefixlen % 8;
	/* whole 0xff bytes first, then the partial leading-bits byte */
	memset(tmp.s6_addr, 0xff, bytes);
	if (bits != 0)
		tmp.s6_addr[bytes] = 0xff00 >> bits;

	memcpy(mask, &tmp, sizeof(tmp));
}
2015 
2016 void
2017 getmonotime(struct timeval *tv)
2018 {
2019 	struct timespec	 ts;
2020 
2021 	if (clock_gettime(CLOCK_MONOTONIC, &ts))
2022 		fatal("clock_gettime");
2023 
2024 	TIMESPEC_TO_TIMEVAL(tv, &ts);
2025 }
2026