xref: /freebsd/usr.sbin/bhyve/bhyverun.c (revision 069ac184)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/types.h>
30 #ifndef WITHOUT_CAPSICUM
31 #include <sys/capsicum.h>
32 #endif
33 #include <sys/mman.h>
34 #ifdef BHYVE_SNAPSHOT
35 #include <sys/socket.h>
36 #include <sys/stat.h>
37 #endif
38 #include <sys/time.h>
39 #ifdef BHYVE_SNAPSHOT
40 #include <sys/un.h>
41 #endif
42 
43 #include <machine/atomic.h>
44 
45 #ifndef WITHOUT_CAPSICUM
46 #include <capsicum_helpers.h>
47 #endif
48 #include <stdio.h>
49 #include <stdlib.h>
50 #include <string.h>
51 #include <err.h>
52 #include <errno.h>
53 #ifdef BHYVE_SNAPSHOT
54 #include <fcntl.h>
55 #endif
56 #include <libgen.h>
57 #include <unistd.h>
58 #include <assert.h>
59 #include <pthread.h>
60 #include <pthread_np.h>
61 #include <sysexits.h>
62 #include <stdbool.h>
63 #include <stdint.h>
64 #ifdef BHYVE_SNAPSHOT
65 #include <ucl.h>
66 #include <unistd.h>
67 
68 #include <libxo/xo.h>
69 #endif
70 
71 #include <vmmapi.h>
72 
73 #include "acpi.h"
74 #include "bhyverun.h"
75 #include "bootrom.h"
76 #include "config.h"
77 #include "debug.h"
78 #ifdef BHYVE_GDB
79 #include "gdb.h"
80 #endif
81 #include "mem.h"
82 #include "mevent.h"
83 #include "pci_emul.h"
84 #ifdef __amd64__
85 #include "amd64/pci_lpc.h"
86 #endif
87 #include "qemu_fwcfg.h"
88 #ifdef BHYVE_SNAPSHOT
89 #include "snapshot.h"
90 #endif
91 #include "tpm_device.h"
92 #include "vmgenc.h"
93 #include "vmexit.h"
94 
95 #define MB		(1024UL * 1024)
96 #define GB		(1024UL * MB)
97 
98 int guest_ncpus;
99 uint16_t cpu_cores, cpu_sockets, cpu_threads;
100 
101 int raw_stdio = 0;
102 
103 static char *progname;
104 static const int BSP = 0;
105 
106 static cpuset_t cpumask;
107 
108 static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu);
109 
110 static struct vcpu_info {
111 	struct vmctx	*ctx;
112 	struct vcpu	*vcpu;
113 	int		vcpuid;
114 } *vcpu_info;
115 
116 static cpuset_t **vcpumap;
117 
118 static void
119 usage(int code)
120 {
121 
122 	fprintf(stderr,
123 		"Usage: %s [-AaCDeHhPSuWwxY]\n"
124 		"       %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n"
125 		"       %*s [-G port] [-k config_file] [-l lpc] [-m mem] [-o var=value]\n"
126 		"       %*s [-p vcpu:hostcpu] [-r file] [-s pci] [-U uuid] vmname\n"
127 		"       -A: create ACPI tables\n"
128 		"       -a: local apic is in xAPIC mode (deprecated)\n"
129 		"       -C: include guest memory in core file\n"
130 		"       -c: number of CPUs and/or topology specification\n"
131 		"       -D: destroy on power-off\n"
132 		"       -e: exit on unhandled I/O access\n"
133 		"       -G: start a debug server\n"
134 		"       -H: vmexit from the guest on HLT\n"
135 		"       -h: help\n"
136 		"       -k: key=value flat config file\n"
137 		"       -K: PS2 keyboard layout\n"
138 		"       -l: LPC device configuration\n"
139 		"       -m: memory size\n"
140 		"       -o: set config 'var' to 'value'\n"
141 		"       -P: vmexit from the guest on pause\n"
142 		"       -p: pin 'vcpu' to 'hostcpu'\n"
143 #ifdef BHYVE_SNAPSHOT
144 		"       -r: path to checkpoint file\n"
145 #endif
146 		"       -S: guest memory cannot be swapped\n"
147 		"       -s: <slot,driver,configinfo> PCI slot config\n"
148 		"       -U: UUID\n"
149 		"       -u: RTC keeps UTC time\n"
150 		"       -W: force virtio to use single-vector MSI\n"
151 		"       -w: ignore unimplemented MSRs\n"
152 		"       -x: local APIC is in x2APIC mode\n"
153 		"       -Y: disable MPtable generation\n",
154 		progname, (int)strlen(progname), "", (int)strlen(progname), "",
155 		(int)strlen(progname), "");
156 
157 	exit(code);
158 }
159 
/*
 * Parse the argument of the -c option and store the result in the
 * "cpus", "sockets", "cores" and "threads" configuration values.
 *
 * The argument is a comma-separated list of tokens.  A token of the form
 * "cpus=N", "sockets=N", "cores=N" or "threads=N" sets the matching
 * value; a token without '=' sets "cpus".  Any other "key=value" token
 * is rejected.  The values are stored as strings and are not validated
 * here.
 *
 * XXX This parser is known to have the following issue:
 * 1.  It accepts null tokens (",,") and stores an empty string as the
 *     "cpus" value.
 *
 * An empty specification ('-c ""') is accepted by design, to match the
 * syntax in the manual page; it results in a topology of 1 vCPU.
 *
 * Returns 0 on success and -1 if an unknown "key=value" token is found.
 * Exits with status 4 if memory cannot be allocated.
 */
static int
topology_parse(const char *opt)
{
	static const char *const names[] = {
		"cpus", "sockets", "cores", "threads"
	};
	const size_t nnames = sizeof(names) / sizeof(names[0]);
	char *copy, *rest, *tok;
	size_t i, len;
	int rv;

	if (opt[0] == '\0') {
		set_config_value("sockets", "1");
		set_config_value("cores", "1");
		set_config_value("threads", "1");
		set_config_value("cpus", "1");
		return (0);
	}

	/* strsep() modifies its input, so work on a private copy. */
	copy = strdup(opt);
	if (copy == NULL)
		errx(4, "Failed to allocate memory");

	rv = 0;
	rest = copy;
	while (rv == 0 && (tok = strsep(&rest, ",")) != NULL) {
		/* Look for a recognised "name=" prefix. */
		len = 0;
		for (i = 0; i < nnames; i++) {
			len = strlen(names[i]);
			if (strncmp(tok, names[i], len) == 0 &&
			    tok[len] == '=')
				break;
		}

		if (i < nnames)
			set_config_value(names[i], tok + len + 1);
		else if (strchr(tok, '=') != NULL)
			rv = -1;	/* unknown key: stop parsing */
		else
			set_config_value("cpus", tok);
	}

	free(copy);
	return (rv);
}
206 
/*
 * Convert the configuration string 'value' to an integer in the range
 * [minval, maxval].
 *
 * The string is parsed by strtol() with base 0, so decimal, octal ("0"
 * prefix) and hexadecimal ("0x" prefix) notations are accepted.  The
 * whole string must be consumed by the conversion.
 *
 * 'key' is only used in the diagnostic.  On any failure (empty string,
 * trailing characters, conversion error or out-of-range value) the
 * process exits with status 4.
 */
static int
parse_int_value(const char *key, const char *value, int minval, int maxval)
{
	char *end;
	long parsed;

	errno = 0;
	parsed = strtol(value, &end, 0);
	if (errno != 0)
		goto invalid;
	if (end == value || *end != '\0')
		goto invalid;
	if (parsed < minval || parsed > maxval)
		goto invalid;
	return (parsed);

invalid:
	errx(4, "Invalid value for %s: '%s'", key, value);
}
220 
221 /*
222  * Set the sockets, cores, threads, and guest_cpus variables based on
223  * the configured topology.
224  *
225  * The limits of UINT16_MAX are due to the types passed to
226  * vm_set_topology().  vmm.ko may enforce tighter limits.
227  */
228 static void
229 calc_topology(void)
230 {
231 	const char *value;
232 	bool explicit_cpus;
233 	uint64_t ncpus;
234 
235 	value = get_config_value("cpus");
236 	if (value != NULL) {
237 		guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX);
238 		explicit_cpus = true;
239 	} else {
240 		guest_ncpus = 1;
241 		explicit_cpus = false;
242 	}
243 	value = get_config_value("cores");
244 	if (value != NULL)
245 		cpu_cores = parse_int_value("cores", value, 1, UINT16_MAX);
246 	else
247 		cpu_cores = 1;
248 	value = get_config_value("threads");
249 	if (value != NULL)
250 		cpu_threads = parse_int_value("threads", value, 1, UINT16_MAX);
251 	else
252 		cpu_threads = 1;
253 	value = get_config_value("sockets");
254 	if (value != NULL)
255 		cpu_sockets = parse_int_value("sockets", value, 1, UINT16_MAX);
256 	else
257 		cpu_sockets = guest_ncpus;
258 
259 	/*
260 	 * Compute sockets * cores * threads avoiding overflow.  The
261 	 * range check above insures these are 16 bit values.
262 	 */
263 	ncpus = (uint64_t)cpu_sockets * cpu_cores * cpu_threads;
264 	if (ncpus > UINT16_MAX)
265 		errx(4, "Computed number of vCPUs too high: %ju",
266 		    (uintmax_t)ncpus);
267 
268 	if (explicit_cpus) {
269 		if (guest_ncpus != (int)ncpus)
270 			errx(4, "Topology (%d sockets, %d cores, %d threads) "
271 			    "does not match %d vCPUs",
272 			    cpu_sockets, cpu_cores, cpu_threads,
273 			    guest_ncpus);
274 	} else
275 		guest_ncpus = ncpus;
276 }
277 
278 static int
279 pincpu_parse(const char *opt)
280 {
281 	const char *value;
282 	char *newval;
283 	char key[16];
284 	int vcpu, pcpu;
285 
286 	if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) {
287 		fprintf(stderr, "invalid format: %s\n", opt);
288 		return (-1);
289 	}
290 
291 	if (vcpu < 0) {
292 		fprintf(stderr, "invalid vcpu '%d'\n", vcpu);
293 		return (-1);
294 	}
295 
296 	if (pcpu < 0 || pcpu >= CPU_SETSIZE) {
297 		fprintf(stderr, "hostcpu '%d' outside valid range from "
298 		    "0 to %d\n", pcpu, CPU_SETSIZE - 1);
299 		return (-1);
300 	}
301 
302 	snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu);
303 	value = get_config_value(key);
304 
305 	if (asprintf(&newval, "%s%s%d", value != NULL ? value : "",
306 	    value != NULL ? "," : "", pcpu) == -1) {
307 		perror("failed to build new cpuset string");
308 		return (-1);
309 	}
310 
311 	set_config_value(key, newval);
312 	free(newval);
313 	return (0);
314 }
315 
316 static void
317 parse_cpuset(int vcpu, const char *list, cpuset_t *set)
318 {
319 	char *cp, *token;
320 	int pcpu, start;
321 
322 	CPU_ZERO(set);
323 	start = -1;
324 	token = __DECONST(char *, list);
325 	for (;;) {
326 		pcpu = strtoul(token, &cp, 0);
327 		if (cp == token)
328 			errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list);
329 		if (pcpu < 0 || pcpu >= CPU_SETSIZE)
330 			errx(4, "hostcpu '%d' outside valid range from 0 to %d",
331 			    pcpu, CPU_SETSIZE - 1);
332 		switch (*cp) {
333 		case ',':
334 		case '\0':
335 			if (start >= 0) {
336 				if (start > pcpu)
337 					errx(4, "Invalid hostcpu range %d-%d",
338 					    start, pcpu);
339 				while (start < pcpu) {
340 					CPU_SET(start, set);
341 					start++;
342 				}
343 				start = -1;
344 			}
345 			CPU_SET(pcpu, set);
346 			break;
347 		case '-':
348 			if (start >= 0)
349 				errx(4, "invalid cpuset for vcpu %d: '%s'",
350 				    vcpu, list);
351 			start = pcpu;
352 			break;
353 		default:
354 			errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list);
355 		}
356 		if (*cp == '\0')
357 			break;
358 		token = cp + 1;
359 	}
360 }
361 
362 static void
363 build_vcpumaps(void)
364 {
365 	char key[16];
366 	const char *value;
367 	int vcpu;
368 
369 	vcpumap = calloc(guest_ncpus, sizeof(*vcpumap));
370 	for (vcpu = 0; vcpu < guest_ncpus; vcpu++) {
371 		snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu);
372 		value = get_config_value(key);
373 		if (value == NULL)
374 			continue;
375 		vcpumap[vcpu] = malloc(sizeof(cpuset_t));
376 		if (vcpumap[vcpu] == NULL)
377 			err(4, "Failed to allocate cpuset for vcpu %d", vcpu);
378 		parse_cpuset(vcpu, value, vcpumap[vcpu]);
379 	}
380 }
381 
/*
 * Thin wrapper around vm_map_gpa(): returns whatever that function
 * yields for the guest address 'gaddr' and length 'len'.
 */
void *
paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
{
	void *host;

	host = vm_map_gpa(ctx, gaddr, len);
	return (host);
}
388 
#ifdef BHYVE_SNAPSHOT
/*
 * Thin wrapper around vm_rev_map_gpa(): returns whatever that function
 * yields for the host pointer 'addr'.  Only built with snapshot support.
 */
uintptr_t
paddr_host2guest(struct vmctx *ctx, void *addr)
{
	uintptr_t guest;

	guest = vm_rev_map_gpa(ctx, addr);
	return (guest);
}
#endif
396 
/*
 * Report the "virtio_msix" configuration boolean, which defaults to true
 * when unset.  The -W command line option sets it to false.
 */
int
fbsdrun_virtio_msix(void)
{
	int enabled;

	enabled = get_config_bool_default("virtio_msix", true);
	return (enabled);
}
403 
404 struct vcpu *
405 fbsdrun_vcpu(int vcpuid)
406 {
407 	return (vcpu_info[vcpuid].vcpu);
408 }
409 
410 static void *
411 fbsdrun_start_thread(void *param)
412 {
413 	char tname[MAXCOMLEN + 1];
414 	struct vcpu_info *vi = param;
415 	int error;
416 
417 	snprintf(tname, sizeof(tname), "vcpu %d", vi->vcpuid);
418 	pthread_set_name_np(pthread_self(), tname);
419 
420 	if (vcpumap[vi->vcpuid] != NULL) {
421 		error = pthread_setaffinity_np(pthread_self(),
422 		    sizeof(cpuset_t), vcpumap[vi->vcpuid]);
423 		assert(error == 0);
424 	}
425 
426 #ifdef BHYVE_SNAPSHOT
427 	checkpoint_cpu_add(vi->vcpuid);
428 #endif
429 #ifdef BHYVE_GDB
430 	gdb_cpu_add(vi->vcpu);
431 #endif
432 
433 	vm_loop(vi->ctx, vi->vcpu);
434 
435 	/* not reached */
436 	exit(1);
437 	return (NULL);
438 }
439 
440 void
441 fbsdrun_addcpu(int vcpuid)
442 {
443 	struct vcpu_info *vi;
444 	pthread_t thr;
445 	int error;
446 
447 	vi = &vcpu_info[vcpuid];
448 
449 	error = vm_activate_cpu(vi->vcpu);
450 	if (error != 0)
451 		err(EX_OSERR, "could not activate CPU %d", vi->vcpuid);
452 
453 	CPU_SET_ATOMIC(vcpuid, &cpumask);
454 
455 	vm_suspend_cpu(vi->vcpu);
456 
457 	error = pthread_create(&thr, NULL, fbsdrun_start_thread, vi);
458 	assert(error == 0);
459 }
460 
461 void
462 fbsdrun_deletecpu(int vcpu)
463 {
464 	static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER;
465 	static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER;
466 
467 	pthread_mutex_lock(&resetcpu_mtx);
468 	if (!CPU_ISSET(vcpu, &cpumask)) {
469 		EPRINTLN("Attempting to delete unknown cpu %d", vcpu);
470 		exit(4);
471 	}
472 
473 	CPU_CLR(vcpu, &cpumask);
474 
475 	if (vcpu != BSP) {
476 		pthread_cond_signal(&resetcpu_cond);
477 		pthread_mutex_unlock(&resetcpu_mtx);
478 		pthread_exit(NULL);
479 		/* NOTREACHED */
480 	}
481 
482 	while (!CPU_EMPTY(&cpumask)) {
483 		pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx);
484 	}
485 	pthread_mutex_unlock(&resetcpu_mtx);
486 }
487 
488 int
489 fbsdrun_suspendcpu(int vcpuid)
490 {
491 	return (vm_suspend_cpu(vcpu_info[vcpuid].vcpu));
492 }
493 
494 static void
495 vm_loop(struct vmctx *ctx, struct vcpu *vcpu)
496 {
497 	struct vm_exit vme;
498 	struct vm_run vmrun;
499 	int error, rc;
500 	enum vm_exitcode exitcode;
501 	cpuset_t active_cpus, dmask;
502 
503 	error = vm_active_cpus(ctx, &active_cpus);
504 	assert(CPU_ISSET(vcpu_id(vcpu), &active_cpus));
505 
506 	vmrun.vm_exit = &vme;
507 	vmrun.cpuset = &dmask;
508 	vmrun.cpusetsize = sizeof(dmask);
509 
510 	while (1) {
511 		error = vm_run(vcpu, &vmrun);
512 		if (error != 0)
513 			break;
514 
515 		exitcode = vme.exitcode;
516 		if (exitcode >= VM_EXITCODE_MAX ||
517 		    vmexit_handlers[exitcode] == NULL) {
518 			warnx("vm_loop: unexpected exitcode 0x%x", exitcode);
519 			exit(4);
520 		}
521 
522 		rc = (*vmexit_handlers[exitcode])(ctx, vcpu, &vmrun);
523 
524 		switch (rc) {
525 		case VMEXIT_CONTINUE:
526 			break;
527 		case VMEXIT_ABORT:
528 			abort();
529 		default:
530 			exit(4);
531 		}
532 	}
533 	EPRINTLN("vm_run error %d, errno %d", error, errno);
534 }
535 
536 static int
537 num_vcpus_allowed(struct vmctx *ctx, struct vcpu *vcpu)
538 {
539 	uint16_t sockets, cores, threads, maxcpus;
540 	int tmp, error;
541 
542 	/*
543 	 * The guest is allowed to spinup more than one processor only if the
544 	 * UNRESTRICTED_GUEST capability is available.
545 	 */
546 	error = vm_get_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, &tmp);
547 	if (error != 0)
548 		return (1);
549 
550 	error = vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus);
551 	if (error == 0)
552 		return (maxcpus);
553 	else
554 		return (1);
555 }
556 
557 static struct vmctx *
558 do_open(const char *vmname)
559 {
560 	struct vmctx *ctx;
561 	int error;
562 	bool reinit, romboot;
563 
564 	reinit = romboot = false;
565 
566 #ifdef __amd64__
567 	if (lpc_bootrom())
568 		romboot = true;
569 #endif
570 
571 	error = vm_create(vmname);
572 	if (error) {
573 		if (errno == EEXIST) {
574 			if (romboot) {
575 				reinit = true;
576 			} else {
577 				/*
578 				 * The virtual machine has been setup by the
579 				 * userspace bootloader.
580 				 */
581 			}
582 		} else {
583 			perror("vm_create");
584 			exit(4);
585 		}
586 	} else {
587 		if (!romboot) {
588 			/*
589 			 * If the virtual machine was just created then a
590 			 * bootrom must be configured to boot it.
591 			 */
592 			fprintf(stderr, "virtual machine cannot be booted\n");
593 			exit(4);
594 		}
595 	}
596 
597 	ctx = vm_open(vmname);
598 	if (ctx == NULL) {
599 		perror("vm_open");
600 		exit(4);
601 	}
602 
603 #ifndef WITHOUT_CAPSICUM
604 	if (vm_limit_rights(ctx) != 0)
605 		err(EX_OSERR, "vm_limit_rights");
606 #endif
607 
608 	if (reinit) {
609 		error = vm_reinit(ctx);
610 		if (error) {
611 			perror("vm_reinit");
612 			exit(4);
613 		}
614 	}
615 	error = vm_set_topology(ctx, cpu_sockets, cpu_cores, cpu_threads, 0);
616 	if (error)
617 		errx(EX_OSERR, "vm_set_topology");
618 	return (ctx);
619 }
620 
/*
 * Parse a "path=value" string and store 'value' under 'path' in the
 * configuration.
 *
 * The string is split at the first '='.  Returns false, without changing
 * the configuration, when there is no '=' or when nothing follows it;
 * returns true otherwise.  Exits with status 4 if memory cannot be
 * allocated.
 */
static bool
parse_config_option(const char *option)
{
	const char *value;
	char *path;

	value = strchr(option, '=');
	if (value == NULL || value[1] == '\0')
		return (false);
	path = strndup(option, value - option);
	if (path == NULL)
		err(4, "Failed to allocate memory");
	set_config_value(path, value + 1);
	/*
	 * The temporary copy of the path is no longer needed.  Other
	 * callers in this file hand set_config_value() stack buffers and
	 * buffers they free right after the call, so the pointer is not
	 * retained.
	 */
	free(path);
	return (true);
}
636 
/*
 * Read the flat configuration file at 'path' and apply every
 * "path=value" line with parse_config_option().
 *
 * Lines that start with '#' and lines consisting only of a newline are
 * skipped.  Exits with status 4 if the file cannot be opened or if a
 * line is not a valid option; the diagnostic names the file and the
 * 1-based line number.
 */
static void
parse_simple_config_file(const char *path)
{
	FILE *fp;
	char *line;
	size_t linecap;
	unsigned int lineno;

	fp = fopen(path, "r");
	if (fp == NULL)
		err(4, "Failed to open configuration file %s", path);

	line = NULL;
	linecap = 0;
	lineno = 0;
	while (getline(&line, &linecap, fp) > 0) {
		lineno++;

		/* Skip comments and empty lines. */
		if (line[0] == '#' || line[0] == '\n')
			continue;

		/* Cut the line at its newline, if it has one. */
		line[strcspn(line, "\n")] = '\0';

		if (!parse_config_option(line))
			errx(4, "%s line %u: invalid config option '%s'", path,
			    lineno, line);
	}

	free(line);
	fclose(fp);
}
664 
#ifdef BHYVE_GDB
/*
 * Parse the argument of the -G option, "[w][address:]port".
 *
 * A leading 'w' sets the "gdb.wait" configuration boolean.  If the rest
 * contains a ':', the text before the last ':' is stored as
 * "gdb.address" and the text after it as "gdb.port"; otherwise the whole
 * remainder is stored as "gdb.port".
 *
 * The string is split in place: the last ':' is overwritten with a NUL.
 */
static void
parse_gdb_options(const char *opt)
{
	const char *port;
	char *sep;

	if (*opt == 'w') {
		set_config_bool("gdb.wait", true);
		opt++;
	}

	port = opt;
	sep = strrchr(opt, ':');
	if (sep != NULL) {
		/* Terminate the address part, then step past the ':'. */
		*sep = '\0';
		set_config_value("gdb.address", opt);
		port = sep + 1;
	}

	set_config_value("gdb.port", port);
}
#endif
690 
691 int
692 main(int argc, char *argv[])
693 {
694 	int c, error;
695 	int max_vcpus, memflags;
696 	struct vcpu *bsp;
697 	struct vmctx *ctx;
698 	size_t memsize;
699 	const char *optstr, *value, *vmname;
700 #ifdef BHYVE_SNAPSHOT
701 	char *restore_file;
702 	struct restore_state rstate;
703 
704 	restore_file = NULL;
705 #endif
706 
707 	bhyve_init_config();
708 
709 	progname = basename(argv[0]);
710 
711 #ifdef BHYVE_SNAPSHOT
712 	optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:r:";
713 #else
714 	optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:";
715 #endif
716 	while ((c = getopt(argc, argv, optstr)) != -1) {
717 		switch (c) {
718 #ifdef __amd64__
719 		case 'a':
720 			set_config_bool("x86.x2apic", false);
721 			break;
722 #endif
723 		case 'A':
724 			/*
725 			 * NOP. For backward compatibility. Most systems don't
726 			 * work properly without sane ACPI tables. Therefore,
727 			 * we're always generating them.
728 			 */
729 			break;
730 		case 'D':
731 			set_config_bool("destroy_on_poweroff", true);
732 			break;
733 		case 'p':
734 			if (pincpu_parse(optarg) != 0) {
735 				errx(EX_USAGE, "invalid vcpu pinning "
736 				    "configuration '%s'", optarg);
737 			}
738 			break;
739 		case 'c':
740 			if (topology_parse(optarg) != 0) {
741 			    errx(EX_USAGE, "invalid cpu topology "
742 				"'%s'", optarg);
743 			}
744 			break;
745 		case 'C':
746 			set_config_bool("memory.guest_in_core", true);
747 			break;
748 		case 'f':
749 			if (qemu_fwcfg_parse_cmdline_arg(optarg) != 0) {
750 			    errx(EX_USAGE, "invalid fwcfg item '%s'", optarg);
751 			}
752 			break;
753 #ifdef BHYVE_GDB
754 		case 'G':
755 			parse_gdb_options(optarg);
756 			break;
757 #endif
758 		case 'k':
759 			parse_simple_config_file(optarg);
760 			break;
761 		case 'K':
762 			set_config_value("keyboard.layout", optarg);
763 			break;
764 #ifdef __amd64__
765 		case 'l':
766 			if (strncmp(optarg, "help", strlen(optarg)) == 0) {
767 				lpc_print_supported_devices();
768 				exit(0);
769 			} else if (lpc_device_parse(optarg) != 0) {
770 				errx(EX_USAGE, "invalid lpc device "
771 				    "configuration '%s'", optarg);
772 			}
773 			break;
774 #endif
775 #ifdef BHYVE_SNAPSHOT
776 		case 'r':
777 			restore_file = optarg;
778 			break;
779 #endif
780 		case 's':
781 			if (strncmp(optarg, "help", strlen(optarg)) == 0) {
782 				pci_print_supported_devices();
783 				exit(0);
784 			} else if (pci_parse_slot(optarg) != 0)
785 				exit(4);
786 			else
787 				break;
788 		case 'S':
789 			set_config_bool("memory.wired", true);
790 			break;
791 		case 'm':
792 			set_config_value("memory.size", optarg);
793 			break;
794 		case 'o':
795 			if (!parse_config_option(optarg))
796 				errx(EX_USAGE, "invalid configuration option '%s'", optarg);
797 			break;
798 #ifdef __amd64__
799 		case 'H':
800 			set_config_bool("x86.vmexit_on_hlt", true);
801 			break;
802 		case 'I':
803 			/*
804 			 * The "-I" option was used to add an ioapic to the
805 			 * virtual machine.
806 			 *
807 			 * An ioapic is now provided unconditionally for each
808 			 * virtual machine and this option is now deprecated.
809 			 */
810 			break;
811 		case 'P':
812 			set_config_bool("x86.vmexit_on_pause", true);
813 			break;
814 		case 'e':
815 			set_config_bool("x86.strictio", true);
816 			break;
817 		case 'u':
818 			set_config_bool("rtc.use_localtime", false);
819 			break;
820 #endif
821 		case 'U':
822 			set_config_value("uuid", optarg);
823 			break;
824 #ifdef __amd64__
825 		case 'w':
826 			set_config_bool("x86.strictmsr", false);
827 			break;
828 #endif
829 		case 'W':
830 			set_config_bool("virtio_msix", false);
831 			break;
832 #ifdef __amd64__
833 		case 'x':
834 			set_config_bool("x86.x2apic", true);
835 			break;
836 		case 'Y':
837 			set_config_bool("x86.mptable", false);
838 			break;
839 #endif
840 		case 'h':
841 			usage(0);
842 		default:
843 			usage(1);
844 		}
845 	}
846 	argc -= optind;
847 	argv += optind;
848 
849 	if (argc > 1)
850 		usage(1);
851 
852 #ifdef BHYVE_SNAPSHOT
853 	if (restore_file != NULL) {
854 		error = load_restore_file(restore_file, &rstate);
855 		if (error) {
856 			fprintf(stderr, "Failed to read checkpoint info from "
857 					"file: '%s'.\n", restore_file);
858 			exit(1);
859 		}
860 		vmname = lookup_vmname(&rstate);
861 		if (vmname != NULL)
862 			set_config_value("name", vmname);
863 	}
864 #endif
865 
866 	if (argc == 1)
867 		set_config_value("name", argv[0]);
868 
869 	vmname = get_config_value("name");
870 	if (vmname == NULL)
871 		usage(1);
872 
873 	if (get_config_bool_default("config.dump", false)) {
874 		dump_config();
875 		exit(1);
876 	}
877 
878 	calc_topology();
879 	build_vcpumaps();
880 
881 	value = get_config_value("memory.size");
882 	error = vm_parse_memsize(value, &memsize);
883 	if (error)
884 		errx(EX_USAGE, "invalid memsize '%s'", value);
885 
886 	ctx = do_open(vmname);
887 
888 #ifdef BHYVE_SNAPSHOT
889 	if (restore_file != NULL) {
890 		guest_ncpus = lookup_guest_ncpus(&rstate);
891 		memflags = lookup_memflags(&rstate);
892 		memsize = lookup_memsize(&rstate);
893 	}
894 
895 	if (guest_ncpus < 1) {
896 		fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus);
897 		exit(1);
898 	}
899 #endif
900 
901 	bsp = vm_vcpu_open(ctx, BSP);
902 	max_vcpus = num_vcpus_allowed(ctx, bsp);
903 	if (guest_ncpus > max_vcpus) {
904 		fprintf(stderr, "%d vCPUs requested but only %d available\n",
905 			guest_ncpus, max_vcpus);
906 		exit(4);
907 	}
908 
909 	bhyve_init_vcpu(bsp);
910 
911 	/* Allocate per-VCPU resources. */
912 	vcpu_info = calloc(guest_ncpus, sizeof(*vcpu_info));
913 	for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) {
914 		vcpu_info[vcpuid].ctx = ctx;
915 		vcpu_info[vcpuid].vcpuid = vcpuid;
916 		if (vcpuid == BSP)
917 			vcpu_info[vcpuid].vcpu = bsp;
918 		else
919 			vcpu_info[vcpuid].vcpu = vm_vcpu_open(ctx, vcpuid);
920 	}
921 
922 	memflags = 0;
923 	if (get_config_bool_default("memory.wired", false))
924 		memflags |= VM_MEM_F_WIRED;
925 	if (get_config_bool_default("memory.guest_in_core", false))
926 		memflags |= VM_MEM_F_INCORE;
927 	vm_set_memflags(ctx, memflags);
928 	error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
929 	if (error) {
930 		fprintf(stderr, "Unable to setup memory (%d)\n", errno);
931 		exit(4);
932 	}
933 
934 	init_mem(guest_ncpus);
935 	init_bootrom(ctx);
936 	if (bhyve_init_platform(ctx, bsp) != 0)
937 		exit(4);
938 
939 	if (qemu_fwcfg_init(ctx) != 0) {
940 		fprintf(stderr, "qemu fwcfg initialization error\n");
941 		exit(4);
942 	}
943 
944 	if (qemu_fwcfg_add_file("opt/bhyve/hw.ncpu", sizeof(guest_ncpus),
945 	    &guest_ncpus) != 0) {
946 		fprintf(stderr, "Could not add qemu fwcfg opt/bhyve/hw.ncpu\n");
947 		exit(4);
948 	}
949 
950 	/*
951 	 * Exit if a device emulation finds an error in its initialization
952 	 */
953 	if (init_pci(ctx) != 0) {
954 		EPRINTLN("Device emulation initialization error: %s",
955 		    strerror(errno));
956 		exit(4);
957 	}
958 	if (init_tpm(ctx) != 0) {
959 		EPRINTLN("Failed to init TPM device");
960 		exit(4);
961 	}
962 
963 	/*
964 	 * Initialize after PCI, to allow a bootrom file to reserve the high
965 	 * region.
966 	 */
967 	if (get_config_bool("acpi_tables"))
968 		vmgenc_init(ctx);
969 
970 #ifdef BHYVE_GDB
971 	init_gdb(ctx);
972 #endif
973 
974 	/*
975 	 * Add all vCPUs.
976 	 */
977 	for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++)
978 		bhyve_start_vcpu(vcpu_info[vcpuid].vcpu, vcpuid == BSP);
979 
980 #ifdef BHYVE_SNAPSHOT
981 	if (restore_file != NULL) {
982 		FPRINTLN(stdout, "Pausing pci devs...");
983 		if (vm_pause_devices() != 0) {
984 			EPRINTLN("Failed to pause PCI device state.");
985 			exit(1);
986 		}
987 
988 		FPRINTLN(stdout, "Restoring vm mem...");
989 		if (restore_vm_mem(ctx, &rstate) != 0) {
990 			EPRINTLN("Failed to restore VM memory.");
991 			exit(1);
992 		}
993 
994 		FPRINTLN(stdout, "Restoring pci devs...");
995 		if (vm_restore_devices(&rstate) != 0) {
996 			EPRINTLN("Failed to restore PCI device state.");
997 			exit(1);
998 		}
999 
1000 		FPRINTLN(stdout, "Restoring kernel structs...");
1001 		if (vm_restore_kern_structs(ctx, &rstate) != 0) {
1002 			EPRINTLN("Failed to restore kernel structs.");
1003 			exit(1);
1004 		}
1005 
1006 		FPRINTLN(stdout, "Resuming pci devs...");
1007 		if (vm_resume_devices() != 0) {
1008 			EPRINTLN("Failed to resume PCI device state.");
1009 			exit(1);
1010 		}
1011 	}
1012 #endif
1013 
1014 	if (bhyve_init_platform_late(ctx, bsp) != 0)
1015 		exit(4);
1016 
1017 	/*
1018 	 * Change the proc title to include the VM name.
1019 	 */
1020 	setproctitle("%s", vmname);
1021 
1022 #ifdef BHYVE_SNAPSHOT
1023 	/*
1024 	 * checkpointing thread for communication with bhyvectl
1025 	 */
1026 	if (init_checkpoint_thread(ctx) != 0)
1027 		errx(EX_OSERR, "Failed to start checkpoint thread");
1028 #endif
1029 
1030 #ifndef WITHOUT_CAPSICUM
1031 	caph_cache_catpages();
1032 
1033 	if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1)
1034 		errx(EX_OSERR, "Unable to apply rights for sandbox");
1035 
1036 	if (caph_enter() == -1)
1037 		errx(EX_OSERR, "cap_enter() failed");
1038 #endif
1039 
1040 #ifdef BHYVE_SNAPSHOT
1041 	if (restore_file != NULL) {
1042 		destroy_restore_state(&rstate);
1043 		if (vm_restore_time(ctx) < 0)
1044 			err(EX_OSERR, "Unable to restore time");
1045 
1046 		for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++)
1047 			vm_resume_cpu(vcpu_info[vcpuid].vcpu);
1048 	} else
1049 #endif
1050 		vm_resume_cpu(bsp);
1051 
1052 	/*
1053 	 * Head off to the main event dispatch loop
1054 	 */
1055 	mevent_dispatch();
1056 
1057 	exit(4);
1058 }
1059