xref: /freebsd/usr.sbin/bhyve/bhyverun.c (revision 1d386b48)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 #include <sys/types.h>
31 #ifndef WITHOUT_CAPSICUM
32 #include <sys/capsicum.h>
33 #endif
34 #include <sys/mman.h>
35 #ifdef BHYVE_SNAPSHOT
36 #include <sys/socket.h>
37 #include <sys/stat.h>
38 #endif
39 #include <sys/time.h>
40 #ifdef BHYVE_SNAPSHOT
41 #include <sys/un.h>
42 #endif
43 
44 #include <amd64/vmm/intel/vmcs.h>
45 #include <x86/apicreg.h>
46 
47 #include <machine/atomic.h>
48 #include <machine/segments.h>
49 
50 #ifndef WITHOUT_CAPSICUM
51 #include <capsicum_helpers.h>
52 #endif
53 #include <stdio.h>
54 #include <stdlib.h>
55 #include <string.h>
56 #include <err.h>
57 #include <errno.h>
58 #ifdef BHYVE_SNAPSHOT
59 #include <fcntl.h>
60 #endif
61 #include <libgen.h>
62 #include <unistd.h>
63 #include <assert.h>
64 #include <pthread.h>
65 #include <pthread_np.h>
66 #include <sysexits.h>
67 #include <stdbool.h>
68 #include <stdint.h>
69 #ifdef BHYVE_SNAPSHOT
70 #include <ucl.h>
71 #include <unistd.h>
72 
73 #include <libxo/xo.h>
74 #endif
75 
76 #include <machine/vmm.h>
77 #ifndef WITHOUT_CAPSICUM
78 #include <machine/vmm_dev.h>
79 #endif
80 #include <machine/vmm_instruction_emul.h>
81 #include <vmmapi.h>
82 
83 #include "bhyverun.h"
84 #include "acpi.h"
85 #include "atkbdc.h"
86 #include "bootrom.h"
87 #include "config.h"
88 #include "inout.h"
89 #include "debug.h"
90 #include "e820.h"
91 #include "fwctl.h"
92 #include "gdb.h"
93 #include "ioapic.h"
94 #include "kernemu_dev.h"
95 #include "mem.h"
96 #include "mevent.h"
97 #include "mptbl.h"
98 #include "pci_emul.h"
99 #include "pci_irq.h"
100 #include "pci_lpc.h"
101 #include "qemu_fwcfg.h"
102 #include "smbiostbl.h"
103 #ifdef BHYVE_SNAPSHOT
104 #include "snapshot.h"
105 #endif
106 #include "xmsr.h"
107 #include "spinup_ap.h"
108 #include "rtc.h"
109 #include "vmgenc.h"
110 
/* Binary size multipliers as unsigned long: MB is 2^20, GB is 2^30. */
#define MB		(1024UL * 1024)
#define GB		(1024UL * MB)
113 
/*
 * Human-readable descriptions of VMX exit reasons, indexed by the
 * EXIT_REASON_* constants.  Because designated initializers are used,
 * any index without an explicit entry is NULL; vmexit_vmx_desc() checks
 * both the array bound and for NULL before using an entry.
 */
static const char * const vmx_exit_reason_desc[] = {
	[EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)",
	[EXIT_REASON_EXT_INTR] = "External interrupt",
	[EXIT_REASON_TRIPLE_FAULT] = "Triple fault",
	[EXIT_REASON_INIT] = "INIT signal",
	[EXIT_REASON_SIPI] = "Start-up IPI (SIPI)",
	[EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)",
	[EXIT_REASON_SMI] = "Other SMI",
	[EXIT_REASON_INTR_WINDOW] = "Interrupt window",
	[EXIT_REASON_NMI_WINDOW] = "NMI window",
	[EXIT_REASON_TASK_SWITCH] = "Task switch",
	[EXIT_REASON_CPUID] = "CPUID",
	[EXIT_REASON_GETSEC] = "GETSEC",
	[EXIT_REASON_HLT] = "HLT",
	[EXIT_REASON_INVD] = "INVD",
	[EXIT_REASON_INVLPG] = "INVLPG",
	[EXIT_REASON_RDPMC] = "RDPMC",
	[EXIT_REASON_RDTSC] = "RDTSC",
	[EXIT_REASON_RSM] = "RSM",
	[EXIT_REASON_VMCALL] = "VMCALL",
	[EXIT_REASON_VMCLEAR] = "VMCLEAR",
	[EXIT_REASON_VMLAUNCH] = "VMLAUNCH",
	[EXIT_REASON_VMPTRLD] = "VMPTRLD",
	[EXIT_REASON_VMPTRST] = "VMPTRST",
	[EXIT_REASON_VMREAD] = "VMREAD",
	[EXIT_REASON_VMRESUME] = "VMRESUME",
	[EXIT_REASON_VMWRITE] = "VMWRITE",
	[EXIT_REASON_VMXOFF] = "VMXOFF",
	[EXIT_REASON_VMXON] = "VMXON",
	[EXIT_REASON_CR_ACCESS] = "Control-register accesses",
	[EXIT_REASON_DR_ACCESS] = "MOV DR",
	[EXIT_REASON_INOUT] = "I/O instruction",
	[EXIT_REASON_RDMSR] = "RDMSR",
	[EXIT_REASON_WRMSR] = "WRMSR",
	[EXIT_REASON_INVAL_VMCS] =
	    "VM-entry failure due to invalid guest state",
	[EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading",
	[EXIT_REASON_MWAIT] = "MWAIT",
	[EXIT_REASON_MTF] = "Monitor trap flag",
	[EXIT_REASON_MONITOR] = "MONITOR",
	[EXIT_REASON_PAUSE] = "PAUSE",
	[EXIT_REASON_MCE_DURING_ENTRY] =
	    "VM-entry failure due to machine-check event",
	[EXIT_REASON_TPR] = "TPR below threshold",
	[EXIT_REASON_APIC_ACCESS] = "APIC access",
	[EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI",
	[EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR",
	[EXIT_REASON_LDTR_TR] = "Access to LDTR or TR",
	[EXIT_REASON_EPT_FAULT] = "EPT violation",
	[EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration",
	[EXIT_REASON_INVEPT] = "INVEPT",
	[EXIT_REASON_RDTSCP] = "RDTSCP",
	[EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired",
	[EXIT_REASON_INVVPID] = "INVVPID",
	[EXIT_REASON_WBINVD] = "WBINVD",
	[EXIT_REASON_XSETBV] = "XSETBV",
	[EXIT_REASON_APIC_WRITE] = "APIC write",
	[EXIT_REASON_RDRAND] = "RDRAND",
	[EXIT_REASON_INVPCID] = "INVPCID",
	[EXIT_REASON_VMFUNC] = "VMFUNC",
	[EXIT_REASON_ENCLS] = "ENCLS",
	[EXIT_REASON_RDSEED] = "RDSEED",
	[EXIT_REASON_PM_LOG_FULL] = "Page-modification log full",
	[EXIT_REASON_XSAVES] = "XSAVES",
	[EXIT_REASON_XRSTORS] = "XRSTORS"
};
180 
/*
 * Signature shared by all VM-exit handlers.  vm_loop() interprets the
 * return value: VMEXIT_CONTINUE resumes the vCPU, VMEXIT_ABORT calls
 * abort(), and anything else terminates the process with exit(4).
 */
typedef int (*vmexit_handler_t)(struct vmctx *, struct vcpu *, struct vm_run *);

/* Number of guest vCPUs; computed by calc_topology(). */
int guest_ncpus;
/* Guest CPU topology; computed by calc_topology(), used by do_open(). */
uint16_t cpu_cores, cpu_sockets, cpu_threads;

/*
 * NOTE(review): not referenced in the visible part of this file;
 * presumably consulted by the debug print macros -- confirm.
 */
int raw_stdio = 0;

/* Program name printed by usage(). */
static char *progname;
/* vCPU id that fbsdrun_deletecpu() treats as the bootstrap processor. */
static const int BSP = 0;

/* vCPUs added by fbsdrun_addcpu() and not yet removed by fbsdrun_deletecpu(). */
static cpuset_t cpumask;

static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu);

/*
 * Per-vCPU context handed to each vCPU thread.  The vcpu_info array is
 * indexed by vCPU id (see vmexit_ipi()).
 */
static struct vcpu_info {
	struct vmctx	*ctx;
	struct vcpu	*vcpu;
	int		vcpuid;
} *vcpu_info;

/*
 * Per-vCPU host CPU affinity sets, built by build_vcpumaps().  A NULL
 * entry means no affinity is applied to that vCPU's thread.
 */
static cpuset_t **vcpumap;
202 
203 static void
204 usage(int code)
205 {
206 
207 	fprintf(stderr,
208 		"Usage: %s [-AaCDeHhPSuWwxY]\n"
209 		"       %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n"
210 		"       %*s [-G port] [-k config_file] [-l lpc] [-m mem] [-o var=value]\n"
211 		"       %*s [-p vcpu:hostcpu] [-r file] [-s pci] [-U uuid] vmname\n"
212 		"       -A: create ACPI tables\n"
213 		"       -a: local apic is in xAPIC mode (deprecated)\n"
214 		"       -C: include guest memory in core file\n"
215 		"       -c: number of CPUs and/or topology specification\n"
216 		"       -D: destroy on power-off\n"
217 		"       -e: exit on unhandled I/O access\n"
218 		"       -G: start a debug server\n"
219 		"       -H: vmexit from the guest on HLT\n"
220 		"       -h: help\n"
221 		"       -k: key=value flat config file\n"
222 		"       -K: PS2 keyboard layout\n"
223 		"       -l: LPC device configuration\n"
224 		"       -m: memory size\n"
225 		"       -o: set config 'var' to 'value'\n"
226 		"       -P: vmexit from the guest on pause\n"
227 		"       -p: pin 'vcpu' to 'hostcpu'\n"
228 #ifdef BHYVE_SNAPSHOT
229 		"       -r: path to checkpoint file\n"
230 #endif
231 		"       -S: guest memory cannot be swapped\n"
232 		"       -s: <slot,driver,configinfo> PCI slot config\n"
233 		"       -U: UUID\n"
234 		"       -u: RTC keeps UTC time\n"
235 		"       -W: force virtio to use single-vector MSI\n"
236 		"       -w: ignore unimplemented MSRs\n"
237 		"       -x: local APIC is in x2APIC mode\n"
238 		"       -Y: disable MPtable generation\n",
239 		progname, (int)strlen(progname), "", (int)strlen(progname), "",
240 		(int)strlen(progname), "");
241 
242 	exit(code);
243 }
244 
/*
 * Parse a CPU topology specification and record each element in the
 * configuration via set_config_value().
 *
 * 'opt' is a comma-separated list of "cpus=", "sockets=", "cores=" and
 * "threads=" assignments; a token without any '=' is stored as the
 * value of "cpus".  An empty specification ('-c ""') is accepted by
 * design, to match the manual page syntax, and yields one vCPU
 * (1 socket, 1 core, 1 thread).
 *
 * Returns 0 on success and -1 when a token carries an unrecognized
 * key.  Tokens that precede the offending one have already been stored.
 *
 * XXX Known issue: empty tokens (",,") are accepted and set "cpus" to
 * an empty string.
 */
static int
topology_parse(const char *opt)
{
	static const char * const keys[] = {
		"cpus", "sockets", "cores", "threads"
	};
	const size_t nkeys = sizeof(keys) / sizeof(keys[0]);
	char *buf, *cursor, *tok;
	size_t i, klen;
	int rc;

	if (opt[0] == '\0') {
		set_config_value("sockets", "1");
		set_config_value("cores", "1");
		set_config_value("threads", "1");
		set_config_value("cpus", "1");
		return (0);
	}

	/* strsep() modifies its input, so work on a private copy. */
	buf = strdup(opt);
	if (buf == NULL)
		errx(4, "Failed to allocate memory");

	rc = 0;
	cursor = buf;
	while (rc == 0 && (tok = strsep(&cursor, ",")) != NULL) {
		/* Look for a "<key>=" prefix, in the order listed above. */
		klen = 0;
		for (i = 0; i < nkeys; i++) {
			klen = strlen(keys[i]);
			if (strncmp(tok, keys[i], klen) == 0 &&
			    tok[klen] == '=')
				break;
		}

		if (i < nkeys)
			set_config_value(keys[i], tok + klen + 1);
		else if (strchr(tok, '=') != NULL)
			rc = -1;	/* assignment to an unknown key */
		else
			set_config_value("cpus", tok);
	}

	free(buf);
	return (rc);
}
291 
/*
 * Convert the string 'value' to an integer within [minval, maxval].
 *
 * The base is auto-detected by strtol() (base 0).  The whole string
 * must be consumed and must contain at least one digit.  On any
 * failure the process exits via errx(4, ...) with a message that names
 * 'key'; otherwise the parsed number is returned.
 */
static int
parse_int_value(const char *key, const char *value, int minval, int maxval)
{
	char *end;
	long parsed;
	bool ok;

	errno = 0;
	parsed = strtol(value, &end, 0);

	ok = (errno == 0);
	ok = ok && end != value && *end == '\0';
	ok = ok && parsed >= minval && parsed <= maxval;
	if (!ok)
		errx(4, "Invalid value for %s: '%s'", key, value);

	return (parsed);
}
305 
306 /*
307  * Set the sockets, cores, threads, and guest_cpus variables based on
308  * the configured topology.
309  *
310  * The limits of UINT16_MAX are due to the types passed to
311  * vm_set_topology().  vmm.ko may enforce tighter limits.
312  */
313 static void
314 calc_topology(void)
315 {
316 	const char *value;
317 	bool explicit_cpus;
318 	uint64_t ncpus;
319 
320 	value = get_config_value("cpus");
321 	if (value != NULL) {
322 		guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX);
323 		explicit_cpus = true;
324 	} else {
325 		guest_ncpus = 1;
326 		explicit_cpus = false;
327 	}
328 	value = get_config_value("cores");
329 	if (value != NULL)
330 		cpu_cores = parse_int_value("cores", value, 1, UINT16_MAX);
331 	else
332 		cpu_cores = 1;
333 	value = get_config_value("threads");
334 	if (value != NULL)
335 		cpu_threads = parse_int_value("threads", value, 1, UINT16_MAX);
336 	else
337 		cpu_threads = 1;
338 	value = get_config_value("sockets");
339 	if (value != NULL)
340 		cpu_sockets = parse_int_value("sockets", value, 1, UINT16_MAX);
341 	else
342 		cpu_sockets = guest_ncpus;
343 
344 	/*
345 	 * Compute sockets * cores * threads avoiding overflow.  The
346 	 * range check above insures these are 16 bit values.
347 	 */
348 	ncpus = (uint64_t)cpu_sockets * cpu_cores * cpu_threads;
349 	if (ncpus > UINT16_MAX)
350 		errx(4, "Computed number of vCPUs too high: %ju",
351 		    (uintmax_t)ncpus);
352 
353 	if (explicit_cpus) {
354 		if (guest_ncpus != (int)ncpus)
355 			errx(4, "Topology (%d sockets, %d cores, %d threads) "
356 			    "does not match %d vCPUs",
357 			    cpu_sockets, cpu_cores, cpu_threads,
358 			    guest_ncpus);
359 	} else
360 		guest_ncpus = ncpus;
361 }
362 
363 static int
364 pincpu_parse(const char *opt)
365 {
366 	const char *value;
367 	char *newval;
368 	char key[16];
369 	int vcpu, pcpu;
370 
371 	if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) {
372 		fprintf(stderr, "invalid format: %s\n", opt);
373 		return (-1);
374 	}
375 
376 	if (vcpu < 0) {
377 		fprintf(stderr, "invalid vcpu '%d'\n", vcpu);
378 		return (-1);
379 	}
380 
381 	if (pcpu < 0 || pcpu >= CPU_SETSIZE) {
382 		fprintf(stderr, "hostcpu '%d' outside valid range from "
383 		    "0 to %d\n", pcpu, CPU_SETSIZE - 1);
384 		return (-1);
385 	}
386 
387 	snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu);
388 	value = get_config_value(key);
389 
390 	if (asprintf(&newval, "%s%s%d", value != NULL ? value : "",
391 	    value != NULL ? "," : "", pcpu) == -1) {
392 		perror("failed to build new cpuset string");
393 		return (-1);
394 	}
395 
396 	set_config_value(key, newval);
397 	free(newval);
398 	return (0);
399 }
400 
401 static void
402 parse_cpuset(int vcpu, const char *list, cpuset_t *set)
403 {
404 	char *cp, *token;
405 	int pcpu, start;
406 
407 	CPU_ZERO(set);
408 	start = -1;
409 	token = __DECONST(char *, list);
410 	for (;;) {
411 		pcpu = strtoul(token, &cp, 0);
412 		if (cp == token)
413 			errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list);
414 		if (pcpu < 0 || pcpu >= CPU_SETSIZE)
415 			errx(4, "hostcpu '%d' outside valid range from 0 to %d",
416 			    pcpu, CPU_SETSIZE - 1);
417 		switch (*cp) {
418 		case ',':
419 		case '\0':
420 			if (start >= 0) {
421 				if (start > pcpu)
422 					errx(4, "Invalid hostcpu range %d-%d",
423 					    start, pcpu);
424 				while (start < pcpu) {
425 					CPU_SET(start, set);
426 					start++;
427 				}
428 				start = -1;
429 			}
430 			CPU_SET(pcpu, set);
431 			break;
432 		case '-':
433 			if (start >= 0)
434 				errx(4, "invalid cpuset for vcpu %d: '%s'",
435 				    vcpu, list);
436 			start = pcpu;
437 			break;
438 		default:
439 			errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list);
440 		}
441 		if (*cp == '\0')
442 			break;
443 		token = cp + 1;
444 	}
445 }
446 
447 static void
448 build_vcpumaps(void)
449 {
450 	char key[16];
451 	const char *value;
452 	int vcpu;
453 
454 	vcpumap = calloc(guest_ncpus, sizeof(*vcpumap));
455 	for (vcpu = 0; vcpu < guest_ncpus; vcpu++) {
456 		snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu);
457 		value = get_config_value(key);
458 		if (value == NULL)
459 			continue;
460 		vcpumap[vcpu] = malloc(sizeof(cpuset_t));
461 		if (vcpumap[vcpu] == NULL)
462 			err(4, "Failed to allocate cpuset for vcpu %d", vcpu);
463 		parse_cpuset(vcpu, value, vcpumap[vcpu]);
464 	}
465 }
466 
/*
 * Inject exception 'vector' into 'vcpu', optionally with an error code.
 *
 * vm_inject_exception() is always called with its restart_instruction
 * argument set to 1.  A failure of that call trips an assertion.
 */
void
vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid,
    int errcode)
{
	const int restart = 1;
	int rc;

	rc = vm_inject_exception(vcpu, vector, errcode_valid, errcode,
	    restart);
	assert(rc == 0);
}
479 
/*
 * Translate the guest physical range [gaddr, gaddr + len) to a host
 * pointer by delegating to vm_map_gpa(); its result is returned as-is.
 */
void *
paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
{
	void *hva;

	hva = vm_map_gpa(ctx, gaddr, len);
	return (hva);
}
486 
#ifdef BHYVE_SNAPSHOT
/*
 * Translate host pointer 'addr' back to a guest physical address by
 * delegating to vm_rev_map_gpa().  Only built with BHYVE_SNAPSHOT.
 */
uintptr_t
paddr_host2guest(struct vmctx *ctx, void *addr)
{
	uintptr_t gpa;

	gpa = vm_rev_map_gpa(ctx, addr);
	return (gpa);
}
#endif
494 
/*
 * Report the "virtio_msix" configuration setting; when the setting is
 * absent the default is true.
 */
int
fbsdrun_virtio_msix(void)
{
	int use_msix;

	use_msix = get_config_bool_default("virtio_msix", true);
	return (use_msix);
}
501 
502 static void *
503 fbsdrun_start_thread(void *param)
504 {
505 	char tname[MAXCOMLEN + 1];
506 	struct vcpu_info *vi = param;
507 	int error;
508 
509 	snprintf(tname, sizeof(tname), "vcpu %d", vi->vcpuid);
510 	pthread_set_name_np(pthread_self(), tname);
511 
512 	if (vcpumap[vi->vcpuid] != NULL) {
513 		error = pthread_setaffinity_np(pthread_self(),
514 		    sizeof(cpuset_t), vcpumap[vi->vcpuid]);
515 		assert(error == 0);
516 	}
517 
518 #ifdef BHYVE_SNAPSHOT
519 	checkpoint_cpu_add(vi->vcpuid);
520 #endif
521 	gdb_cpu_add(vi->vcpu);
522 
523 	vm_loop(vi->ctx, vi->vcpu);
524 
525 	/* not reached */
526 	exit(1);
527 	return (NULL);
528 }
529 
530 static void
531 fbsdrun_addcpu(struct vcpu_info *vi)
532 {
533 	pthread_t thr;
534 	int error;
535 
536 	error = vm_activate_cpu(vi->vcpu);
537 	if (error != 0)
538 		err(EX_OSERR, "could not activate CPU %d", vi->vcpuid);
539 
540 	CPU_SET_ATOMIC(vi->vcpuid, &cpumask);
541 
542 	vm_suspend_cpu(vi->vcpu);
543 
544 	error = pthread_create(&thr, NULL, fbsdrun_start_thread, vi);
545 	assert(error == 0);
546 }
547 
548 static void
549 fbsdrun_deletecpu(int vcpu)
550 {
551 	static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER;
552 	static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER;
553 
554 	pthread_mutex_lock(&resetcpu_mtx);
555 	if (!CPU_ISSET(vcpu, &cpumask)) {
556 		fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu);
557 		exit(4);
558 	}
559 
560 	CPU_CLR(vcpu, &cpumask);
561 
562 	if (vcpu != BSP) {
563 		pthread_cond_signal(&resetcpu_cond);
564 		pthread_mutex_unlock(&resetcpu_mtx);
565 		pthread_exit(NULL);
566 		/* NOTREACHED */
567 	}
568 
569 	while (!CPU_EMPTY(&cpumask)) {
570 		pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx);
571 	}
572 	pthread_mutex_unlock(&resetcpu_mtx);
573 }
574 
575 static int
576 vmexit_inout(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun)
577 {
578 	struct vm_exit *vme;
579 	int error;
580 	int bytes, port, in;
581 
582 	vme = vmrun->vm_exit;
583 	port = vme->u.inout.port;
584 	bytes = vme->u.inout.bytes;
585 	in = vme->u.inout.in;
586 
587 	error = emulate_inout(ctx, vcpu, vme);
588 	if (error) {
589 		fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n",
590 		    in ? "in" : "out",
591 		    bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'),
592 		    port, vme->rip);
593 		return (VMEXIT_ABORT);
594 	} else {
595 		return (VMEXIT_CONTINUE);
596 	}
597 }
598 
599 static int
600 vmexit_rdmsr(struct vmctx *ctx __unused, struct vcpu *vcpu,
601     struct vm_run *vmrun)
602 {
603 	struct vm_exit *vme;
604 	uint64_t val;
605 	uint32_t eax, edx;
606 	int error;
607 
608 	vme = vmrun->vm_exit;
609 
610 	val = 0;
611 	error = emulate_rdmsr(vcpu, vme->u.msr.code, &val);
612 	if (error != 0) {
613 		fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",
614 		    vme->u.msr.code, vcpu_id(vcpu));
615 		if (get_config_bool("x86.strictmsr")) {
616 			vm_inject_gp(vcpu);
617 			return (VMEXIT_CONTINUE);
618 		}
619 	}
620 
621 	eax = val;
622 	error = vm_set_register(vcpu, VM_REG_GUEST_RAX, eax);
623 	assert(error == 0);
624 
625 	edx = val >> 32;
626 	error = vm_set_register(vcpu, VM_REG_GUEST_RDX, edx);
627 	assert(error == 0);
628 
629 	return (VMEXIT_CONTINUE);
630 }
631 
632 static int
633 vmexit_wrmsr(struct vmctx *ctx __unused, struct vcpu *vcpu,
634     struct vm_run *vmrun)
635 {
636 	struct vm_exit *vme;
637 	int error;
638 
639 	vme = vmrun->vm_exit;
640 
641 	error = emulate_wrmsr(vcpu, vme->u.msr.code, vme->u.msr.wval);
642 	if (error != 0) {
643 		fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",
644 		    vme->u.msr.code, vme->u.msr.wval, vcpu_id(vcpu));
645 		if (get_config_bool("x86.strictmsr")) {
646 			vm_inject_gp(vcpu);
647 			return (VMEXIT_CONTINUE);
648 		}
649 	}
650 	return (VMEXIT_CONTINUE);
651 }
652 
/*
 * DEBUG_EPT_MISCONFIG is unconditionally defined here.  It enables the
 * extra diagnostics that vmexit_vmx() prints for an EPT
 * misconfiguration exit, along with the storage they use.
 */
#define	DEBUG_EPT_MISCONFIG
#ifdef DEBUG_EPT_MISCONFIG
/* Passed through VMCS_IDENT() to vm_get_register() in vmexit_vmx(). */
#define	VMCS_GUEST_PHYSICAL_ADDRESS	0x00002400

/* Filled in by vmexit_vmx(): the reported address and up to four PTEs. */
static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4];
static int ept_misconfig_ptenum;
#endif
660 
661 static const char *
662 vmexit_vmx_desc(uint32_t exit_reason)
663 {
664 
665 	if (exit_reason >= nitems(vmx_exit_reason_desc) ||
666 	    vmx_exit_reason_desc[exit_reason] == NULL)
667 		return ("Unknown");
668 	return (vmx_exit_reason_desc[exit_reason]);
669 }
670 
671 static int
672 vmexit_vmx(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun)
673 {
674 	struct vm_exit *vme;
675 
676 	vme = vmrun->vm_exit;
677 
678 	fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu));
679 	fprintf(stderr, "\treason\t\tVMX\n");
680 	fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip);
681 	fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length);
682 	fprintf(stderr, "\tstatus\t\t%d\n", vme->u.vmx.status);
683 	fprintf(stderr, "\texit_reason\t%u (%s)\n", vme->u.vmx.exit_reason,
684 	    vmexit_vmx_desc(vme->u.vmx.exit_reason));
685 	fprintf(stderr, "\tqualification\t0x%016lx\n",
686 	    vme->u.vmx.exit_qualification);
687 	fprintf(stderr, "\tinst_type\t\t%d\n", vme->u.vmx.inst_type);
688 	fprintf(stderr, "\tinst_error\t\t%d\n", vme->u.vmx.inst_error);
689 #ifdef DEBUG_EPT_MISCONFIG
690 	if (vme->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) {
691 		vm_get_register(vcpu,
692 		    VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS),
693 		    &ept_misconfig_gpa);
694 		vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte,
695 		    &ept_misconfig_ptenum);
696 		fprintf(stderr, "\tEPT misconfiguration:\n");
697 		fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa);
698 		fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n",
699 		    ept_misconfig_ptenum, ept_misconfig_pte[0],
700 		    ept_misconfig_pte[1], ept_misconfig_pte[2],
701 		    ept_misconfig_pte[3]);
702 	}
703 #endif	/* DEBUG_EPT_MISCONFIG */
704 	return (VMEXIT_ABORT);
705 }
706 
707 static int
708 vmexit_svm(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun)
709 {
710 	struct vm_exit *vme;
711 
712 	vme = vmrun->vm_exit;
713 
714 	fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu));
715 	fprintf(stderr, "\treason\t\tSVM\n");
716 	fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip);
717 	fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length);
718 	fprintf(stderr, "\texitcode\t%#lx\n", vme->u.svm.exitcode);
719 	fprintf(stderr, "\texitinfo1\t%#lx\n", vme->u.svm.exitinfo1);
720 	fprintf(stderr, "\texitinfo2\t%#lx\n", vme->u.svm.exitinfo2);
721 	return (VMEXIT_ABORT);
722 }
723 
724 static int
725 vmexit_bogus(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
726     struct vm_run *vmrun)
727 {
728 	assert(vmrun->vm_exit->inst_length == 0);
729 
730 	return (VMEXIT_CONTINUE);
731 }
732 
733 static int
734 vmexit_reqidle(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
735     struct vm_run *vmrun)
736 {
737 	assert(vmrun->vm_exit->inst_length == 0);
738 
739 	return (VMEXIT_CONTINUE);
740 }
741 
742 static int
743 vmexit_hlt(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
744     struct vm_run *vmrun __unused)
745 {
746 	/*
747 	 * Just continue execution with the next instruction. We use
748 	 * the HLT VM exit as a way to be friendly with the host
749 	 * scheduler.
750 	 */
751 	return (VMEXIT_CONTINUE);
752 }
753 
754 static int
755 vmexit_pause(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
756     struct vm_run *vmrun __unused)
757 {
758 	return (VMEXIT_CONTINUE);
759 }
760 
761 static int
762 vmexit_mtrap(struct vmctx *ctx __unused, struct vcpu *vcpu,
763     struct vm_run *vmrun)
764 {
765 	assert(vmrun->vm_exit->inst_length == 0);
766 
767 #ifdef BHYVE_SNAPSHOT
768 	checkpoint_cpu_suspend(vcpu_id(vcpu));
769 #endif
770 	gdb_cpu_mtrap(vcpu);
771 #ifdef BHYVE_SNAPSHOT
772 	checkpoint_cpu_resume(vcpu_id(vcpu));
773 #endif
774 
775 	return (VMEXIT_CONTINUE);
776 }
777 
778 static int
779 vmexit_inst_emul(struct vmctx *ctx __unused, struct vcpu *vcpu,
780     struct vm_run *vmrun)
781 {
782 	struct vm_exit *vme;
783 	struct vie *vie;
784 	int err, i, cs_d;
785 	enum vm_cpu_mode mode;
786 
787 	vme = vmrun->vm_exit;
788 
789 	vie = &vme->u.inst_emul.vie;
790 	if (!vie->decoded) {
791 		/*
792 		 * Attempt to decode in userspace as a fallback.  This allows
793 		 * updating instruction decode in bhyve without rebooting the
794 		 * kernel (rapid prototyping), albeit with much slower
795 		 * emulation.
796 		 */
797 		vie_restart(vie);
798 		mode = vme->u.inst_emul.paging.cpu_mode;
799 		cs_d = vme->u.inst_emul.cs_d;
800 		if (vmm_decode_instruction(mode, cs_d, vie) != 0)
801 			goto fail;
802 		if (vm_set_register(vcpu, VM_REG_GUEST_RIP,
803 		    vme->rip + vie->num_processed) != 0)
804 			goto fail;
805 	}
806 
807 	err = emulate_mem(vcpu, vme->u.inst_emul.gpa, vie,
808 	    &vme->u.inst_emul.paging);
809 	if (err) {
810 		if (err == ESRCH) {
811 			EPRINTLN("Unhandled memory access to 0x%lx\n",
812 			    vme->u.inst_emul.gpa);
813 		}
814 		goto fail;
815 	}
816 
817 	return (VMEXIT_CONTINUE);
818 
819 fail:
820 	fprintf(stderr, "Failed to emulate instruction sequence [ ");
821 	for (i = 0; i < vie->num_valid; i++)
822 		fprintf(stderr, "%02x", vie->inst[i]);
823 	FPRINTLN(stderr, " ] at 0x%lx", vme->rip);
824 	return (VMEXIT_ABORT);
825 }
826 
827 static int
828 vmexit_suspend(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun)
829 {
830 	struct vm_exit *vme;
831 	enum vm_suspend_how how;
832 	int vcpuid = vcpu_id(vcpu);
833 
834 	vme = vmrun->vm_exit;
835 
836 	how = vme->u.suspended.how;
837 
838 	fbsdrun_deletecpu(vcpuid);
839 
840 	switch (how) {
841 	case VM_SUSPEND_RESET:
842 		exit(0);
843 	case VM_SUSPEND_POWEROFF:
844 		if (get_config_bool_default("destroy_on_poweroff", false))
845 			vm_destroy(ctx);
846 		exit(1);
847 	case VM_SUSPEND_HALT:
848 		exit(2);
849 	case VM_SUSPEND_TRIPLEFAULT:
850 		exit(3);
851 	default:
852 		fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how);
853 		exit(100);
854 	}
855 	return (0);	/* NOTREACHED */
856 }
857 
858 static int
859 vmexit_debug(struct vmctx *ctx __unused, struct vcpu *vcpu,
860     struct vm_run *vmrun __unused)
861 {
862 
863 #ifdef BHYVE_SNAPSHOT
864 	checkpoint_cpu_suspend(vcpu_id(vcpu));
865 #endif
866 	gdb_cpu_suspend(vcpu);
867 #ifdef BHYVE_SNAPSHOT
868 	checkpoint_cpu_resume(vcpu_id(vcpu));
869 #endif
870 	/*
871 	 * XXX-MJ sleep for a short period to avoid chewing up the CPU in the
872 	 * window between activation of the vCPU thread and the STARTUP IPI.
873 	 */
874 	usleep(1000);
875 	return (VMEXIT_CONTINUE);
876 }
877 
878 static int
879 vmexit_breakpoint(struct vmctx *ctx __unused, struct vcpu *vcpu,
880     struct vm_run *vmrun)
881 {
882 	gdb_cpu_breakpoint(vcpu, vmrun->vm_exit);
883 	return (VMEXIT_CONTINUE);
884 }
885 
886 static int
887 vmexit_ipi(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
888     struct vm_run *vmrun)
889 {
890 	struct vm_exit *vme;
891 	cpuset_t *dmask;
892 	int error = -1;
893 	int i;
894 
895 	dmask = vmrun->cpuset;
896 	vme = vmrun->vm_exit;
897 
898 	switch (vme->u.ipi.mode) {
899 	case APIC_DELMODE_INIT:
900 		CPU_FOREACH_ISSET(i, dmask) {
901 			error = vm_suspend_cpu(vcpu_info[i].vcpu);
902 			if (error) {
903 				warnx("%s: failed to suspend cpu %d\n",
904 				    __func__, i);
905 				break;
906 			}
907 		}
908 		break;
909 	case APIC_DELMODE_STARTUP:
910 		CPU_FOREACH_ISSET(i, dmask) {
911 			spinup_ap(vcpu_info[i].vcpu,
912 			    vme->u.ipi.vector << PAGE_SHIFT);
913 		}
914 		error = 0;
915 		break;
916 	default:
917 		break;
918 	}
919 
920 	return (error);
921 }
922 
/*
 * Dispatch table used by vm_loop(), indexed by VM exit code.  Exit
 * codes without an entry are NULL, which vm_loop() treats as an
 * unexpected exit code.  vmexit_task_switch is not defined in the
 * visible part of this file.
 */
static const vmexit_handler_t handler[VM_EXITCODE_MAX] = {
	[VM_EXITCODE_INOUT]  = vmexit_inout,
	[VM_EXITCODE_INOUT_STR]  = vmexit_inout,
	[VM_EXITCODE_VMX]    = vmexit_vmx,
	[VM_EXITCODE_SVM]    = vmexit_svm,
	[VM_EXITCODE_BOGUS]  = vmexit_bogus,
	[VM_EXITCODE_REQIDLE] = vmexit_reqidle,
	[VM_EXITCODE_RDMSR]  = vmexit_rdmsr,
	[VM_EXITCODE_WRMSR]  = vmexit_wrmsr,
	[VM_EXITCODE_MTRAP]  = vmexit_mtrap,
	[VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
	[VM_EXITCODE_SUSPENDED] = vmexit_suspend,
	[VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch,
	[VM_EXITCODE_DEBUG] = vmexit_debug,
	[VM_EXITCODE_BPT] = vmexit_breakpoint,
	[VM_EXITCODE_IPI] = vmexit_ipi,
	[VM_EXITCODE_HLT] = vmexit_hlt,
	[VM_EXITCODE_PAUSE] = vmexit_pause,
};
942 
943 static void
944 vm_loop(struct vmctx *ctx, struct vcpu *vcpu)
945 {
946 	struct vm_exit vme;
947 	struct vm_run vmrun;
948 	int error, rc;
949 	enum vm_exitcode exitcode;
950 	cpuset_t active_cpus, dmask;
951 
952 	error = vm_active_cpus(ctx, &active_cpus);
953 	assert(CPU_ISSET(vcpu_id(vcpu), &active_cpus));
954 
955 	vmrun.vm_exit = &vme;
956 	vmrun.cpuset = &dmask;
957 	vmrun.cpusetsize = sizeof(dmask);
958 
959 	while (1) {
960 		error = vm_run(vcpu, &vmrun);
961 		if (error != 0)
962 			break;
963 
964 		exitcode = vme.exitcode;
965 		if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) {
966 			fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n",
967 			    exitcode);
968 			exit(4);
969 		}
970 
971 		rc = (*handler[exitcode])(ctx, vcpu, &vmrun);
972 
973 		switch (rc) {
974 		case VMEXIT_CONTINUE:
975 			break;
976 		case VMEXIT_ABORT:
977 			abort();
978 		default:
979 			exit(4);
980 		}
981 	}
982 	fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
983 }
984 
985 static int
986 num_vcpus_allowed(struct vmctx *ctx, struct vcpu *vcpu)
987 {
988 	uint16_t sockets, cores, threads, maxcpus;
989 	int tmp, error;
990 
991 	/*
992 	 * The guest is allowed to spinup more than one processor only if the
993 	 * UNRESTRICTED_GUEST capability is available.
994 	 */
995 	error = vm_get_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, &tmp);
996 	if (error != 0)
997 		return (1);
998 
999 	error = vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus);
1000 	if (error == 0)
1001 		return (maxcpus);
1002 	else
1003 		return (1);
1004 }
1005 
1006 static void
1007 fbsdrun_set_capabilities(struct vcpu *vcpu)
1008 {
1009 	int err, tmp;
1010 
1011 	if (get_config_bool_default("x86.vmexit_on_hlt", false)) {
1012 		err = vm_get_capability(vcpu, VM_CAP_HALT_EXIT, &tmp);
1013 		if (err < 0) {
1014 			fprintf(stderr, "VM exit on HLT not supported\n");
1015 			exit(4);
1016 		}
1017 		vm_set_capability(vcpu, VM_CAP_HALT_EXIT, 1);
1018 	}
1019 
1020 	if (get_config_bool_default("x86.vmexit_on_pause", false)) {
1021 		/*
1022 		 * pause exit support required for this mode
1023 		 */
1024 		err = vm_get_capability(vcpu, VM_CAP_PAUSE_EXIT, &tmp);
1025 		if (err < 0) {
1026 			fprintf(stderr,
1027 			    "SMP mux requested, no pause support\n");
1028 			exit(4);
1029 		}
1030 		vm_set_capability(vcpu, VM_CAP_PAUSE_EXIT, 1);
1031 	}
1032 
1033 	if (get_config_bool_default("x86.x2apic", false))
1034 		err = vm_set_x2apic_state(vcpu, X2APIC_ENABLED);
1035 	else
1036 		err = vm_set_x2apic_state(vcpu, X2APIC_DISABLED);
1037 
1038 	if (err) {
1039 		fprintf(stderr, "Unable to set x2apic state (%d)\n", err);
1040 		exit(4);
1041 	}
1042 
1043 	vm_set_capability(vcpu, VM_CAP_ENABLE_INVPCID, 1);
1044 
1045 	err = vm_set_capability(vcpu, VM_CAP_IPI_EXIT, 1);
1046 	assert(err == 0);
1047 }
1048 
1049 static struct vmctx *
1050 do_open(const char *vmname)
1051 {
1052 	struct vmctx *ctx;
1053 	int error;
1054 	bool reinit, romboot;
1055 
1056 	reinit = romboot = false;
1057 
1058 	if (lpc_bootrom())
1059 		romboot = true;
1060 
1061 	error = vm_create(vmname);
1062 	if (error) {
1063 		if (errno == EEXIST) {
1064 			if (romboot) {
1065 				reinit = true;
1066 			} else {
1067 				/*
1068 				 * The virtual machine has been setup by the
1069 				 * userspace bootloader.
1070 				 */
1071 			}
1072 		} else {
1073 			perror("vm_create");
1074 			exit(4);
1075 		}
1076 	} else {
1077 		if (!romboot) {
1078 			/*
1079 			 * If the virtual machine was just created then a
1080 			 * bootrom must be configured to boot it.
1081 			 */
1082 			fprintf(stderr, "virtual machine cannot be booted\n");
1083 			exit(4);
1084 		}
1085 	}
1086 
1087 	ctx = vm_open(vmname);
1088 	if (ctx == NULL) {
1089 		perror("vm_open");
1090 		exit(4);
1091 	}
1092 
1093 #ifndef WITHOUT_CAPSICUM
1094 	if (vm_limit_rights(ctx) != 0)
1095 		err(EX_OSERR, "vm_limit_rights");
1096 #endif
1097 
1098 	if (reinit) {
1099 		error = vm_reinit(ctx);
1100 		if (error) {
1101 			perror("vm_reinit");
1102 			exit(4);
1103 		}
1104 	}
1105 	error = vm_set_topology(ctx, cpu_sockets, cpu_cores, cpu_threads, 0);
1106 	if (error)
1107 		errx(EX_OSERR, "vm_set_topology");
1108 	return (ctx);
1109 }
1110 
1111 static void
1112 spinup_vcpu(struct vcpu_info *vi, bool bsp)
1113 {
1114 	int error;
1115 
1116 	if (!bsp) {
1117 		fbsdrun_set_capabilities(vi->vcpu);
1118 
1119 		/*
1120 		 * Enable the 'unrestricted guest' mode for APs.
1121 		 *
1122 		 * APs startup in power-on 16-bit mode.
1123 		 */
1124 		error = vm_set_capability(vi->vcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
1125 		assert(error == 0);
1126 	}
1127 
1128 	fbsdrun_addcpu(vi);
1129 }
1130 
/*
 * Parse a single "path=value" option and store it in the configuration.
 *
 * Returns false when 'option' has no '=' or nothing follows the '=';
 * returns true once the value has been handed to set_config_value().
 * Exits with status 4 if the temporary copy of the path cannot be
 * allocated.
 */
static bool
parse_config_option(const char *option)
{
	const char *value;
	char *path;

	value = strchr(option, '=');
	if (value == NULL || value[1] == '\0')
		return (false);

	/* Everything before the '=' is the configuration path. */
	path = strndup(option, value - option);
	if (path == NULL)
		err(4, "Failed to allocate memory");
	set_config_value(path, value + 1);

	/*
	 * The copy is only needed for the duration of the call above;
	 * release it instead of leaking one allocation per option.
	 * NOTE(review): assumes set_config_value() does not retain the
	 * 'path' pointer it is given (it takes it as const) -- confirm.
	 */
	free(path);
	return (true);
}
1146 
/*
 * Load configuration options from the file at 'path'.
 *
 * Every line is passed to parse_config_option() after its trailing newline
 * has been stripped.  Lines whose first character is '#' and empty lines
 * are skipped (they still count towards the reported line number).
 *
 * Exits with status 4 if the file cannot be opened, if reading it fails, or
 * if a line is rejected by parse_config_option().
 */
static void
parse_simple_config_file(const char *path)
{
	FILE *fp;
	char *line, *cp;
	size_t linecap;
	unsigned int lineno;

	fp = fopen(path, "r");
	if (fp == NULL)
		err(4, "Failed to open configuration file %s", path);

	/* Let getline() allocate and grow the line buffer. */
	line = NULL;
	linecap = 0;
	for (lineno = 1; getline(&line, &linecap, fp) > 0; lineno++) {
		if (*line == '#' || *line == '\n')
			continue;
		cp = strchr(line, '\n');
		if (cp != NULL)
			*cp = '\0';
		if (!parse_config_option(line))
			errx(4, "%s line %u: invalid config option '%s'", path,
			    lineno, line);
	}

	/*
	 * getline() stops on end-of-file and on read errors alike; make
	 * sure a truncated read is not mistaken for a complete file.
	 */
	if (ferror(fp))
		err(4, "Failed to read configuration file %s", path);

	free(line);
	fclose(fp);
}
1174 
/*
 * Parse the argument of the -G option: "[w][address:]port".
 *
 * A leading 'w' sets "gdb.wait" to true.  If the remainder contains a ':',
 * the text before the last ':' is stored as "gdb.address" and the text
 * after it as "gdb.port"; otherwise the whole remainder is the port.
 *
 * NOTE(review): when an address is present the ':' inside the caller's
 * string is overwritten with a NUL, even though 'opt' is declared const.
 */
static void
parse_gdb_options(const char *opt)
{
	const char *port;
	char *sep;

	if (*opt == 'w') {
		set_config_bool("gdb.wait", true);
		opt++;
	}

	port = opt;
	sep = strrchr(opt, ':');
	if (sep != NULL) {
		/* Split "address:port" in place at the last colon. */
		*sep = '\0';
		port = sep + 1;
		set_config_value("gdb.address", opt);
	}

	set_config_value("gdb.port", port);
}
1198 
/*
 * Seed the configuration with the built-in defaults.  main() calls this
 * before parsing the command line, so options given there are applied
 * after these values.
 */
static void
set_defaults(void)
{
	/* ACPI tables are off unless requested. */
	set_config_bool("acpi_tables", false);
	/* Default guest memory size. */
	set_config_value("memory.size", "256M");
	/* Strict MSR handling is on by default. */
	set_config_bool("x86.strictmsr", true);
	/* Default fwcfg flavour. */
	set_config_value("lpc.fwcfg", "bhyve");
}
1208 
1209 int
1210 main(int argc, char *argv[])
1211 {
1212 	int c, error;
1213 	int max_vcpus, memflags;
1214 	struct vcpu *bsp;
1215 	struct vmctx *ctx;
1216 	struct qemu_fwcfg_item *e820_fwcfg_item;
1217 	size_t memsize;
1218 	const char *optstr, *value, *vmname;
1219 #ifdef BHYVE_SNAPSHOT
1220 	char *restore_file;
1221 	struct restore_state rstate;
1222 
1223 	restore_file = NULL;
1224 #endif
1225 
1226 	init_config();
1227 	set_defaults();
1228 	progname = basename(argv[0]);
1229 
1230 #ifdef BHYVE_SNAPSHOT
1231 	optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:r:";
1232 #else
1233 	optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:";
1234 #endif
1235 	while ((c = getopt(argc, argv, optstr)) != -1) {
1236 		switch (c) {
1237 		case 'a':
1238 			set_config_bool("x86.x2apic", false);
1239 			break;
1240 		case 'A':
1241 			set_config_bool("acpi_tables", true);
1242 			break;
1243 		case 'D':
1244 			set_config_bool("destroy_on_poweroff", true);
1245 			break;
1246 		case 'p':
1247 			if (pincpu_parse(optarg) != 0) {
1248 				errx(EX_USAGE, "invalid vcpu pinning "
1249 				    "configuration '%s'", optarg);
1250 			}
1251 			break;
1252 		case 'c':
1253 			if (topology_parse(optarg) != 0) {
1254 			    errx(EX_USAGE, "invalid cpu topology "
1255 				"'%s'", optarg);
1256 			}
1257 			break;
1258 		case 'C':
1259 			set_config_bool("memory.guest_in_core", true);
1260 			break;
1261 		case 'f':
1262 			if (qemu_fwcfg_parse_cmdline_arg(optarg) != 0) {
1263 			    errx(EX_USAGE, "invalid fwcfg item '%s'", optarg);
1264 			}
1265 			break;
1266 		case 'G':
1267 			parse_gdb_options(optarg);
1268 			break;
1269 		case 'k':
1270 			parse_simple_config_file(optarg);
1271 			break;
1272 		case 'K':
1273 			set_config_value("keyboard.layout", optarg);
1274 			break;
1275 		case 'l':
1276 			if (strncmp(optarg, "help", strlen(optarg)) == 0) {
1277 				lpc_print_supported_devices();
1278 				exit(0);
1279 			} else if (lpc_device_parse(optarg) != 0) {
1280 				errx(EX_USAGE, "invalid lpc device "
1281 				    "configuration '%s'", optarg);
1282 			}
1283 			break;
1284 #ifdef BHYVE_SNAPSHOT
1285 		case 'r':
1286 			restore_file = optarg;
1287 			break;
1288 #endif
1289 		case 's':
1290 			if (strncmp(optarg, "help", strlen(optarg)) == 0) {
1291 				pci_print_supported_devices();
1292 				exit(0);
1293 			} else if (pci_parse_slot(optarg) != 0)
1294 				exit(4);
1295 			else
1296 				break;
1297 		case 'S':
1298 			set_config_bool("memory.wired", true);
1299 			break;
1300 		case 'm':
1301 			set_config_value("memory.size", optarg);
1302 			break;
1303 		case 'o':
1304 			if (!parse_config_option(optarg))
1305 				errx(EX_USAGE, "invalid configuration option '%s'", optarg);
1306 			break;
1307 		case 'H':
1308 			set_config_bool("x86.vmexit_on_hlt", true);
1309 			break;
1310 		case 'I':
1311 			/*
1312 			 * The "-I" option was used to add an ioapic to the
1313 			 * virtual machine.
1314 			 *
1315 			 * An ioapic is now provided unconditionally for each
1316 			 * virtual machine and this option is now deprecated.
1317 			 */
1318 			break;
1319 		case 'P':
1320 			set_config_bool("x86.vmexit_on_pause", true);
1321 			break;
1322 		case 'e':
1323 			set_config_bool("x86.strictio", true);
1324 			break;
1325 		case 'u':
1326 			set_config_bool("rtc.use_localtime", false);
1327 			break;
1328 		case 'U':
1329 			set_config_value("uuid", optarg);
1330 			break;
1331 		case 'w':
1332 			set_config_bool("x86.strictmsr", false);
1333 			break;
1334 		case 'W':
1335 			set_config_bool("virtio_msix", false);
1336 			break;
1337 		case 'x':
1338 			set_config_bool("x86.x2apic", true);
1339 			break;
1340 		case 'Y':
1341 			set_config_bool("x86.mptable", false);
1342 			break;
1343 		case 'h':
1344 			usage(0);
1345 		default:
1346 			usage(1);
1347 		}
1348 	}
1349 	argc -= optind;
1350 	argv += optind;
1351 
1352 	if (argc > 1)
1353 		usage(1);
1354 
1355 #ifdef BHYVE_SNAPSHOT
1356 	if (restore_file != NULL) {
1357 		error = load_restore_file(restore_file, &rstate);
1358 		if (error) {
1359 			fprintf(stderr, "Failed to read checkpoint info from "
1360 					"file: '%s'.\n", restore_file);
1361 			exit(1);
1362 		}
1363 		vmname = lookup_vmname(&rstate);
1364 		if (vmname != NULL)
1365 			set_config_value("name", vmname);
1366 	}
1367 #endif
1368 
1369 	if (argc == 1)
1370 		set_config_value("name", argv[0]);
1371 
1372 	vmname = get_config_value("name");
1373 	if (vmname == NULL)
1374 		usage(1);
1375 
1376 	if (get_config_bool_default("config.dump", false)) {
1377 		dump_config();
1378 		exit(1);
1379 	}
1380 
1381 	calc_topology();
1382 	build_vcpumaps();
1383 
1384 	value = get_config_value("memory.size");
1385 	error = vm_parse_memsize(value, &memsize);
1386 	if (error)
1387 		errx(EX_USAGE, "invalid memsize '%s'", value);
1388 
1389 	ctx = do_open(vmname);
1390 
1391 #ifdef BHYVE_SNAPSHOT
1392 	if (restore_file != NULL) {
1393 		guest_ncpus = lookup_guest_ncpus(&rstate);
1394 		memflags = lookup_memflags(&rstate);
1395 		memsize = lookup_memsize(&rstate);
1396 	}
1397 
1398 	if (guest_ncpus < 1) {
1399 		fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus);
1400 		exit(1);
1401 	}
1402 #endif
1403 
1404 	bsp = vm_vcpu_open(ctx, BSP);
1405 	max_vcpus = num_vcpus_allowed(ctx, bsp);
1406 	if (guest_ncpus > max_vcpus) {
1407 		fprintf(stderr, "%d vCPUs requested but only %d available\n",
1408 			guest_ncpus, max_vcpus);
1409 		exit(4);
1410 	}
1411 
1412 	fbsdrun_set_capabilities(bsp);
1413 
1414 	/* Allocate per-VCPU resources. */
1415 	vcpu_info = calloc(guest_ncpus, sizeof(*vcpu_info));
1416 	for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) {
1417 		vcpu_info[vcpuid].ctx = ctx;
1418 		vcpu_info[vcpuid].vcpuid = vcpuid;
1419 		if (vcpuid == BSP)
1420 			vcpu_info[vcpuid].vcpu = bsp;
1421 		else
1422 			vcpu_info[vcpuid].vcpu = vm_vcpu_open(ctx, vcpuid);
1423 	}
1424 
1425 	memflags = 0;
1426 	if (get_config_bool_default("memory.wired", false))
1427 		memflags |= VM_MEM_F_WIRED;
1428 	if (get_config_bool_default("memory.guest_in_core", false))
1429 		memflags |= VM_MEM_F_INCORE;
1430 	vm_set_memflags(ctx, memflags);
1431 	error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
1432 	if (error) {
1433 		fprintf(stderr, "Unable to setup memory (%d)\n", errno);
1434 		exit(4);
1435 	}
1436 
1437 	error = init_msr();
1438 	if (error) {
1439 		fprintf(stderr, "init_msr error %d", error);
1440 		exit(4);
1441 	}
1442 
1443 	init_mem(guest_ncpus);
1444 	init_inout();
1445 	kernemu_dev_init();
1446 	init_bootrom(ctx);
1447 	atkbdc_init(ctx);
1448 	pci_irq_init(ctx);
1449 	ioapic_init(ctx);
1450 
1451 	rtc_init(ctx);
1452 	sci_init(ctx);
1453 
1454 	if (qemu_fwcfg_init(ctx) != 0) {
1455 		fprintf(stderr, "qemu fwcfg initialization error");
1456 		exit(4);
1457 	}
1458 
1459 	if (qemu_fwcfg_add_file("opt/bhyve/hw.ncpu", sizeof(guest_ncpus),
1460 	    &guest_ncpus) != 0) {
1461 		fprintf(stderr, "Could not add qemu fwcfg opt/bhyve/hw.ncpu");
1462 		exit(4);
1463 	}
1464 
1465 	if (e820_init(ctx) != 0) {
1466 		fprintf(stderr, "Unable to setup E820");
1467 		exit(4);
1468 	}
1469 
1470 	/*
1471 	 * Exit if a device emulation finds an error in its initialization
1472 	 */
1473 	if (init_pci(ctx) != 0) {
1474 		perror("device emulation initialization error");
1475 		exit(4);
1476 	}
1477 
1478 	/*
1479 	 * Initialize after PCI, to allow a bootrom file to reserve the high
1480 	 * region.
1481 	 */
1482 	if (get_config_bool("acpi_tables"))
1483 		vmgenc_init(ctx);
1484 
1485 	init_gdb(ctx);
1486 
1487 	if (lpc_bootrom()) {
1488 		if (vm_set_capability(bsp, VM_CAP_UNRESTRICTED_GUEST, 1)) {
1489 			fprintf(stderr, "ROM boot failed: unrestricted guest "
1490 			    "capability not available\n");
1491 			exit(4);
1492 		}
1493 		error = vcpu_reset(bsp);
1494 		assert(error == 0);
1495 	}
1496 
1497 	/*
1498 	 * Add all vCPUs.
1499 	 */
1500 	for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++)
1501 		spinup_vcpu(&vcpu_info[vcpuid], vcpuid == BSP);
1502 
1503 #ifdef BHYVE_SNAPSHOT
1504 	if (restore_file != NULL) {
1505 		fprintf(stdout, "Pausing pci devs...\r\n");
1506 		if (vm_pause_devices() != 0) {
1507 			fprintf(stderr, "Failed to pause PCI device state.\n");
1508 			exit(1);
1509 		}
1510 
1511 		fprintf(stdout, "Restoring vm mem...\r\n");
1512 		if (restore_vm_mem(ctx, &rstate) != 0) {
1513 			fprintf(stderr, "Failed to restore VM memory.\n");
1514 			exit(1);
1515 		}
1516 
1517 		fprintf(stdout, "Restoring pci devs...\r\n");
1518 		if (vm_restore_devices(&rstate) != 0) {
1519 			fprintf(stderr, "Failed to restore PCI device state.\n");
1520 			exit(1);
1521 		}
1522 
1523 		fprintf(stdout, "Restoring kernel structs...\r\n");
1524 		if (vm_restore_kern_structs(ctx, &rstate) != 0) {
1525 			fprintf(stderr, "Failed to restore kernel structs.\n");
1526 			exit(1);
1527 		}
1528 
1529 		fprintf(stdout, "Resuming pci devs...\r\n");
1530 		if (vm_resume_devices() != 0) {
1531 			fprintf(stderr, "Failed to resume PCI device state.\n");
1532 			exit(1);
1533 		}
1534 	}
1535 #endif
1536 
1537 	/*
1538 	 * build the guest tables, MP etc.
1539 	 */
1540 	if (get_config_bool_default("x86.mptable", true)) {
1541 		error = mptable_build(ctx, guest_ncpus);
1542 		if (error) {
1543 			perror("error to build the guest tables");
1544 			exit(4);
1545 		}
1546 	}
1547 
1548 	error = smbios_build(ctx);
1549 	if (error != 0)
1550 		exit(4);
1551 
1552 	if (get_config_bool("acpi_tables")) {
1553 		error = acpi_build(ctx, guest_ncpus);
1554 		assert(error == 0);
1555 	}
1556 
1557 	e820_fwcfg_item = e820_get_fwcfg_item();
1558 	if (e820_fwcfg_item == NULL) {
1559 	    fprintf(stderr, "invalid e820 table");
1560 		exit(4);
1561 	}
1562 	if (qemu_fwcfg_add_file("etc/e820", e820_fwcfg_item->size,
1563 		e820_fwcfg_item->data) != 0) {
1564 		fprintf(stderr, "could not add qemu fwcfg etc/e820");
1565 		exit(4);
1566 	}
1567 	free(e820_fwcfg_item);
1568 
1569 	if (lpc_bootrom() && strcmp(lpc_fwcfg(), "bhyve") == 0) {
1570 		fwctl_init();
1571 	}
1572 
1573 	/*
1574 	 * Change the proc title to include the VM name.
1575 	 */
1576 	setproctitle("%s", vmname);
1577 
1578 #ifdef BHYVE_SNAPSHOT
1579 	/* initialize mutex/cond variables */
1580 	init_snapshot();
1581 
1582 	/*
1583 	 * checkpointing thread for communication with bhyvectl
1584 	 */
1585 	if (init_checkpoint_thread(ctx) != 0)
1586 		errx(EX_OSERR, "Failed to start checkpoint thread");
1587 #endif
1588 
1589 #ifndef WITHOUT_CAPSICUM
1590 	caph_cache_catpages();
1591 
1592 	if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1)
1593 		errx(EX_OSERR, "Unable to apply rights for sandbox");
1594 
1595 	if (caph_enter() == -1)
1596 		errx(EX_OSERR, "cap_enter() failed");
1597 #endif
1598 
1599 #ifdef BHYVE_SNAPSHOT
1600 	if (restore_file != NULL) {
1601 		destroy_restore_state(&rstate);
1602 		if (vm_restore_time(ctx) < 0)
1603 			err(EX_OSERR, "Unable to restore time");
1604 
1605 		for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++)
1606 			vm_resume_cpu(vcpu_info[vcpuid].vcpu);
1607 	} else
1608 #endif
1609 		vm_resume_cpu(bsp);
1610 
1611 	/*
1612 	 * Head off to the main event dispatch loop
1613 	 */
1614 	mevent_dispatch();
1615 
1616 	exit(4);
1617 }
1618