xref: /freebsd/usr.sbin/bhyveload/bhyveload.c (revision 1edb7116)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2011 Google, Inc.
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 #include <sys/ioctl.h>
57 #include <sys/stat.h>
58 #include <sys/disk.h>
59 #include <sys/queue.h>
60 
61 #include <machine/specialreg.h>
62 #include <machine/vmm.h>
63 
64 #include <assert.h>
65 #include <dirent.h>
66 #include <dlfcn.h>
67 #include <errno.h>
68 #include <err.h>
69 #include <fcntl.h>
70 #include <getopt.h>
71 #include <libgen.h>
72 #include <limits.h>
73 #include <setjmp.h>
74 #include <stdio.h>
75 #include <stdlib.h>
76 #include <string.h>
77 #include <sysexits.h>
78 #include <termios.h>
79 #include <unistd.h>
80 
81 #include <capsicum_helpers.h>
82 #include <vmmapi.h>
83 
84 #include "userboot.h"
85 
/* Size units, expressed in bytes. */
#define	MB	(1UL << 20)
#define	GB	(1UL << 30)

/* vCPU id of the bootstrap processor; the only vCPU opened by main(). */
#define	BSP	0

/* Capacity of the disk_fd[] table, i.e. the maximum number of -d options. */
#define	NDISKS	32
91 
/*
 * Values handed to longjmp(3) when a callback needs main() to drop the
 * current loader and start over.  main() only checks that the setjmp(3)
 * result is non-zero, so the individual values are informational for now.
 */
enum {
	/* 0 is reserved: it is what a direct call to setjmp(3) returns. */
	JMP_SWAPLOADER = 1,
	JMP_REBOOT,
};
101 
102 static struct termios term, oldterm;
103 static int disk_fd[NDISKS];
104 static int ndisks;
105 static int consin_fd, consout_fd;
106 static int hostbase_fd = -1;
107 
108 static void *loader_hdl;
109 static char *loader;
110 static int explicit_loader_fd = -1;
111 static jmp_buf jb;
112 
113 static char *vmname, *progname;
114 static struct vmctx *ctx;
115 static struct vcpu *vcpu;
116 
117 static uint64_t gdtbase, cr3, rsp;
118 
119 static void cb_exit(void *arg, int v);
120 
121 /*
122  * Console i/o callbacks
123  */
124 
125 static void
126 cb_putc(void *arg __unused, int ch)
127 {
128 	char c = ch;
129 
130 	(void) write(consout_fd, &c, 1);
131 }
132 
133 static int
134 cb_getc(void *arg __unused)
135 {
136 	char c;
137 
138 	if (read(consin_fd, &c, 1) == 1)
139 		return (c);
140 	return (-1);
141 }
142 
143 static int
144 cb_poll(void *arg __unused)
145 {
146 	int n;
147 
148 	if (ioctl(consin_fd, FIONREAD, &n) >= 0)
149 		return (n > 0);
150 	return (0);
151 }
152 
153 /*
154  * Host filesystem i/o callbacks
155  */
156 
/*
 * Handle produced by cb_open() and consumed by the other host filesystem
 * callbacks.  Exactly one member of cf_u is valid, selected by cf_isdir.
 */
struct cb_file {
	int cf_isdir;		/* 1 for a directory, 0 for a regular file */
	size_t cf_size;		/* st_size as captured by cb_open() */
	struct stat cf_stat;	/* stat data as captured by cb_open() */
	union {
		int fd;		/* regular file: descriptor */
		DIR *dir;	/* directory: stream from fdopendir(3) */
	} cf_u;
};
166 
167 static int
168 cb_open(void *arg __unused, const char *filename, void **hp)
169 {
170 	struct cb_file *cf;
171 	struct stat sb;
172 	int fd, flags;
173 
174 	cf = NULL;
175 	fd = -1;
176 	flags = O_RDONLY | O_RESOLVE_BENEATH;
177 	if (hostbase_fd == -1)
178 		return (ENOENT);
179 
180 	/* Absolute paths are relative to our hostbase, chop off leading /. */
181 	if (filename[0] == '/')
182 		filename++;
183 
184 	/* Lookup of /, use . instead. */
185 	if (filename[0] == '\0')
186 		filename = ".";
187 
188 	if (fstatat(hostbase_fd, filename, &sb, AT_RESOLVE_BENEATH) < 0)
189 		return (errno);
190 
191 	if (!S_ISDIR(sb.st_mode) && !S_ISREG(sb.st_mode))
192 		return (EINVAL);
193 
194 	if (S_ISDIR(sb.st_mode))
195 		flags |= O_DIRECTORY;
196 
197 	/* May be opening the root dir */
198 	fd = openat(hostbase_fd, filename, flags);
199 	if (fd < 0)
200 		return (errno);
201 
202 	cf = malloc(sizeof(struct cb_file));
203 	if (cf == NULL) {
204 		close(fd);
205 		return (ENOMEM);
206 	}
207 
208 	cf->cf_stat = sb;
209 	cf->cf_size = cf->cf_stat.st_size;
210 
211 	if (S_ISDIR(cf->cf_stat.st_mode)) {
212 		cf->cf_isdir = 1;
213 		cf->cf_u.dir = fdopendir(fd);
214 		if (cf->cf_u.dir == NULL) {
215 			close(fd);
216 			free(cf);
217 			return (ENOMEM);
218 		}
219 	} else {
220 		assert(S_ISREG(cf->cf_stat.st_mode));
221 		cf->cf_isdir = 0;
222 		cf->cf_u.fd = fd;
223 	}
224 	*hp = cf;
225 	return (0);
226 }
227 
228 static int
229 cb_close(void *arg __unused, void *h)
230 {
231 	struct cb_file *cf = h;
232 
233 	if (cf->cf_isdir)
234 		closedir(cf->cf_u.dir);
235 	else
236 		close(cf->cf_u.fd);
237 	free(cf);
238 
239 	return (0);
240 }
241 
242 static int
243 cb_isdir(void *arg __unused, void *h)
244 {
245 	struct cb_file *cf = h;
246 
247 	return (cf->cf_isdir);
248 }
249 
250 static int
251 cb_read(void *arg __unused, void *h, void *buf, size_t size, size_t *resid)
252 {
253 	struct cb_file *cf = h;
254 	ssize_t sz;
255 
256 	if (cf->cf_isdir)
257 		return (EINVAL);
258 	sz = read(cf->cf_u.fd, buf, size);
259 	if (sz < 0)
260 		return (EINVAL);
261 	*resid = size - sz;
262 	return (0);
263 }
264 
265 static int
266 cb_readdir(void *arg __unused, void *h, uint32_t *fileno_return,
267     uint8_t *type_return, size_t *namelen_return, char *name)
268 {
269 	struct cb_file *cf = h;
270 	struct dirent *dp;
271 
272 	if (!cf->cf_isdir)
273 		return (EINVAL);
274 
275 	dp = readdir(cf->cf_u.dir);
276 	if (!dp)
277 		return (ENOENT);
278 
279 	/*
280 	 * Note: d_namlen is in the range 0..255 and therefore less
281 	 * than PATH_MAX so we don't need to test before copying.
282 	 */
283 	*fileno_return = dp->d_fileno;
284 	*type_return = dp->d_type;
285 	*namelen_return = dp->d_namlen;
286 	memcpy(name, dp->d_name, dp->d_namlen);
287 	name[dp->d_namlen] = 0;
288 
289 	return (0);
290 }
291 
292 static int
293 cb_seek(void *arg __unused, void *h, uint64_t offset, int whence)
294 {
295 	struct cb_file *cf = h;
296 
297 	if (cf->cf_isdir)
298 		return (EINVAL);
299 	if (lseek(cf->cf_u.fd, offset, whence) < 0)
300 		return (errno);
301 	return (0);
302 }
303 
304 static int
305 cb_stat(void *arg __unused, void *h, struct stat *sbp)
306 {
307 	struct cb_file *cf = h;
308 
309 	memset(sbp, 0, sizeof(struct stat));
310 	sbp->st_mode = cf->cf_stat.st_mode;
311 	sbp->st_uid = cf->cf_stat.st_uid;
312 	sbp->st_gid = cf->cf_stat.st_gid;
313 	sbp->st_size = cf->cf_stat.st_size;
314 	sbp->st_mtime = cf->cf_stat.st_mtime;
315 	sbp->st_dev = cf->cf_stat.st_dev;
316 	sbp->st_ino = cf->cf_stat.st_ino;
317 
318 	return (0);
319 }
320 
321 /*
322  * Disk image i/o callbacks
323  */
324 
325 static int
326 cb_diskread(void *arg __unused, int unit, uint64_t from, void *to, size_t size,
327     size_t *resid)
328 {
329 	ssize_t n;
330 
331 	if (unit < 0 || unit >= ndisks)
332 		return (EIO);
333 	n = pread(disk_fd[unit], to, size, from);
334 	if (n < 0)
335 		return (errno);
336 	*resid = size - n;
337 	return (0);
338 }
339 
340 static int
341 cb_diskwrite(void *arg __unused, int unit, uint64_t offset, void *src,
342     size_t size, size_t *resid)
343 {
344 	ssize_t n;
345 
346 	if (unit < 0 || unit >= ndisks)
347 		return (EIO);
348 	n = pwrite(disk_fd[unit], src, size, offset);
349 	if (n < 0)
350 		return (errno);
351 	*resid = size - n;
352 	return (0);
353 }
354 
355 static int
356 cb_diskioctl(void *arg __unused, int unit, u_long cmd, void *data)
357 {
358 	struct stat sb;
359 
360 	if (unit < 0 || unit >= ndisks)
361 		return (EBADF);
362 
363 	switch (cmd) {
364 	case DIOCGSECTORSIZE:
365 		*(u_int *)data = 512;
366 		break;
367 	case DIOCGMEDIASIZE:
368 		if (fstat(disk_fd[unit], &sb) != 0)
369 			return (ENOTTY);
370 		if (S_ISCHR(sb.st_mode) &&
371 		    ioctl(disk_fd[unit], DIOCGMEDIASIZE, &sb.st_size) != 0)
372 				return (ENOTTY);
373 		*(off_t *)data = sb.st_size;
374 		break;
375 	default:
376 		return (ENOTTY);
377 	}
378 
379 	return (0);
380 }
381 
382 /*
383  * Guest virtual machine i/o callbacks
384  */
385 static int
386 cb_copyin(void *arg __unused, const void *from, uint64_t to, size_t size)
387 {
388 	char *ptr;
389 
390 	to &= 0x7fffffff;
391 
392 	ptr = vm_map_gpa(ctx, to, size);
393 	if (ptr == NULL)
394 		return (EFAULT);
395 
396 	memcpy(ptr, from, size);
397 	return (0);
398 }
399 
400 static int
401 cb_copyout(void *arg __unused, uint64_t from, void *to, size_t size)
402 {
403 	char *ptr;
404 
405 	from &= 0x7fffffff;
406 
407 	ptr = vm_map_gpa(ctx, from, size);
408 	if (ptr == NULL)
409 		return (EFAULT);
410 
411 	memcpy(to, ptr, size);
412 	return (0);
413 }
414 
415 static void
416 cb_setreg(void *arg __unused, int r, uint64_t v)
417 {
418 	int error;
419 	enum vm_reg_name vmreg;
420 
421 	vmreg = VM_REG_LAST;
422 
423 	switch (r) {
424 	case 4:
425 		vmreg = VM_REG_GUEST_RSP;
426 		rsp = v;
427 		break;
428 	default:
429 		break;
430 	}
431 
432 	if (vmreg == VM_REG_LAST) {
433 		printf("test_setreg(%d): not implemented\n", r);
434 		cb_exit(NULL, USERBOOT_EXIT_QUIT);
435 	}
436 
437 	error = vm_set_register(vcpu, vmreg, v);
438 	if (error) {
439 		perror("vm_set_register");
440 		cb_exit(NULL, USERBOOT_EXIT_QUIT);
441 	}
442 }
443 
444 static void
445 cb_setmsr(void *arg __unused, int r, uint64_t v)
446 {
447 	int error;
448 	enum vm_reg_name vmreg;
449 
450 	vmreg = VM_REG_LAST;
451 
452 	switch (r) {
453 	case MSR_EFER:
454 		vmreg = VM_REG_GUEST_EFER;
455 		break;
456 	default:
457 		break;
458 	}
459 
460 	if (vmreg == VM_REG_LAST) {
461 		printf("test_setmsr(%d): not implemented\n", r);
462 		cb_exit(NULL, USERBOOT_EXIT_QUIT);
463 	}
464 
465 	error = vm_set_register(vcpu, vmreg, v);
466 	if (error) {
467 		perror("vm_set_msr");
468 		cb_exit(NULL, USERBOOT_EXIT_QUIT);
469 	}
470 }
471 
472 static void
473 cb_setcr(void *arg __unused, int r, uint64_t v)
474 {
475 	int error;
476 	enum vm_reg_name vmreg;
477 
478 	vmreg = VM_REG_LAST;
479 
480 	switch (r) {
481 	case 0:
482 		vmreg = VM_REG_GUEST_CR0;
483 		break;
484 	case 3:
485 		vmreg = VM_REG_GUEST_CR3;
486 		cr3 = v;
487 		break;
488 	case 4:
489 		vmreg = VM_REG_GUEST_CR4;
490 		break;
491 	default:
492 		break;
493 	}
494 
495 	if (vmreg == VM_REG_LAST) {
496 		printf("test_setcr(%d): not implemented\n", r);
497 		cb_exit(NULL, USERBOOT_EXIT_QUIT);
498 	}
499 
500 	error = vm_set_register(vcpu, vmreg, v);
501 	if (error) {
502 		perror("vm_set_cr");
503 		cb_exit(NULL, USERBOOT_EXIT_QUIT);
504 	}
505 }
506 
507 static void
508 cb_setgdt(void *arg __unused, uint64_t base, size_t size)
509 {
510 	int error;
511 
512 	error = vm_set_desc(vcpu, VM_REG_GUEST_GDTR, base, size - 1, 0);
513 	if (error != 0) {
514 		perror("vm_set_desc(gdt)");
515 		cb_exit(NULL, USERBOOT_EXIT_QUIT);
516 	}
517 
518 	gdtbase = base;
519 }
520 
521 static void
522 cb_exec(void *arg __unused, uint64_t rip)
523 {
524 	int error;
525 
526 	if (cr3 == 0)
527 		error = vm_setup_freebsd_registers_i386(vcpu, rip, gdtbase,
528 		    rsp);
529 	else
530 		error = vm_setup_freebsd_registers(vcpu, rip, cr3, gdtbase,
531 		    rsp);
532 	if (error) {
533 		perror("vm_setup_freebsd_registers");
534 		cb_exit(NULL, USERBOOT_EXIT_QUIT);
535 	}
536 
537 	cb_exit(NULL, 0);
538 }
539 
540 /*
541  * Misc
542  */
543 
544 static void
545 cb_delay(void *arg __unused, int usec)
546 {
547 
548 	usleep(usec);
549 }
550 
551 static void
552 cb_exit(void *arg __unused, int v)
553 {
554 
555 	tcsetattr(consout_fd, TCSAFLUSH, &oldterm);
556 	if (v == USERBOOT_EXIT_REBOOT)
557 		longjmp(jb, JMP_REBOOT);
558 	exit(v);
559 }
560 
561 static void
562 cb_getmem(void *arg __unused, uint64_t *ret_lowmem, uint64_t *ret_highmem)
563 {
564 
565 	*ret_lowmem = vm_get_lowmem_size(ctx);
566 	*ret_highmem = vm_get_highmem_size(ctx);
567 }
568 
/* One loader environment variable, kept on a singly-linked list. */
struct env {
	char *str;		/* "name=value", heap allocated */
	SLIST_ENTRY(env) next;	/* list linkage */
};

/* All environment entries; addenv() inserts at the head. */
static SLIST_HEAD(envhead, env) envhead;
575 
576 static void
577 addenv(const char *str)
578 {
579 	struct env *env;
580 
581 	env = malloc(sizeof(struct env));
582 	if (env == NULL)
583 		err(EX_OSERR, "malloc");
584 	env->str = strdup(str);
585 	if (env->str == NULL)
586 		err(EX_OSERR, "strdup");
587 	SLIST_INSERT_HEAD(&envhead, env, next);
588 }
589 
590 static char *
591 cb_getenv(void *arg __unused, int num)
592 {
593 	int i;
594 	struct env *env;
595 
596 	i = 0;
597 	SLIST_FOREACH(env, &envhead, next) {
598 		if (i == num)
599 			return (env->str);
600 		i++;
601 	}
602 
603 	return (NULL);
604 }
605 
606 static int
607 cb_vm_set_register(void *arg __unused, int vcpuid, int reg, uint64_t val)
608 {
609 
610 	assert(vcpuid == BSP);
611 	return (vm_set_register(vcpu, reg, val));
612 }
613 
614 static int
615 cb_vm_set_desc(void *arg __unused, int vcpuid, int reg, uint64_t base,
616     u_int limit, u_int access)
617 {
618 
619 	assert(vcpuid == BSP);
620 	return (vm_set_desc(vcpu, reg, base, limit, access));
621 }
622 
623 static void
624 cb_swap_interpreter(void *arg __unused, const char *interp_req)
625 {
626 
627 	/*
628 	 * If the user specified a loader but we detected a mismatch, we should
629 	 * not try to pivot to a different loader on them.
630 	 */
631 	free(loader);
632 	if (explicit_loader_fd != -1) {
633 		perror("requested loader interpreter does not match guest userboot");
634 		cb_exit(NULL, 1);
635 	}
636 	if (interp_req == NULL || *interp_req == '\0') {
637 		perror("guest failed to request an interpreter");
638 		cb_exit(NULL, 1);
639 	}
640 
641 	if (asprintf(&loader, "userboot_%s.so", interp_req) == -1)
642 		err(EX_OSERR, "malloc");
643 	longjmp(jb, JMP_SWAPLOADER);
644 }
645 
646 static struct loader_callbacks cb = {
647 	.getc = cb_getc,
648 	.putc = cb_putc,
649 	.poll = cb_poll,
650 
651 	.open = cb_open,
652 	.close = cb_close,
653 	.isdir = cb_isdir,
654 	.read = cb_read,
655 	.readdir = cb_readdir,
656 	.seek = cb_seek,
657 	.stat = cb_stat,
658 
659 	.diskread = cb_diskread,
660 	.diskwrite = cb_diskwrite,
661 	.diskioctl = cb_diskioctl,
662 
663 	.copyin = cb_copyin,
664 	.copyout = cb_copyout,
665 	.setreg = cb_setreg,
666 	.setmsr = cb_setmsr,
667 	.setcr = cb_setcr,
668 	.setgdt = cb_setgdt,
669 	.exec = cb_exec,
670 
671 	.delay = cb_delay,
672 	.exit = cb_exit,
673 	.getmem = cb_getmem,
674 
675 	.getenv = cb_getenv,
676 
677 	/* Version 4 additions */
678 	.vm_set_register = cb_vm_set_register,
679 	.vm_set_desc = cb_vm_set_desc,
680 
681 	/* Version 5 additions */
682 	.swap_interpreter = cb_swap_interpreter,
683 };
684 
685 static int
686 altcons_open(char *path)
687 {
688 	struct stat sb;
689 	int err;
690 	int fd;
691 
692 	/*
693 	 * Allow stdio to be passed in so that the same string
694 	 * can be used for the bhyveload console and bhyve com-port
695 	 * parameters
696 	 */
697 	if (!strcmp(path, "stdio"))
698 		return (0);
699 
700 	err = stat(path, &sb);
701 	if (err == 0) {
702 		if (!S_ISCHR(sb.st_mode))
703 			err = ENOTSUP;
704 		else {
705 			fd = open(path, O_RDWR | O_NONBLOCK);
706 			if (fd < 0)
707 				err = errno;
708 			else
709 				consin_fd = consout_fd = fd;
710 		}
711 	}
712 
713 	return (err);
714 }
715 
716 static int
717 disk_open(char *path)
718 {
719 	int fd;
720 
721 	if (ndisks >= NDISKS)
722 		return (ERANGE);
723 
724 	fd = open(path, O_RDWR);
725 	if (fd < 0)
726 		return (errno);
727 
728 	disk_fd[ndisks] = fd;
729 	ndisks++;
730 
731 	return (0);
732 }
733 
734 static void
735 usage(void)
736 {
737 
738 	fprintf(stderr,
739 	    "usage: %s [-S][-c <console-device>] [-d <disk-path>] [-e <name=value>]\n"
740 	    "       %*s [-h <host-path>] [-m memsize[K|k|M|m|G|g|T|t]] <vmname>\n",
741 	    progname,
742 	    (int)strlen(progname), "");
743 	exit(1);
744 }
745 
746 static void
747 hostbase_open(const char *base)
748 {
749 	cap_rights_t rights;
750 
751 	if (hostbase_fd != -1)
752 		close(hostbase_fd);
753 	hostbase_fd = open(base, O_DIRECTORY | O_PATH);
754 	if (hostbase_fd == -1)
755 		err(EX_OSERR, "open");
756 
757 	if (caph_rights_limit(hostbase_fd, cap_rights_init(&rights, CAP_FSTATAT,
758 	    CAP_LOOKUP, CAP_PREAD)) < 0)
759 		err(EX_OSERR, "caph_rights_limit");
760 }
761 
762 static void
763 loader_open(int bootfd)
764 {
765 	int fd;
766 
767 	if (loader == NULL) {
768 		loader = strdup("userboot.so");
769 		if (loader == NULL)
770 			err(EX_OSERR, "malloc");
771 	}
772 
773 	assert(bootfd >= 0 || explicit_loader_fd >= 0);
774 	if (explicit_loader_fd >= 0)
775 		fd = explicit_loader_fd;
776 	else
777 		fd = openat(bootfd, loader, O_RDONLY | O_RESOLVE_BENEATH);
778 	if (fd == -1)
779 		err(EX_OSERR, "openat");
780 
781 	loader_hdl = fdlopen(fd, RTLD_LOCAL);
782 	if (!loader_hdl)
783 		errx(EX_OSERR, "dlopen: %s", dlerror());
784 	if (fd != explicit_loader_fd)
785 		close(fd);
786 }
787 
788 int
789 main(int argc, char** argv)
790 {
791 	void (*func)(struct loader_callbacks *, void *, int, int);
792 	uint64_t mem_size;
793 	int bootfd, opt, error, memflags, need_reinit;
794 
795 	bootfd = -1;
796 	progname = basename(argv[0]);
797 
798 	memflags = 0;
799 	mem_size = 256 * MB;
800 
801 	consin_fd = STDIN_FILENO;
802 	consout_fd = STDOUT_FILENO;
803 
804 	while ((opt = getopt(argc, argv, "CSc:d:e:h:l:m:")) != -1) {
805 		switch (opt) {
806 		case 'c':
807 			error = altcons_open(optarg);
808 			if (error != 0)
809 				errx(EX_USAGE, "Could not open '%s'", optarg);
810 			break;
811 
812 		case 'd':
813 			error = disk_open(optarg);
814 			if (error != 0)
815 				errx(EX_USAGE, "Could not open '%s'", optarg);
816 			break;
817 
818 		case 'e':
819 			addenv(optarg);
820 			break;
821 
822 		case 'h':
823 			hostbase_open(optarg);
824 			break;
825 
826 		case 'l':
827 			if (loader != NULL)
828 				errx(EX_USAGE, "-l can only be given once");
829 			loader = strdup(optarg);
830 			if (loader == NULL)
831 				err(EX_OSERR, "malloc");
832 			explicit_loader_fd = open(loader, O_RDONLY);
833 			if (explicit_loader_fd == -1)
834 				err(EX_OSERR, "%s", loader);
835 			break;
836 
837 		case 'm':
838 			error = vm_parse_memsize(optarg, &mem_size);
839 			if (error != 0)
840 				errx(EX_USAGE, "Invalid memsize '%s'", optarg);
841 			break;
842 		case 'C':
843 			memflags |= VM_MEM_F_INCORE;
844 			break;
845 		case 'S':
846 			memflags |= VM_MEM_F_WIRED;
847 			break;
848 		case '?':
849 			usage();
850 		}
851 	}
852 
853 	argc -= optind;
854 	argv += optind;
855 
856 	if (argc != 1)
857 		usage();
858 
859 	vmname = argv[0];
860 
861 	need_reinit = 0;
862 	error = vm_create(vmname);
863 	if (error) {
864 		if (errno != EEXIST)
865 			err(1, "vm_create");
866 		need_reinit = 1;
867 	}
868 
869 	ctx = vm_open(vmname);
870 	if (ctx == NULL)
871 		err(1, "vm_open");
872 
873 	/*
874 	 * If we weren't given an explicit loader to use, we need to support the
875 	 * guest requesting a different one.
876 	 */
877 	if (explicit_loader_fd == -1) {
878 		cap_rights_t rights;
879 
880 		bootfd = open("/boot", O_DIRECTORY | O_PATH);
881 		if (bootfd == -1)
882 			err(1, "open");
883 
884 		/*
885 		 * bootfd will be used to do a lookup of our loader and do an
886 		 * fdlopen(3) on the loader; thus, we need mmap(2) in addition
887 		 * to the more usual lookup rights.
888 		 */
889 		if (caph_rights_limit(bootfd, cap_rights_init(&rights,
890 		    CAP_FSTATAT, CAP_LOOKUP, CAP_MMAP_RX, CAP_PREAD)) < 0)
891 			err(1, "caph_rights_limit");
892 	}
893 
894 	vcpu = vm_vcpu_open(ctx, BSP);
895 
896 	caph_cache_catpages();
897 	if (caph_enter() < 0)
898 		err(1, "caph_enter");
899 
900 	/*
901 	 * setjmp in the case the guest wants to swap out interpreter,
902 	 * cb_swap_interpreter will swap out loader as appropriate and set
903 	 * need_reinit so that we end up in a clean state once again.
904 	 */
905 	if (setjmp(jb) != 0) {
906 		dlclose(loader_hdl);
907 		loader_hdl = NULL;
908 
909 		need_reinit = 1;
910 	}
911 
912 	if (need_reinit) {
913 		error = vm_reinit(ctx);
914 		if (error)
915 			err(1, "vm_reinit");
916 	}
917 
918 	vm_set_memflags(ctx, memflags);
919 	error = vm_setup_memory(ctx, mem_size, VM_MMAP_ALL);
920 	if (error)
921 		err(1, "vm_setup_memory");
922 
923 	loader_open(bootfd);
924 	func = dlsym(loader_hdl, "loader_main");
925 	if (!func)
926 		errx(1, "dlsym: %s", dlerror());
927 
928 	tcgetattr(consout_fd, &term);
929 	oldterm = term;
930 	cfmakeraw(&term);
931 	term.c_cflag |= CLOCAL;
932 
933 	tcsetattr(consout_fd, TCSAFLUSH, &term);
934 
935 	addenv("smbios.bios.vendor=BHYVE");
936 	addenv("boot_serial=1");
937 
938 	func(&cb, NULL, USERBOOT_VERSION_5, ndisks);
939 
940 	free(loader);
941 	return (0);
942 }
943