1 /*	$NetBSD: x86_machdep.c,v 1.153 2022/12/23 16:05:44 bouyer Exp $	*/
2 
3 /*-
4  * Copyright (c) 2002, 2006, 2007 YAMAMOTO Takashi,
5  * Copyright (c) 2005, 2008, 2009, 2019 The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Julio M. Merino Vidal, and Andrew Doran.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 __KERNEL_RCSID(0, "$NetBSD: x86_machdep.c,v 1.153 2022/12/23 16:05:44 bouyer Exp $");
35 
36 #include "opt_modular.h"
37 #include "opt_physmem.h"
38 #include "opt_splash.h"
39 #include "opt_kaslr.h"
40 #include "opt_svs.h"
41 #include "opt_xen.h"
42 
43 #include <sys/types.h>
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/kcore.h>
47 #include <sys/errno.h>
48 #include <sys/kauth.h>
49 #include <sys/mutex.h>
50 #include <sys/cpu.h>
51 #include <sys/intr.h>
52 #include <sys/atomic.h>
53 #include <sys/module.h>
54 #include <sys/sysctl.h>
55 #include <sys/extent.h>
56 #include <sys/rnd.h>
57 
58 #include <x86/bootspace.h>
59 #include <x86/cpuvar.h>
60 #include <x86/cputypes.h>
61 #include <x86/efi.h>
62 #include <x86/machdep.h>
63 #include <x86/nmi.h>
64 #include <x86/pio.h>
65 
66 #include <dev/splash/splash.h>
67 #include <dev/isa/isareg.h>
68 #include <dev/ic/i8042reg.h>
69 #include <dev/mm.h>
70 
71 #include <machine/bootinfo.h>
72 #include <machine/pmap_private.h>
73 #include <machine/vmparam.h>
74 
75 #include <uvm/uvm_extern.h>
76 
77 #include "tsc.h"
78 
79 #include "acpica.h"
80 #include "ioapic.h"
81 #include "lapic.h"
82 
83 #if NACPICA > 0
84 #include <dev/acpi/acpivar.h>
85 #endif
86 
87 #if NIOAPIC > 0 || NACPICA > 0
88 #include <machine/i82093var.h>
89 #endif
90 
91 #include "opt_md.h"
92 #if defined(MEMORY_DISK_HOOKS) && defined(MEMORY_DISK_DYNAMIC)
93 #include <dev/md.h>
94 #endif
95 
96 void (*x86_cpu_idle)(void);
97 static bool x86_cpu_idle_ipi;
98 static char x86_cpu_idle_text[16];
99 
100 static bool x86_user_ldt_enabled __read_mostly = false;
101 
102 #ifdef XEN
103 
104 #include <xen/xen.h>
105 #include <xen/hypervisor.h>
106 #endif
107 
108 #ifndef XENPV
109 void (*delay_func)(unsigned int) = i8254_delay;
110 void (*x86_initclock_func)(void) = i8254_initclocks;
111 #else /* XENPV */
112 void (*delay_func)(unsigned int) = xen_delay;
113 void (*x86_initclock_func)(void) = xen_initclocks;
114 #endif
115 
116 
117 /* --------------------------------------------------------------------- */
118 
119 /*
120  * Main bootinfo structure.  This is filled in by the bootstrap process
121  * done in locore.S based on the information passed by the boot loader.
122  */
123 struct bootinfo bootinfo;
124 
125 /* --------------------------------------------------------------------- */
126 
127 bool bootmethod_efi;
128 
129 static kauth_listener_t x86_listener;
130 
131 extern paddr_t lowmem_rsvd, avail_start, avail_end;
132 
133 vaddr_t msgbuf_vaddr;
134 
135 struct msgbuf_p_seg msgbuf_p_seg[VM_PHYSSEG_MAX];
136 
137 unsigned int msgbuf_p_cnt = 0;
138 
139 void init_x86_msgbuf(void);
140 
141 /*
142  * Given the type of a bootinfo entry, looks for a matching item inside
143  * the bootinfo structure.  If found, returns a pointer to it (which must
144  * then be cast to the appropriate bootinfo_* type); otherwise, returns
145  * NULL.
146  */
147 void *
148 lookup_bootinfo(int type)
149 {
150 	bool found;
151 	int i;
152 	struct btinfo_common *bic;
153 
154 	bic = (struct btinfo_common *)(bootinfo.bi_data);
155 	found = FALSE;
156 	for (i = 0; i < bootinfo.bi_nentries && !found; i++) {
157 		if (bic->type == type)
158 			found = TRUE;
159 		else
160 			bic = (struct btinfo_common *)
161 			    ((uint8_t *)bic + bic->len);
162 	}
163 
164 	return found ? bic : NULL;
165 }
166 
167 #ifdef notyet
168 /*
169  * List the available bootinfo entries.
170  */
171 static const char *btinfo_str[] = {
172 	BTINFO_STR
173 };
174 
175 void
176 aprint_bootinfo(void)
177 {
178 	int i;
179 	struct btinfo_common *bic;
180 
181 	aprint_normal("bootinfo:");
182 	bic = (struct btinfo_common *)(bootinfo.bi_data);
183 	for (i = 0; i < bootinfo.bi_nentries; i++) {
184 		if (bic->type >= 0 && bic->type < __arraycount(btinfo_str))
185 			aprint_normal(" %s", btinfo_str[bic->type]);
186 		else
187 			aprint_normal(" %d", bic->type);
188 		bic = (struct btinfo_common *)
189 		    ((uint8_t *)bic + bic->len);
190 	}
191 	aprint_normal("\n");
192 }
193 #endif
194 
195 /*
196  * mm_md_physacc: check if given pa is accessible.
197  */
198 int
199 mm_md_physacc(paddr_t pa, vm_prot_t prot)
200 {
201 	extern phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
202 	extern int mem_cluster_cnt;
203 	int i;
204 
205 	for (i = 0; i < mem_cluster_cnt; i++) {
206 		const phys_ram_seg_t *seg = &mem_clusters[i];
207 		paddr_t lstart = seg->start;
208 
209 		if (lstart <= pa && pa - lstart <= seg->size) {
210 			return 0;
211 		}
212 	}
213 	return kauth_authorize_machdep(kauth_cred_get(),
214 	    KAUTH_MACHDEP_UNMANAGEDMEM, NULL, NULL, NULL, NULL);
215 }
216 
217 #ifdef MODULAR
218 /*
219  * Push any modules loaded by the boot loader.
220  */
221 void
222 module_init_md(void)
223 {
224 	struct btinfo_modulelist *biml;
225 	struct bi_modulelist_entry *bi, *bimax;
226 
227 	biml = lookup_bootinfo(BTINFO_MODULELIST);
228 	if (biml == NULL) {
229 		aprint_debug("No module info at boot\n");
230 		return;
231 	}
232 
233 	bi = (struct bi_modulelist_entry *)((uint8_t *)biml + sizeof(*biml));
234 	bimax = bi + biml->num;
235 	for (; bi < bimax; bi++) {
236 		switch (bi->type) {
237 		case BI_MODULE_ELF:
238 			aprint_debug("Prep module path=%s len=%d pa=%x\n",
239 			    bi->path, bi->len, bi->base);
240 			KASSERT(trunc_page(bi->base) == bi->base);
241 			module_prime(bi->path,
242 #ifdef KASLR
243 			    (void *)PMAP_DIRECT_MAP((uintptr_t)bi->base),
244 #else
245 			    (void *)((uintptr_t)bi->base + KERNBASE),
246 #endif
247 			    bi->len);
248 			break;
249 		case BI_MODULE_IMAGE:
250 #ifdef SPLASHSCREEN
251 			aprint_debug("Splash image path=%s len=%d pa=%x\n",
252 			    bi->path, bi->len, bi->base);
253 			KASSERT(trunc_page(bi->base) == bi->base);
254 			splash_setimage(
255 #ifdef KASLR
256 			    (void *)PMAP_DIRECT_MAP((uintptr_t)bi->base),
257 #else
258 			    (void *)((uintptr_t)bi->base + KERNBASE),
259 #endif
260 			    bi->len);
261 #endif
262 			break;
263 		case BI_MODULE_RND:
264 			/* handled in x86_rndseed */
265 			break;
266 		case BI_MODULE_FS:
267 			aprint_debug("File-system image path=%s len=%d pa=%x\n",
268 			    bi->path, bi->len, bi->base);
269 			KASSERT(trunc_page(bi->base) == bi->base);
270 #if defined(MEMORY_DISK_HOOKS) && defined(MEMORY_DISK_DYNAMIC)
271 			md_root_setconf(
272 #ifdef KASLR
273 			    (void *)PMAP_DIRECT_MAP((uintptr_t)bi->base),
274 #else
275 			    (void *)((uintptr_t)bi->base + KERNBASE),
276 #endif
277 			    bi->len);
278 #endif
279 			break;
280 		default:
281 			aprint_debug("Skipping non-ELF module\n");
282 			break;
283 		}
284 	}
285 }
286 #endif	/* MODULAR */
287 
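/*
 * x86_rndseed: scan the boot loader's module list for BI_MODULE_RND
 * entries and feed each one into the kernel entropy pool via rnd_seed().
 */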
288 void
289 x86_rndseed(void)
290 {
291 	struct btinfo_modulelist *biml;
292 	struct bi_modulelist_entry *bi, *bimax;
293 
294 	biml = lookup_bootinfo(BTINFO_MODULELIST);
295 	if (biml == NULL) {
296 		aprint_debug("No module info at boot\n");
297 		return;
298 	}
299 
300 	bi = (struct bi_modulelist_entry *)((uint8_t *)biml + sizeof(*biml));
301 	bimax = bi + biml->num;
302 	for (; bi < bimax; bi++) {
303 		switch (bi->type) {
304 		case BI_MODULE_RND:
305 			aprint_debug("Random seed data path=%s len=%d pa=%x\n",
306 				     bi->path, bi->len, bi->base);
307 			KASSERT(trunc_page(bi->base) == bi->base);
308 			rnd_seed(
309 #ifdef KASLR
310 			    (void *)PMAP_DIRECT_MAP((uintptr_t)bi->base),
311 #else
312 			    (void *)((uintptr_t)bi->base + KERNBASE),
313 #endif
314 			     bi->len);
315 		}
316 	}
317 }
318 
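/*
 * cpu_need_resched: MD part of a scheduler resched request.  Depending
 * on the RESCHED_* flags, either kick the remote CPU with an IPI or
 * post an AST/soft interrupt on the local one.
 */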
319 void
320 cpu_need_resched(struct cpu_info *ci, struct lwp *l, int flags)
321 {
322 
323 	KASSERT(kpreempt_disabled());
324 
325 	if ((flags & RESCHED_IDLE) != 0) {
326 		if ((flags & RESCHED_REMOTE) != 0 &&
327 		    x86_cpu_idle_ipi != false) {
328 			cpu_kick(ci);
329 		}
330 		return;
331 	}
332 
333 #ifdef __HAVE_PREEMPTION
334 	if ((flags & RESCHED_KPREEMPT) != 0) {
335 		if ((flags & RESCHED_REMOTE) != 0) {
336 #ifdef XENPV
337 			xen_send_ipi(ci, XEN_IPI_KPREEMPT);
338 #else
339 			x86_send_ipi(ci, X86_IPI_KPREEMPT);
340 #endif
341 		} else {
342 			softint_trigger(1 << SIR_PREEMPT);
343 		}
344 		return;
345 	}
346 #endif
347 
348 	KASSERT((flags & RESCHED_UPREEMPT) != 0);
349 	if ((flags & RESCHED_REMOTE) != 0) {
350 		cpu_kick(ci);
351 	} else {
352 		aston(l);
353 	}
354 }
355 
356 void
357 cpu_signotify(struct lwp *l)
358 {
359 
360 	KASSERT(kpreempt_disabled());
361 
362 	if (l->l_cpu != curcpu()) {
363 		cpu_kick(l->l_cpu);
364 	} else {
365 		aston(l);
366 	}
367 }
368 
369 void
370 cpu_need_proftick(struct lwp *l)
371 {
372 
373 	KASSERT(kpreempt_disabled());
374 	KASSERT(l->l_cpu == curcpu());
375 
376 	l->l_pflag |= LP_OWEUPC;
377 	aston(l);
378 }
379 
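/*
 * cpu_intr_p: return true if the calling LWP is running in hard
 * interrupt context.  The ci_idepth sample is retried if the LWP
 * switched CPUs while we were looking.
 */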
380 bool
381 cpu_intr_p(void)
382 {
383 	uint64_t ncsw;
384 	int idepth;
385 	lwp_t *l;
386 
387 	l = curlwp;
388 	if (__predict_false(l->l_cpu == NULL)) {
389 		KASSERT(l == &lwp0);
390 		return false;
391 	}
392 	do {
393 		ncsw = l->l_ncsw;
394 		__insn_barrier();
395 		idepth = l->l_cpu->ci_idepth;
396 		__insn_barrier();
397 	} while (__predict_false(ncsw != l->l_ncsw));
398 
399 	return idepth >= 0;
400 }
401 
402 #ifdef __HAVE_PREEMPTION
403 /*
404  * Called to check MD conditions that would prevent preemption, and to
405  * arrange for those conditions to be rechecked later.
406  */
407 bool
408 cpu_kpreempt_enter(uintptr_t where, int s)
409 {
410 	struct pcb *pcb;
411 	lwp_t *l;
412 
413 	KASSERT(kpreempt_disabled());
414 	l = curlwp;
415 
416 	/*
417 	 * If SPL raised, can't go.  Note this implies that spin
418 	 * mutexes at IPL_NONE are _not_ valid to use.
419 	 */
420 	if (s > IPL_PREEMPT) {
421 		softint_trigger(1 << SIR_PREEMPT);
422 		return false;
423 	}
424 
425 	/* Must save cr2 or it could be clobbered. */
426 	pcb = lwp_getpcb(l);
427 	pcb->pcb_cr2 = rcr2();
428 
429 	return true;
430 }
431 
432 /*
433  * Called after returning from a kernel preemption, and called with
434  * preemption disabled.
435  */
436 void
437 cpu_kpreempt_exit(uintptr_t where)
438 {
439 	extern char x86_copyfunc_start, x86_copyfunc_end;
440 	struct pcb *pcb;
441 
442 	KASSERT(kpreempt_disabled());
443 
444 	/*
445 	 * If we interrupted any of the copy functions we must reload
446 	 * the pmap when resuming, as they cannot tolerate it being
447 	 * swapped out.
448 	 */
449 	if (where >= (uintptr_t)&x86_copyfunc_start &&
450 	    where < (uintptr_t)&x86_copyfunc_end) {
451 		pmap_load();
452 	}
453 
454 	/* Restore cr2 only after the pmap, as pmap_load can block. */
455 	pcb = lwp_getpcb(curlwp);
456 	lcr2(pcb->pcb_cr2);
457 }
458 
459 /*
460  * Return true if preemption is disabled for MD reasons.  Must be called
461  * with preemption disabled, and thus is only for diagnostic checks.
462  */
463 bool
464 cpu_kpreempt_disabled(void)
465 {
466 
467 	return curcpu()->ci_ilevel > IPL_NONE;
468 }
469 #endif	/* __HAVE_PREEMPTION */
470 
471 SYSCTL_SETUP(sysctl_machdep_cpu_idle, "sysctl machdep cpu_idle")
472 {
473 	const struct sysctlnode	*mnode, *node;
474 
475 	sysctl_createv(NULL, 0, NULL, &mnode,
476 	    CTLFLAG_PERMANENT, CTLTYPE_NODE, "machdep", NULL,
477 	    NULL, 0, NULL, 0, CTL_MACHDEP, CTL_EOL);
478 
479 	sysctl_createv(NULL, 0, &mnode, &node,
480 		       CTLFLAG_PERMANENT, CTLTYPE_STRING, "idle-mechanism",
481 		       SYSCTL_DESCR("Mechanism used for the idle loop."),
482 		       NULL, 0, x86_cpu_idle_text, 0,
483 		       CTL_CREATE, CTL_EOL);
484 }
485 
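/*
 * x86_cpu_idle_init: pick the idle mechanism for this machine: MWAIT if
 * the CPU advertises MONITOR/MWAIT, plain HLT otherwise, and the Xen
 * idle hypercall on XENPV.
 */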
486 void
487 x86_cpu_idle_init(void)
488 {
489 
490 #ifndef XENPV
491 	if ((cpu_feature[1] & CPUID2_MONITOR) == 0)
492 		x86_cpu_idle_set(x86_cpu_idle_halt, "halt", true);
493 	else
494 		x86_cpu_idle_set(x86_cpu_idle_mwait, "mwait", false);
495 #else
496 	x86_cpu_idle_set(x86_cpu_idle_xen, "xen", true);
497 #endif
498 }
499 
500 void
501 x86_cpu_idle_get(void (**func)(void), char *text, size_t len)
502 {
503 
504 	*func = x86_cpu_idle;
505 
506 	(void)strlcpy(text, x86_cpu_idle_text, len);
507 }
508 
509 void
510 x86_cpu_idle_set(void (*func)(void), const char *text, bool ipi)
511 {
512 
513 	x86_cpu_idle = func;
514 	x86_cpu_idle_ipi = ipi;
515 
516 	(void)strlcpy(x86_cpu_idle_text, text, sizeof(x86_cpu_idle_text));
517 }
518 
519 #ifndef XENPV
520 
521 #define KBTOB(x)	((size_t)(x) * 1024UL)
522 #define MBTOB(x)	((size_t)(x) * 1024UL * 1024UL)
523 
524 static struct {
525 	int freelist;
526 	uint64_t limit;
527 } x86_freelists[VM_NFREELIST] = {
528 	{ VM_FREELIST_DEFAULT, 0 },
529 #ifdef VM_FREELIST_FIRST1T
530 	/* 40-bit addresses needed for modern graphics. */
531 	{ VM_FREELIST_FIRST1T,	1ULL * 1024 * 1024 * 1024 * 1024 },
532 #endif
533 #ifdef VM_FREELIST_FIRST64G
534 	/* 36-bit addresses needed for oldish graphics. */
535 	{ VM_FREELIST_FIRST64G, 64ULL * 1024 * 1024 * 1024 },
536 #endif
537 #ifdef VM_FREELIST_FIRST4G
538 	/* 32-bit addresses needed for PCI 32-bit DMA and old graphics. */
539 	{ VM_FREELIST_FIRST4G,  4ULL * 1024 * 1024 * 1024 },
540 #endif
541 	/* 30-bit addresses needed for ancient graphics. */
542 	{ VM_FREELIST_FIRST1G,	1ULL * 1024 * 1024 * 1024 },
543 	/* 24-bit addresses needed for ISA DMA. */
544 	{ VM_FREELIST_FIRST16,	16 * 1024 * 1024 },
545 };
546 
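/*
 * x86_select_freelist: pick the page freelist to use for an allocation
 * that must lie below maxaddr.  Returns VM_NFREELIST when all of
 * physical memory is already below maxaddr.
 */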
547 int
548 x86_select_freelist(uint64_t maxaddr)
549 {
550 	unsigned int i;
551 
552 	if (avail_end <= maxaddr)
553 		return VM_NFREELIST;
554 
555 	for (i = 0; i < __arraycount(x86_freelists); i++) {
556 		if ((x86_freelists[i].limit - 1) <= maxaddr)
557 			return x86_freelists[i].freelist;
558 	}
559 
560 	panic("no freelist for maximum address %"PRIx64, maxaddr);
561 }
562 
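/*
 * x86_add_cluster: record one physical memory segment in mem_clusters[],
 * reserving its range in the iomem extent and updating physmem and
 * avail_end.  Returns -1 when mem_clusters[] is full, 0 otherwise.
 */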
563 static int
564 x86_add_cluster(uint64_t seg_start, uint64_t seg_end, uint32_t type)
565 {
566 	extern struct extent *iomem_ex;
567 	const uint64_t endext = MAXIOMEM + 1;
568 	uint64_t new_physmem = 0;
569 	phys_ram_seg_t *cluster;
570 	int i;
571 
572 	if (seg_end > MAXPHYSMEM) {
573 		aprint_verbose("WARNING: skipping large memory map entry: "
574 		    "0x%"PRIx64"/0x%"PRIx64"/0x%x\n",
575 		    seg_start, (seg_end - seg_start), type);
576 		return 0;
577 	}
578 
579 	/*
580 	 * XXX: Chop the last page off the size so that it can fit in avail_end.
581 	 */
582 	if (seg_end == MAXPHYSMEM)
583 		seg_end -= PAGE_SIZE;
584 
585 	if (seg_end <= seg_start)
586 		return 0;
587 
588 	for (i = 0; i < mem_cluster_cnt; i++) {
589 		cluster = &mem_clusters[i];
590 		if ((cluster->start == round_page(seg_start)) &&
591 		    (cluster->size == trunc_page(seg_end) - cluster->start)) {
592 #ifdef DEBUG_MEMLOAD
593 			printf("WARNING: skipping duplicate segment entry\n");
594 #endif
595 			return 0;
596 		}
597 	}
598 
599 	/*
600 	 * This cluster is used by RAM. If it is included in the iomem extent,
601 	 * allocate it from there, so that we won't unintentionally reuse it
602 	 * later with extent_alloc_region; this avoids collisions (with UVM,
603 	 * for example).
604 	 *
605 	 * This is done before the addresses are page rounded just to make
606 	 * sure we get them all.
607 	 */
608 	if (seg_start < endext) {
609 		uint64_t io_end;
610 
611 		if (seg_end > endext)
612 			io_end = endext;
613 		else
614 			io_end = seg_end;
615 
616 		if (iomem_ex != NULL && extent_alloc_region(iomem_ex, seg_start,
617 		    io_end - seg_start, EX_NOWAIT)) {
618 			/* XXX What should we do? */
619 			printf("WARNING: CAN'T ALLOCATE MEMORY SEGMENT "
620 			    "(0x%"PRIx64"/0x%"PRIx64"/0x%x) FROM "
621 			    "IOMEM EXTENT MAP!\n",
622 			    seg_start, seg_end - seg_start, type);
623 			return 0;
624 		}
625 	}
626 
627 	/* If it's not free memory, skip it. */
628 	if (type != BIM_Memory)
629 		return 0;
630 
631 	if (mem_cluster_cnt >= VM_PHYSSEG_MAX) {
632 		printf("WARNING: too many memory segments "
633 		    "(increase VM_PHYSSEG_MAX)");
634 		return -1;
635 	}
636 
637 #ifdef PHYSMEM_MAX_ADDR
638 	if (seg_start >= MBTOB(PHYSMEM_MAX_ADDR))
639 		return 0;
640 	if (seg_end > MBTOB(PHYSMEM_MAX_ADDR))
641 		seg_end = MBTOB(PHYSMEM_MAX_ADDR);
642 #endif
643 
644 	seg_start = round_page(seg_start);
645 	seg_end = trunc_page(seg_end);
646 
647 	if (seg_start == seg_end)
648 		return 0;
649 
650 	cluster = &mem_clusters[mem_cluster_cnt];
651 	cluster->start = seg_start;
652 	if (iomem_ex != NULL)
653 		new_physmem = physmem + atop(seg_end - seg_start);
654 
655 #ifdef PHYSMEM_MAX_SIZE
656 	if (iomem_ex != NULL) {
657 		if (physmem >= atop(MBTOB(PHYSMEM_MAX_SIZE)))
658 			return 0;
659 		if (new_physmem > atop(MBTOB(PHYSMEM_MAX_SIZE))) {
660 			seg_end = seg_start + MBTOB(PHYSMEM_MAX_SIZE) - ptoa(physmem);
661 			new_physmem = atop(MBTOB(PHYSMEM_MAX_SIZE));
662 		}
663 	}
664 #endif
665 
666 	cluster->size = seg_end - seg_start;
667 
668 	if (iomem_ex != NULL) {
669 		if (avail_end < seg_end)
670 			avail_end = seg_end;
671 		physmem = new_physmem;
672 	}
673 	mem_cluster_cnt++;
674 
675 	return 0;
676 }
677 
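/*
 * x86_parse_clusters: walk the BIOS/UEFI memory map passed in via
 * bootinfo and register each usable segment with x86_add_cluster,
 * splitting segments that overlap the ISA I/O memory hole.
 */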
678 static int
679 x86_parse_clusters(struct btinfo_memmap *bim)
680 {
681 	uint64_t seg_start, seg_end;
682 	uint64_t addr, size;
683 	uint32_t type;
684 	int x;
685 
686 	KASSERT(bim != NULL);
687 	KASSERT(bim->num > 0);
688 
689 #ifdef DEBUG_MEMLOAD
690 	printf("MEMMAP: %s MEMORY MAP (%d ENTRIES):\n",
691 	    lookup_bootinfo(BTINFO_EFIMEMMAP) != NULL ? "UEFI" : "BIOS",
692 	    bim->num);
693 #endif
694 
695 	for (x = 0; x < bim->num; x++) {
696 		addr = bim->entry[x].addr;
697 		size = bim->entry[x].size;
698 		type = bim->entry[x].type;
699 #ifdef DEBUG_MEMLOAD
700 		printf("MEMMAP: 0x%016" PRIx64 "-0x%016" PRIx64
701 		    "\n\tsize=0x%016" PRIx64 ", type=%d(%s)\n",
702 		    addr, addr + size - 1, size, type,
703 		    (type == BIM_Memory) ?  "Memory" :
704 		    (type == BIM_Reserved) ?  "Reserved" :
705 		    (type == BIM_ACPI) ? "ACPI" :
706 		    (type == BIM_NVS) ? "NVS" :
707 		    (type == BIM_PMEM) ? "Persistent" :
708 		    (type == BIM_PRAM) ? "Persistent (Legacy)" :
709 		    "unknown");
710 #endif
711 
712 		/* If the segment is not memory, skip it. */
713 		switch (type) {
714 		case BIM_Memory:
715 		case BIM_ACPI:
716 		case BIM_NVS:
717 			break;
718 		default:
719 			continue;
720 		}
721 
722 		/* If the segment is smaller than a page, skip it. */
723 		if (size < PAGE_SIZE)
724 			continue;
725 
726 		seg_start = addr;
727 		seg_end = addr + size;
728 
729 		/*
730 		 * XXX XXX: Avoid the ISA I/O MEM.
731 		 *
732 		 * Some laptops (for example, Toshiba Satellite2550X) report
733 		 * this area as valid.
734 		 */
735 		if (seg_start < IOM_END && seg_end > IOM_BEGIN) {
736 			printf("WARNING: memory map entry overlaps "
737 			    "with ``Compatibility Holes'': "
738 			    "0x%"PRIx64"/0x%"PRIx64"/0x%x\n", seg_start,
739 			    seg_end - seg_start, type);
740 
741 			if (x86_add_cluster(seg_start, IOM_BEGIN, type) == -1)
742 				break;
743 			if (x86_add_cluster(IOM_END, seg_end, type) == -1)
744 				break;
745 		} else {
746 			if (x86_add_cluster(seg_start, seg_end, type) == -1)
747 				break;
748 		}
749 	}
750 
751 	return 0;
752 }
753 
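/*
 * x86_fake_clusters: no usable memory map was found, so synthesize two
 * clusters (base and extended memory) from the BIOS memory counts.
 */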
754 static int
755 x86_fake_clusters(void)
756 {
757 	extern struct extent *iomem_ex;
758 	phys_ram_seg_t *cluster;
759 	KASSERT(mem_cluster_cnt == 0);
760 
761 	/*
762 	 * Allocate the physical addresses used by RAM from the iomem extent
763 	 * map. This is done before the addresses are page rounded just to make
764 	 * sure we get them all.
765 	 */
766 	if (extent_alloc_region(iomem_ex, 0, KBTOB(biosbasemem), EX_NOWAIT)) {
767 		/* XXX What should we do? */
768 		printf("WARNING: CAN'T ALLOCATE BASE MEMORY FROM "
769 		    "IOMEM EXTENT MAP!\n");
770 	}
771 
772 	cluster = &mem_clusters[0];
773 	cluster->start = 0;
774 	cluster->size = trunc_page(KBTOB(biosbasemem));
775 	physmem += atop(cluster->size);
776 
777 	if (extent_alloc_region(iomem_ex, IOM_END, KBTOB(biosextmem),
778 	    EX_NOWAIT)) {
779 		/* XXX What should we do? */
780 		printf("WARNING: CAN'T ALLOCATE EXTENDED MEMORY FROM "
781 		    "IOMEM EXTENT MAP!\n");
782 	}
783 
784 #if NISADMA > 0
785 	/*
786 	 * Some motherboards/BIOSes remap the 384K of RAM that would
787 	 * normally be covered by the ISA hole to the end of memory
788 	 * so that it can be used.  However, on a 16M system, this
789 	 * would cause bounce buffers to be allocated and used.
790 	 * This is not desirable behaviour, as more than 384K of
791 	 * bounce buffers might be allocated.  As a work-around,
792 	 * we round memory down to the nearest 1M boundary if
793 	 * we're using any isadma devices and the remapped memory
794 	 * is what puts us over 16M.
795 	 */
796 	if (biosextmem > (15*1024) && biosextmem < (16*1024)) {
797 		char pbuf[9];
798 
799 		format_bytes(pbuf, sizeof(pbuf), biosextmem - (15*1024));
800 		printf("Warning: ignoring %s of remapped memory\n", pbuf);
801 		biosextmem = (15*1024);
802 	}
803 #endif
804 
805 	cluster = &mem_clusters[1];
806 	cluster->start = IOM_END;
807 	cluster->size = trunc_page(KBTOB(biosextmem));
808 	physmem += atop(cluster->size);
809 
810 	mem_cluster_cnt = 2;
811 
812 	avail_end = IOM_END + trunc_page(KBTOB(biosextmem));
813 
814 	return 0;
815 }
816 
817 /*
818  * x86_load_region: load the physical memory region from seg_start to seg_end
819  * into the VM system.
820  */
821 static void
822 x86_load_region(uint64_t seg_start, uint64_t seg_end)
823 {
824 	unsigned int i;
825 	uint64_t tmp;
826 
827 	i = __arraycount(x86_freelists);
828 	while (i--) {
829 		if (x86_freelists[i].limit <= seg_start)
830 			continue;
831 		if (x86_freelists[i].freelist == VM_FREELIST_DEFAULT)
832 			continue;
833 		tmp = MIN(x86_freelists[i].limit, seg_end);
834 		if (tmp == seg_start)
835 			continue;
836 
837 #ifdef DEBUG_MEMLOAD
838 		printf("loading freelist %d 0x%"PRIx64"-0x%"PRIx64
839 		    " (0x%"PRIx64"-0x%"PRIx64")\n", x86_freelists[i].freelist,
840 		    seg_start, tmp, (uint64_t)atop(seg_start),
841 		    (uint64_t)atop(tmp));
842 #endif
843 
844 		uvm_page_physload(atop(seg_start), atop(tmp), atop(seg_start),
845 		    atop(tmp), x86_freelists[i].freelist);
846 		seg_start = tmp;
847 	}
848 
849 	if (seg_start != seg_end) {
850 #ifdef DEBUG_MEMLOAD
851 		printf("loading default 0x%"PRIx64"-0x%"PRIx64
852 		    " (0x%"PRIx64"-0x%"PRIx64")\n", seg_start, seg_end,
853 		    (uint64_t)atop(seg_start), (uint64_t)atop(seg_end));
854 #endif
855 		uvm_page_physload(atop(seg_start), atop(seg_end),
856 		    atop(seg_start), atop(seg_end), VM_FREELIST_DEFAULT);
857 	}
858 }
859 
860 #ifdef XEN
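/*
 * x86_add_xen_clusters: on a Xen PVH guest, take the memory map from
 * hvm_start_info if present, otherwise from a XENMEM_memory_map
 * hypercall, and feed it through the regular cluster code.
 */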
861 static void
862 x86_add_xen_clusters(void)
863 {
864 	if (hvm_start_info->memmap_entries > 0) {
865 		struct hvm_memmap_table_entry *map_entry;
866 		map_entry = (void *)((uintptr_t)hvm_start_info->memmap_paddr + KERNBASE);
867 		for (int i = 0; i < hvm_start_info->memmap_entries; i++) {
868 			if (map_entry[i].size < PAGE_SIZE)
869 				continue;
870 			switch (map_entry[i].type) {
871 			case XEN_HVM_MEMMAP_TYPE_RAM:
872 				x86_add_cluster(map_entry[i].addr,
873 				    map_entry[i].addr + map_entry[i].size,
874 				    BIM_Memory);
875 				break;
876 			case XEN_HVM_MEMMAP_TYPE_ACPI:
877 				x86_add_cluster(map_entry[i].addr,
878 				    map_entry[i].addr + map_entry[i].size,
879 				    BIM_ACPI);
880 				break;
881 			}
882 		}
883 	} else {
884 		struct xen_memory_map memmap;
885 		static struct _xen_mmap {
886 			struct btinfo_memmap bim;
887 			struct bi_memmap_entry map[128]; /* same as FreeBSD */
888 		} __packed xen_mmap;
889 		int err;
890 
891 		memmap.nr_entries = 128;
892 		set_xen_guest_handle(memmap.buffer, &xen_mmap.bim.entry[0]);
893 		if ((err = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap))
894 		    < 0)
895 			panic("XENMEM_memory_map %d", err);
896 		xen_mmap.bim.num = memmap.nr_entries;
897 		x86_parse_clusters(&xen_mmap.bim);
898 	}
899 }
900 #endif /* XEN */
901 /*
902  * init_x86_clusters: retrieve the memory clusters provided by the BIOS, and
903  * initialize mem_clusters.
904  */
905 void
906 init_x86_clusters(void)
907 {
908 	struct btinfo_memmap *bim;
909 	struct btinfo_efimemmap *biem;
910 
911 	/*
912 	 * Check to see if we have a memory map from the BIOS (passed to us by
913 	 * the boot program).
914 	 */
915 #ifdef XEN
916 	if (vm_guest == VM_GUEST_XENPVH) {
917 		x86_add_xen_clusters();
918 	}
919 #endif /* XEN */
920 
921 #ifdef i386
922 	extern int biosmem_implicit;
923 	biem = lookup_bootinfo(BTINFO_EFIMEMMAP);
924 	if (biem != NULL)
925 		bim = efi_get_e820memmap();
926 	else
927 		bim = lookup_bootinfo(BTINFO_MEMMAP);
928 	if ((biosmem_implicit || (biosbasemem == 0 && biosextmem == 0)) &&
929 	    bim != NULL && bim->num > 0)
930 		x86_parse_clusters(bim);
931 #else
932 #if !defined(REALBASEMEM) && !defined(REALEXTMEM)
933 	biem = lookup_bootinfo(BTINFO_EFIMEMMAP);
934 	if (biem != NULL)
935 		bim = efi_get_e820memmap();
936 	else
937 		bim = lookup_bootinfo(BTINFO_MEMMAP);
938 	if (bim != NULL && bim->num > 0)
939 		x86_parse_clusters(bim);
940 #else
941 	(void)bim, (void)biem;
942 #endif
943 #endif
944 
945 	if (mem_cluster_cnt == 0) {
946 		/*
947 		 * If x86_parse_clusters didn't find any valid segment, create
948 		 * fake clusters.
949 		 */
950 		x86_fake_clusters();
951 	}
952 }
953 
954 /*
955  * init_x86_vm: initialize the VM system on x86. We basically internalize as
956  * many physical pages as we can, starting at lowmem_rsvd, but we don't
957  * internalize the kernel physical pages (from pa_kstart to pa_kend).
958  */
959 int
960 init_x86_vm(paddr_t pa_kend)
961 {
962 	extern struct bootspace bootspace;
963 	paddr_t pa_kstart = bootspace.head.pa;
964 	uint64_t seg_start, seg_end;
965 	uint64_t seg_start1, seg_end1;
966 	int x;
967 	unsigned i;
968 
969 	for (i = 0; i < __arraycount(x86_freelists); i++) {
970 		if (avail_end < x86_freelists[i].limit)
971 			x86_freelists[i].freelist = VM_FREELIST_DEFAULT;
972 	}
973 
974 	/*
975 	 * Now, load the memory clusters (which have already been rounded and
976 	 * truncated) into the VM system.
977 	 *
978 	 * NOTE: we assume that memory starts at 0.
979 	 */
980 	for (x = 0; x < mem_cluster_cnt; x++) {
981 		const phys_ram_seg_t *cluster = &mem_clusters[x];
982 
983 		seg_start = cluster->start;
984 		seg_end = cluster->start + cluster->size;
985 		seg_start1 = 0;
986 		seg_end1 = 0;
987 
988 #ifdef DEBUG_MEMLOAD
989 		printf("segment %" PRIx64 " - %" PRIx64 "\n",
990 		    seg_start, seg_end);
991 #endif
992 
993 		/* Skip memory before our available starting point. */
994 		if (seg_end <= lowmem_rsvd) {
995 #ifdef DEBUG_MEMLOAD
996 			printf("discard segment below starting point "
997 			    "%" PRIx64 " - %" PRIx64 "\n", seg_start, seg_end);
998 #endif
999 			continue;
1000 		}
1001 
1002 		if (seg_start <= lowmem_rsvd && lowmem_rsvd < seg_end) {
1003 			seg_start = lowmem_rsvd;
1004 			if (seg_start == seg_end) {
1005 #ifdef DEBUG_MEMLOAD
1006 				printf("discard segment below starting point "
1007 				    "%" PRIx64 " - %" PRIx64 "\n",
1008 				    seg_start, seg_end);
1009 
1010 
1011 #endif
1012 				continue;
1013 			}
1014 		}
1015 
1016 		/*
1017 		 * If this segment contains the kernel, split it in two, around
1018 		 * the kernel.
1019 		 *  [seg_start                       seg_end]
1020 		 *             [pa_kstart  pa_kend]
1021 		 */
1022 		if (seg_start <= pa_kstart && pa_kend <= seg_end) {
1023 #ifdef DEBUG_MEMLOAD
1024 			printf("split kernel overlapping to "
1025 			    "%" PRIx64 " - %" PRIxPADDR " and "
1026 			    "%" PRIxPADDR " - %" PRIx64 "\n",
1027 			    seg_start, pa_kstart, pa_kend, seg_end);
1028 #endif
1029 			seg_start1 = pa_kend;
1030 			seg_end1 = seg_end;
1031 			seg_end = pa_kstart;
1032 			KASSERT(seg_end < seg_end1);
1033 		}
1034 
1035 		/*
1036 		 * Discard a segment inside the kernel
1037 		 *  [pa_kstart                       pa_kend]
1038 		 *             [seg_start  seg_end]
1039 		 */
1040 		if (pa_kstart < seg_start && seg_end < pa_kend) {
1041 #ifdef DEBUG_MEMLOAD
1042 			printf("discard complete kernel overlap "
1043 			    "%" PRIx64 " - %" PRIx64 "\n", seg_start, seg_end);
1044 #endif
1045 			continue;
1046 		}
1047 
1048 		/*
1049 		 * Discard leading hunk that overlaps the kernel
1050 		 *  [pa_kstart             pa_kend]
1051 		 *            [seg_start            seg_end]
1052 		 */
1053 		if (pa_kstart < seg_start &&
1054 		    seg_start < pa_kend &&
1055 		    pa_kend < seg_end) {
1056 #ifdef DEBUG_MEMLOAD
1057 			printf("discard leading kernel overlap "
1058 			    "%" PRIx64 " - %" PRIxPADDR "\n",
1059 			    seg_start, pa_kend);
1060 #endif
1061 			seg_start = pa_kend;
1062 		}
1063 
1064 		/*
1065 		 * Discard trailing hunk that overlaps the kernel
1066 		 *             [pa_kstart            pa_kend]
1067 		 *  [seg_start              seg_end]
1068 		 */
1069 		if (seg_start < pa_kstart &&
1070 		    pa_kstart < seg_end &&
1071 		    seg_end < pa_kend) {
1072 #ifdef DEBUG_MEMLOAD
1073 			printf("discard trailing kernel overlap "
1074 			    "%" PRIxPADDR " - %" PRIx64 "\n",
1075 			    pa_kstart, seg_end);
1076 #endif
1077 			seg_end = pa_kstart;
1078 		}
1079 
1080 		/* First hunk */
1081 		if (seg_start != seg_end) {
1082 			x86_load_region(seg_start, seg_end);
1083 		}
1084 
1085 		/* Second hunk */
1086 		if (seg_start1 != seg_end1) {
1087 			x86_load_region(seg_start1, seg_end1);
1088 		}
1089 	}
1090 
1091 	return 0;
1092 }
1093 
1094 #endif /* !XENPV */
1095 
1096 void
1097 init_x86_msgbuf(void)
1098 {
1099 	/* Message buffer is located at end of core. */
1100 	psize_t sz = round_page(MSGBUFSIZE);
1101 	psize_t reqsz = sz;
1102 	uvm_physseg_t x;
1103 
1104 search_again:
1105 	for (x = uvm_physseg_get_first();
1106 	     uvm_physseg_valid_p(x);
1107 	     x = uvm_physseg_get_next(x)) {
1108 
1109 		if (ctob(uvm_physseg_get_avail_end(x)) == avail_end)
1110 			break;
1111 	}
1112 
1113 	if (uvm_physseg_valid_p(x) == false)
1114 		panic("init_x86_msgbuf: can't find end of memory");
1115 
1116 	/* Shrink so it'll fit in the last segment. */
1117 	if (uvm_physseg_get_avail_end(x) - uvm_physseg_get_avail_start(x) < atop(sz))
1118 		sz = ctob(uvm_physseg_get_avail_end(x) - uvm_physseg_get_avail_start(x));
1119 
1120 	msgbuf_p_seg[msgbuf_p_cnt].sz = sz;
1121 	msgbuf_p_seg[msgbuf_p_cnt++].paddr = ctob(uvm_physseg_get_avail_end(x)) - sz;
1122 	uvm_physseg_unplug(uvm_physseg_get_end(x) - atop(sz), atop(sz));
1123 
1124 	/* Now find where the new avail_end is. */
1125 	avail_end = ctob(uvm_physseg_get_highest_frame());
1126 
1127 	if (sz == reqsz)
1128 		return;
1129 
1130 	reqsz -= sz;
1131 	if (msgbuf_p_cnt == VM_PHYSSEG_MAX) {
1132 		/* No more segments available, bail out. */
1133 		printf("WARNING: MSGBUFSIZE (%zu) too large, using %zu.\n",
1134 		    (size_t)MSGBUFSIZE, (size_t)(MSGBUFSIZE - reqsz));
1135 		return;
1136 	}
1137 
1138 	sz = reqsz;
1139 	goto search_again;
1140 }
1141 
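/*
 * x86_reset: try successively blunter ways of resetting the machine:
 * the ACPI reset register, the keyboard controller, the Reset Control
 * register at I/O port 0xcf9, and finally the Fast A20/Init register
 * at I/O port 0x92.
 */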
1142 void
1143 x86_reset(void)
1144 {
1145 	uint8_t b;
1146 
1147 #if NACPICA > 0
1148 	/*
1149 	 * If ACPI is active, try to reset using the reset register
1150 	 * defined in the FADT.
1151 	 */
1152 	if (acpi_active) {
1153 		if (acpi_reset() == 0) {
1154 			delay(500000); /* wait 0.5 sec to see if that did it */
1155 		}
1156 	}
1157 #endif
1158 
1159 	/*
1160 	 * The keyboard controller has 4 random output pins, one of which is
1161 	 * connected to the RESET pin on the CPU in many PCs.  We tell the
1162 	 * keyboard controller to pulse this line a couple of times.
1163 	 */
1164 	outb(IO_KBD + KBCMDP, KBC_PULSE0);
1165 	delay(100000);
1166 	outb(IO_KBD + KBCMDP, KBC_PULSE0);
1167 	delay(100000);
1168 
1169 	/*
1170 	 * Attempt to force a reset via the Reset Control register at
1171 	 * I/O port 0xcf9.  Bit 2 forces a system reset when it
1172 	 * transitions from 0 to 1.  Bit 1 selects the type of reset
1173 	 * to attempt: 0 selects a "soft" reset, and 1 selects a
1174 	 * "hard" reset.  We try a "hard" reset.  The first write sets
1175 	 * bit 1 to select a "hard" reset and clears bit 2.  The
1176 	 * second write forces a 0 -> 1 transition in bit 2 to trigger
1177 	 * a reset.
1178 	 */
1179 	outb(0xcf9, 0x2);
1180 	outb(0xcf9, 0x6);
1181 	DELAY(500000);	/* wait 0.5 sec to see if that did it */
1182 
1183 	/*
1184 	 * Attempt to force a reset via the Fast A20 and Init register
1185 	 * at I/O port 0x92. Bit 1 serves as an alternate A20 gate.
1186 	 * Bit 0 asserts INIT# when set to 1. We are careful to only
1187 	 * preserve bit 1 while setting bit 0. We also must clear bit
1188 	 * 0 before setting it if it isn't already clear.
1189 	 */
1190 	b = inb(0x92);
1191 	if (b != 0xff) {
1192 		if ((b & 0x1) != 0)
1193 			outb(0x92, b & 0xfe);
1194 		outb(0x92, b | 0x1);
1195 		DELAY(500000);	/* wait 0.5 sec to see if that did it */
1196 	}
1197 }
1198 
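/*
 * x86_listener_cb: kauth(9) machdep scope listener.  I/O permission
 * reads are always allowed; LDT get/set is allowed only while
 * machdep.user_ldt is enabled; everything else is deferred.
 */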
1199 static int
1200 x86_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
1201     void *arg0, void *arg1, void *arg2, void *arg3)
1202 {
1203 	int result;
1204 
1205 	result = KAUTH_RESULT_DEFER;
1206 
1207 	switch (action) {
1208 	case KAUTH_MACHDEP_IOPERM_GET:
1209 		result = KAUTH_RESULT_ALLOW;
1210 		break;
1211 
1212 	case KAUTH_MACHDEP_LDT_GET:
1213 	case KAUTH_MACHDEP_LDT_SET:
1214 		if (x86_user_ldt_enabled) {
1215 			result = KAUTH_RESULT_ALLOW;
1216 		}
1217 		break;
1218 
1219 	default:
1220 		break;
1221 	}
1222 
1223 	return result;
1224 }
1225 
1226 void
1227 machdep_init(void)
1228 {
1229 
1230 	x86_listener = kauth_listen_scope(KAUTH_SCOPE_MACHDEP,
1231 	    x86_listener_cb, NULL);
1232 }
1233 
1234 /*
1235  * x86_startup: x86 common startup routine
1236  *
1237  * called by cpu_startup.
1238  */
1239 
1240 void
1241 x86_startup(void)
1242 {
1243 #if !defined(XENPV)
1244 	nmi_init();
1245 #endif
1246 }
1247 
1248 const char *
1249 get_booted_kernel(void)
1250 {
1251 	const struct btinfo_bootpath *bibp = lookup_bootinfo(BTINFO_BOOTPATH);
1252 	return bibp ? bibp->bootpath : NULL;
1253 }
1254 
1255 /*
1256  * machine dependent system variables.
1257  */
1258 static int
1259 sysctl_machdep_booted_kernel(SYSCTLFN_ARGS)
1260 {
1261 	struct btinfo_bootpath *bibp;
1262 	struct sysctlnode node;
1263 
1264 	bibp = lookup_bootinfo(BTINFO_BOOTPATH);
1265 	if (!bibp)
1266 		return ENOENT; /* ??? */
1267 
1268 	node = *rnode;
1269 	node.sysctl_data = bibp->bootpath;
1270 	node.sysctl_size = sizeof(bibp->bootpath);
1271 	return sysctl_lookup(SYSCTLFN_CALL(&node));
1272 }
1273 
1274 static int
1275 sysctl_machdep_bootmethod(SYSCTLFN_ARGS)
1276 {
1277 	struct sysctlnode node;
1278 	char buf[5];
1279 
1280 	node = *rnode;
1281 	node.sysctl_data = buf;
1282 	if (bootmethod_efi)
1283 		memcpy(node.sysctl_data, "UEFI", 5);
1284 	else
1285 		memcpy(node.sysctl_data, "BIOS", 5);
1286 
1287 	return sysctl_lookup(SYSCTLFN_CALL(&node));
1288 }
1289 
1290 
1291 static int
1292 sysctl_machdep_diskinfo(SYSCTLFN_ARGS)
1293 {
1294 	struct sysctlnode node;
1295 	extern struct bi_devmatch *x86_alldisks;
1296 	extern int x86_ndisks;
1297 
1298 	if (x86_alldisks == NULL)
1299 		return EOPNOTSUPP;
1300 
1301 	node = *rnode;
1302 	node.sysctl_data = x86_alldisks;
1303 	node.sysctl_size = sizeof(struct disklist) +
1304 	    (x86_ndisks - 1) * sizeof(struct nativedisk_info);
1305 	return sysctl_lookup(SYSCTLFN_CALL(&node));
1306 }
1307 
1308 #ifndef XENPV
1309 static int
1310 sysctl_machdep_tsc_enable(SYSCTLFN_ARGS)
1311 {
1312 	struct sysctlnode node;
1313 	int error, val;
1314 
1315 	val = *(int *)rnode->sysctl_data;
1316 
1317 	node = *rnode;
1318 	node.sysctl_data = &val;
1319 
1320 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
1321 	if (error != 0 || newp == NULL)
1322 		return error;
1323 
1324 	if (val == 1) {
1325 		tsc_user_enable();
1326 	} else if (val == 0) {
1327 		tsc_user_disable();
1328 	} else {
1329 		error = EINVAL;
1330 	}
1331 	if (error)
1332 		return error;
1333 
1334 	*(int *)rnode->sysctl_data = val;
1335 
1336 	return 0;
1337 }
1338 #endif
1339 
1340 static const char * const vm_guest_name[VM_LAST] = {
1341 	[VM_GUEST_NO] =		"none",
1342 	[VM_GUEST_VM] =		"generic",
1343 	[VM_GUEST_XENPV] =	"XenPV",
1344 	[VM_GUEST_XENPVH] =	"XenPVH",
1345 	[VM_GUEST_XENHVM] =	"XenHVM",
1346 	[VM_GUEST_XENPVHVM] =	"XenPVHVM",
1347 	[VM_GUEST_HV] =		"Hyper-V",
1348 	[VM_GUEST_VMWARE] =	"VMware",
1349 	[VM_GUEST_KVM] =	"KVM",
1350 	[VM_GUEST_VIRTUALBOX] =	"VirtualBox",
1351 };
1352 
1353 static int
1354 sysctl_machdep_hypervisor(SYSCTLFN_ARGS)
1355 {
1356 	struct sysctlnode node;
1357 	const char *t = NULL;
1358 	char buf[64];
1359 
1360 	node = *rnode;
1361 	node.sysctl_data = buf;
1362 	if (vm_guest >= VM_GUEST_NO && vm_guest < VM_LAST)
1363 		t = vm_guest_name[vm_guest];
1364 	if (t == NULL)
1365 		t = "unknown";
1366 	strlcpy(buf, t, sizeof(buf));
1367 	return sysctl_lookup(SYSCTLFN_CALL(&node));
1368 }
1369 
1370 static void
1371 const_sysctl(struct sysctllog **clog, const char *name, int type,
1372     u_quad_t value, int tag)
1373 {
1374 	(sysctl_createv)(clog, 0, NULL, NULL,
1375 		       CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
1376 		       type, name, NULL, NULL, value, NULL, 0,
1377 		       CTL_MACHDEP, tag, CTL_EOL);
1378 }
1379 
1380 SYSCTL_SETUP(sysctl_machdep_setup, "sysctl machdep subtree setup")
1381 {
1382 	extern uint64_t tsc_freq;
1383 #ifndef XENPV
1384 	extern int tsc_user_enabled;
1385 #endif
1386 	extern int sparse_dump;
1387 
1388 	sysctl_createv(clog, 0, NULL, NULL,
1389 		       CTLFLAG_PERMANENT,
1390 		       CTLTYPE_NODE, "machdep", NULL,
1391 		       NULL, 0, NULL, 0,
1392 		       CTL_MACHDEP, CTL_EOL);
1393 
1394 	sysctl_createv(clog, 0, NULL, NULL,
1395 		       CTLFLAG_PERMANENT,
1396 		       CTLTYPE_STRUCT, "console_device", NULL,
1397 		       sysctl_consdev, 0, NULL, sizeof(dev_t),
1398 		       CTL_MACHDEP, CPU_CONSDEV, CTL_EOL);
1399 	sysctl_createv(clog, 0, NULL, NULL,
1400 		       CTLFLAG_PERMANENT,
1401 		       CTLTYPE_STRING, "booted_kernel", NULL,
1402 		       sysctl_machdep_booted_kernel, 0, NULL, 0,
1403 		       CTL_MACHDEP, CPU_BOOTED_KERNEL, CTL_EOL);
1404 	sysctl_createv(clog, 0, NULL, NULL,
1405 		       CTLFLAG_PERMANENT,
1406 		       CTLTYPE_STRING, "bootmethod", NULL,
1407 		       sysctl_machdep_bootmethod, 0, NULL, 0,
1408 		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
1409 	sysctl_createv(clog, 0, NULL, NULL,
1410 		       CTLFLAG_PERMANENT,
1411 		       CTLTYPE_STRUCT, "diskinfo", NULL,
1412 		       sysctl_machdep_diskinfo, 0, NULL, 0,
1413 		       CTL_MACHDEP, CPU_DISKINFO, CTL_EOL);
1414 	sysctl_createv(clog, 0, NULL, NULL,
1415 		       CTLFLAG_PERMANENT,
1416 		       CTLTYPE_STRING, "cpu_brand", NULL,
1417 		       NULL, 0, cpu_brand_string, 0,
1418 		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
1419 	sysctl_createv(clog, 0, NULL, NULL,
1420 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1421 		       CTLTYPE_INT, "sparse_dump", NULL,
1422 		       NULL, 0, &sparse_dump, 0,
1423 		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
1424 	sysctl_createv(clog, 0, NULL, NULL,
1425 		       CTLFLAG_PERMANENT,
1426 		       CTLTYPE_QUAD, "tsc_freq", NULL,
1427 		       NULL, 0, &tsc_freq, 0,
1428 		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
1429 	sysctl_createv(clog, 0, NULL, NULL,
1430 		       CTLFLAG_PERMANENT,
1431 		       CTLTYPE_INT, "pae",
1432 		       SYSCTL_DESCR("Whether the kernel uses PAE"),
1433 		       NULL, 0, &use_pae, 0,
1434 		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
1435 #ifndef XENPV
1436 	sysctl_createv(clog, 0, NULL, NULL,
1437 		       CTLFLAG_READWRITE,
1438 		       CTLTYPE_INT, "tsc_user_enable",
1439 		       SYSCTL_DESCR("RDTSC instruction enabled in usermode"),
1440 		       sysctl_machdep_tsc_enable, 0, &tsc_user_enabled, 0,
1441 		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
1442 #endif
1443 	sysctl_createv(clog, 0, NULL, NULL,
1444 		       CTLFLAG_PERMANENT,
1445 		       CTLTYPE_STRING, "hypervisor", NULL,
1446 		       sysctl_machdep_hypervisor, 0, NULL, 0,
1447 		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
1448 #ifdef SVS
1449 	const struct sysctlnode *svs_rnode = NULL;
1450 	sysctl_createv(clog, 0, NULL, &svs_rnode,
1451 		       CTLFLAG_PERMANENT,
1452 		       CTLTYPE_NODE, "svs", NULL,
1453 		       NULL, 0, NULL, 0,
1454 		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
1455 	sysctl_createv(clog, 0, &svs_rnode, NULL,
1456 		       CTLFLAG_PERMANENT,
1457 		       CTLTYPE_BOOL, "enabled",
1458 		       SYSCTL_DESCR("Whether the kernel uses SVS"),
1459 		       NULL, 0, &svs_enabled, 0,
1460 		       CTL_CREATE, CTL_EOL);
1461 	sysctl_createv(clog, 0, &svs_rnode, NULL,
1462 		       CTLFLAG_PERMANENT,
1463 		       CTLTYPE_BOOL, "pcid",
1464 		       SYSCTL_DESCR("Whether SVS uses PCID"),
1465 		       NULL, 0, &svs_pcid, 0,
1466 		       CTL_CREATE, CTL_EOL);
1467 #endif
1468 
1469 	sysctl_createv(clog, 0, NULL, NULL,
1470 		       CTLFLAG_READWRITE,
1471 		       CTLTYPE_BOOL, "user_ldt",
1472 		       SYSCTL_DESCR("Whether USER_LDT is enabled"),
1473 		       NULL, 0, &x86_user_ldt_enabled, 0,
1474 		       CTL_MACHDEP, CTL_CREATE, CTL_EOL);
1475 
1476 #ifndef XENPV
1477 	void sysctl_speculation_init(struct sysctllog **);
1478 	sysctl_speculation_init(clog);
1479 #endif
1480 
1481 	/* None of these can ever change once the system has booted */
1482 	const_sysctl(clog, "fpu_present", CTLTYPE_INT, i386_fpu_present,
1483 	    CPU_FPU_PRESENT);
1484 	const_sysctl(clog, "osfxsr", CTLTYPE_INT, i386_use_fxsave,
1485 	    CPU_OSFXSR);
1486 	const_sysctl(clog, "sse", CTLTYPE_INT, i386_has_sse,
1487 	    CPU_SSE);
1488 	const_sysctl(clog, "sse2", CTLTYPE_INT, i386_has_sse2,
1489 	    CPU_SSE2);
1490 
1491 	const_sysctl(clog, "fpu_save", CTLTYPE_INT, x86_fpu_save,
1492 	    CPU_FPU_SAVE);
1493 	const_sysctl(clog, "fpu_save_size", CTLTYPE_INT, x86_fpu_save_size,
1494 	    CPU_FPU_SAVE_SIZE);
1495 	const_sysctl(clog, "xsave_features", CTLTYPE_QUAD, x86_xsave_features,
1496 	    CPU_XSAVE_FEATURES);
1497 
1498 #ifndef XENPV
1499 	const_sysctl(clog, "biosbasemem", CTLTYPE_INT, biosbasemem,
1500 	    CPU_BIOSBASEMEM);
1501 	const_sysctl(clog, "biosextmem", CTLTYPE_INT, biosextmem,
1502 	    CPU_BIOSEXTMEM);
1503 #endif
1504 }
1505 
1506 /* Here for want of a better place */
1507 #if defined(DOM0OPS) || !defined(XENPV)
1508 struct pic *
1509 intr_findpic(int num)
1510 {
1511 #if NIOAPIC > 0
1512 	struct ioapic_softc *pic;
1513 
1514 	pic = ioapic_find_bybase(num);
1515 	if (pic != NULL)
1516 		return &pic->sc_pic;
1517 #endif
1518 	if (num < NUM_LEGACY_IRQS)
1519 		return &i8259_pic;
1520 
1521 	return NULL;
1522 }
1523 #endif
1524 
1525 void
1526 cpu_initclocks(void)
1527 {
1528 
1529 	/*
1530 	 * Re-calibrate TSC on boot CPU using most accurate time source,
1531 	 * thus making accurate TSC available for x86_initclock_func().
1532 	 */
1533 	cpu_get_tsc_freq(curcpu());
1534 
1535 	/* Now start the clocks on this CPU (the boot CPU). */
1536 	(*x86_initclock_func)();
1537 }
1538 
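/*
 * x86_cpu_is_lcall: check whether the instruction at ip is one of the
 * historic "lcall $7,$0" (NetBSD) or "lcall $0x87,$0" (BSD/OS) syscall
 * gates.  Returns 0 on a match, EINVAL otherwise, or a copyin error.
 */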
1539 int
1540 x86_cpu_is_lcall(const void *ip)
1541 {
1542 	static const uint8_t lcall[] = { 0x9a, 0, 0, 0, 0 };
1543 	int error;
1544 	const size_t sz = sizeof(lcall) + 2;
1545 	uint8_t tmp[sizeof(lcall) + 2];
1546 
1547 	if ((error = copyin(ip, tmp, sz)) != 0)
1548 		return error;
1549 
1550 	if (memcmp(tmp, lcall, sizeof(lcall)) != 0 || tmp[sz - 1] != 0)
1551 		return EINVAL;
1552 
1553 	switch (tmp[sz - 2]) {
1554 	case (uint8_t)0x07: /* NetBSD */
1555 	case (uint8_t)0x87: /* BSD/OS */
1556 		return 0;
1557 	default:
1558 		return EINVAL;
1559 	}
1560 }
1561