xref: /freebsd/sys/amd64/amd64/machdep.c (revision 1d6230b0)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 2003 Peter Wemm.
5  * Copyright (c) 1992 Terrence R. Lambert.
6  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
7  * All rights reserved.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * William Jolitz.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. All advertising materials mentioning features or use of this software
21  *    must display the following acknowledgement:
22  *	This product includes software developed by the University of
23  *	California, Berkeley and its contributors.
24  * 4. Neither the name of the University nor the names of its contributors
25  *    may be used to endorse or promote products derived from this software
26  *    without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  */
40 
41 #include <sys/cdefs.h>
42 #include "opt_atpic.h"
43 #include "opt_cpu.h"
44 #include "opt_ddb.h"
45 #include "opt_inet.h"
46 #include "opt_isa.h"
47 #include "opt_kstack_pages.h"
48 #include "opt_maxmem.h"
49 #include "opt_pci.h"
50 #include "opt_platform.h"
51 #include "opt_sched.h"
52 
53 #include <sys/param.h>
54 #include <sys/proc.h>
55 #include <sys/systm.h>
56 #include <sys/asan.h>
57 #include <sys/bio.h>
58 #include <sys/buf.h>
59 #include <sys/bus.h>
60 #include <sys/callout.h>
61 #include <sys/cons.h>
62 #include <sys/cpu.h>
63 #include <sys/csan.h>
64 #include <sys/efi.h>
65 #include <sys/eventhandler.h>
66 #include <sys/exec.h>
67 #include <sys/imgact.h>
68 #include <sys/kdb.h>
69 #include <sys/kernel.h>
70 #include <sys/ktr.h>
71 #include <sys/linker.h>
72 #include <sys/lock.h>
73 #include <sys/malloc.h>
74 #include <sys/memrange.h>
75 #include <sys/msan.h>
76 #include <sys/msgbuf.h>
77 #include <sys/mutex.h>
78 #include <sys/pcpu.h>
79 #include <sys/ptrace.h>
80 #include <sys/reboot.h>
81 #include <sys/reg.h>
82 #include <sys/rwlock.h>
83 #include <sys/sched.h>
84 #include <sys/signalvar.h>
85 #ifdef SMP
86 #include <sys/smp.h>
87 #endif
88 #include <sys/syscallsubr.h>
89 #include <sys/sysctl.h>
90 #include <sys/sysent.h>
91 #include <sys/sysproto.h>
92 #include <sys/ucontext.h>
93 #include <sys/vmmeter.h>
94 
95 #include <vm/vm.h>
96 #include <vm/vm_param.h>
97 #include <vm/vm_extern.h>
98 #include <vm/vm_kern.h>
99 #include <vm/vm_page.h>
100 #include <vm/vm_map.h>
101 #include <vm/vm_object.h>
102 #include <vm/vm_pager.h>
103 #include <vm/vm_phys.h>
104 #include <vm/vm_dumpset.h>
105 
106 #ifdef DDB
107 #ifndef KDB
108 #error KDB must be enabled in order for DDB to work!
109 #endif
110 #include <ddb/ddb.h>
111 #include <ddb/db_sym.h>
112 #endif
113 
114 #include <net/netisr.h>
115 
116 #include <dev/smbios/smbios.h>
117 
118 #include <machine/clock.h>
119 #include <machine/cpu.h>
120 #include <machine/cputypes.h>
121 #include <machine/frame.h>
122 #include <machine/intr_machdep.h>
123 #include <x86/mca.h>
124 #include <machine/md_var.h>
125 #include <machine/metadata.h>
126 #include <machine/pc/bios.h>
127 #include <machine/pcb.h>
128 #include <machine/proc.h>
129 #include <machine/sigframe.h>
130 #include <machine/specialreg.h>
131 #include <machine/trap.h>
132 #include <machine/tss.h>
133 #include <x86/ucode.h>
134 #include <x86/ifunc.h>
135 #ifdef SMP
136 #include <machine/smp.h>
137 #endif
138 #ifdef FDT
139 #include <x86/fdt.h>
140 #endif
141 
142 #ifdef DEV_ATPIC
143 #include <x86/isa/icu.h>
144 #else
145 #include <x86/apicvar.h>
146 #endif
147 
148 #include <isa/isareg.h>
149 #include <isa/rtc.h>
150 #include <x86/init.h>
151 
152 /* Sanity check for __curthread() */
153 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
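/*
 * A rough sketch of why this must hold: __curthread() is a single read
 * through %gs at offset 0 (simplified from <machine/pcpu.h>):
 *
 *	static __inline struct thread *
 *	__curthread(void)
 *	{
 *		struct thread *td;
 *
 *		__asm("movq %%gs:0,%0" : "=r" (td));
 *		return (td);
 *	}
 *
 * so pc_curthread has to stay at offset 0 within struct pcpu.
 */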
154 
155 /*
156  * The PTI trampoline stack needs enough space for a hardware trapframe and a
157  * couple of scratch registers, as well as the trapframe left behind after an
158  * iret fault.
159  */
160 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
161     offsetof(struct pti_frame, pti_rip));
162 
163 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
164 
165 static void cpu_startup(void *);
166 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
167 
168 /* Probe 8254 PIT and TSC. */
169 static void native_clock_source_init(void);
170 
171 /* Preload data parse function */
172 static caddr_t native_parse_preload_data(u_int64_t);
173 
174 /* Native function to fetch and parse the e820 map */
175 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
176 
177 /* Default init_ops implementation. */
178 struct init_ops init_ops = {
179 	.parse_preload_data =		native_parse_preload_data,
180 	.early_clock_source_init =	native_clock_source_init,
181 	.early_delay =			i8254_delay,
182 	.parse_memmap =			native_parse_memmap,
183 };
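/*
 * Alternative boot environments are expected to install their own hooks
 * here before hammer_time() runs; Xen PVH, for example, supplies its own
 * parse_preload_data and parse_memmap implementations.
 */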
184 
185 /*
186  * Physical address of the EFI System Table. Stashed from the metadata hints
187  * passed into the kernel and used by the EFI code to call runtime services.
188  */
189 vm_paddr_t efi_systbl_phys;
190 
191 /* Intel ICH registers */
192 #define ICH_PMBASE	0x400
193 #define ICH_SMI_EN	(ICH_PMBASE + 0x30)
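/* Bit 3 of SMI_EN is LEGACY_USB_EN; cpu_startup() clears it on MacBooks. */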
194 
195 int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
196 
197 int cold = 1;
198 
199 long Maxmem = 0;
200 long realmem = 0;
201 int late_console = 1;
202 
203 struct kva_md_info kmi;
204 
205 struct region_descriptor r_idt;
206 
207 struct pcpu *__pcpu;
208 struct pcpu temp_bsp_pcpu;
209 
210 struct mtx icu_lock;
211 
212 struct mem_range_softc mem_range_softc;
213 
214 struct mtx dt_lock;	/* lock for GDT and LDT */
215 
216 void (*vmm_resume_p)(void);
217 
218 bool efi_boot;
219 
220 static void
221 cpu_startup(void *dummy)
222 {
223 	uintmax_t memsize;
224 	char *sysenv;
225 
226 	/*
227 	 * On MacBooks, we need to disallow the legacy USB circuit to
228 	 * generate an SMI# because this can cause several problems,
229 	 * namely: incorrect CPU frequency detection and failure to
230 	 * start the APs.
231 	 * We do this by disabling a bit in the SMI_EN (SMI Control and
232 	 * Enable register) of the Intel ICH LPC Interface Bridge.
233 	 */
234 	sysenv = kern_getenv("smbios.system.product");
235 	if (sysenv != NULL) {
236 		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
237 		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
238 		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
239 		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
240 		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
241 		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
242 		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
243 		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
244 			if (bootverbose)
245 				printf("Disabling LEGACY_USB_EN bit on "
246 				    "Intel ICH.\n");
247 			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
248 		}
249 		freeenv(sysenv);
250 	}
251 
252 	/*
253 	 * Good {morning,afternoon,evening,night}.
254 	 */
255 	startrtclock();
256 	printcpuinfo();
257 
258 	/*
259 	 * Display physical memory if SMBIOS reports reasonable amount.
260 	 */
261 	memsize = 0;
262 	sysenv = kern_getenv("smbios.memory.enabled");
263 	if (sysenv != NULL) {
264 		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
265 		freeenv(sysenv);
266 	}
267 	if (memsize < ptoa((uintmax_t)vm_free_count()))
268 		memsize = ptoa((uintmax_t)Maxmem);
269 	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
270 	realmem = atop(memsize);
271 
272 	/*
273 	 * Display any holes after the first chunk of extended memory.
274 	 */
275 	if (bootverbose) {
276 		int indx;
277 
278 		printf("Physical memory chunk(s):\n");
279 		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
280 			vm_paddr_t size;
281 
282 			size = phys_avail[indx + 1] - phys_avail[indx];
283 			printf(
284 			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
285 			    (uintmax_t)phys_avail[indx],
286 			    (uintmax_t)phys_avail[indx + 1] - 1,
287 			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
288 		}
289 	}
290 
291 	vm_ksubmap_init(&kmi);
292 
293 	printf("avail memory = %ju (%ju MB)\n",
294 	    ptoa((uintmax_t)vm_free_count()),
295 	    ptoa((uintmax_t)vm_free_count()) / 1048576);
296 #ifdef DEV_PCI
297 	if (bootverbose && intel_graphics_stolen_base != 0)
298 		printf("intel stolen mem: base %#jx size %ju MB\n",
299 		    (uintmax_t)intel_graphics_stolen_base,
300 		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
301 #endif
302 
303 	/*
304 	 * Set up buffers, so they can be used to read disk labels.
305 	 */
306 	bufinit();
307 	vm_pager_bufferinit();
308 
309 	cpu_setregs();
310 }
311 
312 static void
313 late_ifunc_resolve(void *dummy __unused)
314 {
315 	link_elf_late_ireloc();
316 }
317 SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);
318 
319 
320 void
321 cpu_setregs(void)
322 {
323 	register_t cr0;
324 
325 	TSENTER();
326 	cr0 = rcr0();
327 	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
328 	TSENTER2("load_cr0");
329 	load_cr0(cr0);
330 	TSEXIT2("load_cr0");
331 	TSEXIT();
332 }
333 
334 /*
335  * Initialize amd64 and configure to run kernel
336  */
337 
338 /*
339  * Initialize segments & interrupt table
340  */
341 static struct gate_descriptor idt0[NIDT];
342 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
343 
344 static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
345 static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
346 static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
347 static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
348 CTASSERT(sizeof(struct nmi_pcpu) == 16);
349 
350 /*
351  * Software prototypes -- in more palatable form.
352  *
353  * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
354  * slots as corresponding segments for i386 kernel.
355  */
356 struct soft_segment_descriptor gdt_segs[] = {
357 [GNULL_SEL] = { /* 0 Null Descriptor */
358 	.ssd_base = 0x0,
359 	.ssd_limit = 0x0,
360 	.ssd_type = 0,
361 	.ssd_dpl = 0,
362 	.ssd_p = 0,
363 	.ssd_long = 0,
364 	.ssd_def32 = 0,
365 	.ssd_gran = 0		},
366 [GNULL2_SEL] = { /*	1 Null Descriptor */
367 	.ssd_base = 0x0,
368 	.ssd_limit = 0x0,
369 	.ssd_type = 0,
370 	.ssd_dpl = 0,
371 	.ssd_p = 0,
372 	.ssd_long = 0,
373 	.ssd_def32 = 0,
374 	.ssd_gran = 0		},
375 [GUFS32_SEL] = { /* 2 32 bit %fs Descriptor for user */
376 	.ssd_base = 0x0,
377 	.ssd_limit = 0xfffff,
378 	.ssd_type = SDT_MEMRWA,
379 	.ssd_dpl = SEL_UPL,
380 	.ssd_p = 1,
381 	.ssd_long = 0,
382 	.ssd_def32 = 1,
383 	.ssd_gran = 1		},
384 [GUGS32_SEL] = { /* 3 32 bit %gs Descriptor for user */
385 	.ssd_base = 0x0,
386 	.ssd_limit = 0xfffff,
387 	.ssd_type = SDT_MEMRWA,
388 	.ssd_dpl = SEL_UPL,
389 	.ssd_p = 1,
390 	.ssd_long = 0,
391 	.ssd_def32 = 1,
392 	.ssd_gran = 1		},
393 [GCODE_SEL] = { /* 4 Code Descriptor for kernel */
394 	.ssd_base = 0x0,
395 	.ssd_limit = 0xfffff,
396 	.ssd_type = SDT_MEMERA,
397 	.ssd_dpl = SEL_KPL,
398 	.ssd_p = 1,
399 	.ssd_long = 1,
400 	.ssd_def32 = 0,
401 	.ssd_gran = 1		},
402 [GDATA_SEL] = { /* 5 Data Descriptor for kernel */
403 	.ssd_base = 0x0,
404 	.ssd_limit = 0xfffff,
405 	.ssd_type = SDT_MEMRWA,
406 	.ssd_dpl = SEL_KPL,
407 	.ssd_p = 1,
408 	.ssd_long = 1,
409 	.ssd_def32 = 0,
410 	.ssd_gran = 1		},
411 [GUCODE32_SEL] = { /* 6 32 bit Code Descriptor for user */
412 	.ssd_base = 0x0,
413 	.ssd_limit = 0xfffff,
414 	.ssd_type = SDT_MEMERA,
415 	.ssd_dpl = SEL_UPL,
416 	.ssd_p = 1,
417 	.ssd_long = 0,
418 	.ssd_def32 = 1,
419 	.ssd_gran = 1		},
420 [GUDATA_SEL] = { /* 7 32/64 bit Data Descriptor for user */
421 	.ssd_base = 0x0,
422 	.ssd_limit = 0xfffff,
423 	.ssd_type = SDT_MEMRWA,
424 	.ssd_dpl = SEL_UPL,
425 	.ssd_p = 1,
426 	.ssd_long = 0,
427 	.ssd_def32 = 1,
428 	.ssd_gran = 1		},
429 [GUCODE_SEL] = { /* 8 64 bit Code Descriptor for user */
430 	.ssd_base = 0x0,
431 	.ssd_limit = 0xfffff,
432 	.ssd_type = SDT_MEMERA,
433 	.ssd_dpl = SEL_UPL,
434 	.ssd_p = 1,
435 	.ssd_long = 1,
436 	.ssd_def32 = 0,
437 	.ssd_gran = 1		},
438 [GPROC0_SEL] = { /* 9 Proc 0 TSS Descriptor */
439 	.ssd_base = 0x0,
440 	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
441 	.ssd_type = SDT_SYSTSS,
442 	.ssd_dpl = SEL_KPL,
443 	.ssd_p = 1,
444 	.ssd_long = 0,
445 	.ssd_def32 = 0,
446 	.ssd_gran = 0		},
447 [GPROC0_SEL + 1] = { /* 10 Proc 0 TSS descriptor, double size */
448 	.ssd_base = 0x0,
449 	.ssd_limit = 0x0,
450 	.ssd_type = 0,
451 	.ssd_dpl = 0,
452 	.ssd_p = 0,
453 	.ssd_long = 0,
454 	.ssd_def32 = 0,
455 	.ssd_gran = 0		},
456 [GUSERLDT_SEL] = { /* 11 LDT Descriptor */
457 	.ssd_base = 0x0,
458 	.ssd_limit = 0x0,
459 	.ssd_type = 0,
460 	.ssd_dpl = 0,
461 	.ssd_p = 0,
462 	.ssd_long = 0,
463 	.ssd_def32 = 0,
464 	.ssd_gran = 0		},
465 [GUSERLDT_SEL + 1] = { /* 12 LDT Descriptor, double size */
466 	.ssd_base = 0x0,
467 	.ssd_limit = 0x0,
468 	.ssd_type = 0,
469 	.ssd_dpl = 0,
470 	.ssd_p = 0,
471 	.ssd_long = 0,
472 	.ssd_def32 = 0,
473 	.ssd_gran = 0		},
474 };
475 _Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");
476 
477 void
478 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
479 {
480 	struct gate_descriptor *ip;
481 
482 	ip = idt + idx;
483 	ip->gd_looffset = (uintptr_t)func;
484 	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
485 	ip->gd_ist = ist;
486 	ip->gd_xx = 0;
487 	ip->gd_type = typ;
488 	ip->gd_dpl = dpl;
489 	ip->gd_p = 1;
490 	ip->gd_hioffset = ((uintptr_t)func) >> 16;
491 }
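/*
 * Typical usage, as in hammer_time() below: install the page-fault
 * handler as a kernel interrupt gate with no IST stack switch:
 *
 *	setidt(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0);
 */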
492 
493 extern inthand_t
494 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
495 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
496 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
497 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
498 	IDTVEC(xmm), IDTVEC(dblfault),
499 	IDTVEC(div_pti), IDTVEC(bpt_pti),
500 	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
501 	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
502 	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
503 	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
504 	IDTVEC(xmm_pti),
505 #ifdef KDTRACE_HOOKS
506 	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
507 #endif
508 #ifdef XENHVM
509 	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
510 #endif
511 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
512 	IDTVEC(fast_syscall_pti);
513 
514 #ifdef DDB
515 /*
516  * Display the index and function name of any IDT entries that don't use
517  * the default 'rsvd' entry point.
518  */
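/* Invoked from the ddb(4) prompt as "show idt". */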
519 DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE)
520 {
521 	struct gate_descriptor *ip;
522 	int idx;
523 	uintptr_t func;
524 
525 	ip = idt;
526 	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
527 		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
528 		if (func != (uintptr_t)&IDTVEC(rsvd)) {
529 			db_printf("%3d\t", idx);
530 			db_printsym(func, DB_STGY_PROC);
531 			db_printf("\n");
532 		}
533 		ip++;
534 	}
535 }
536 
537 /* Show privileged registers. */
538 DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE)
539 {
540 	struct {
541 		uint16_t limit;
542 		uint64_t base;
543 	} __packed idtr, gdtr;
544 	uint16_t ldt, tr;
545 
546 	__asm __volatile("sidt %0" : "=m" (idtr));
547 	db_printf("idtr\t0x%016lx/%04x\n",
548 	    (u_long)idtr.base, (u_int)idtr.limit);
549 	__asm __volatile("sgdt %0" : "=m" (gdtr));
550 	db_printf("gdtr\t0x%016lx/%04x\n",
551 	    (u_long)gdtr.base, (u_int)gdtr.limit);
552 	__asm __volatile("sldt %0" : "=r" (ldt));
553 	db_printf("ldtr\t0x%04x\n", ldt);
554 	__asm __volatile("str %0" : "=r" (tr));
555 	db_printf("tr\t0x%04x\n", tr);
556 	db_printf("cr0\t0x%016lx\n", rcr0());
557 	db_printf("cr2\t0x%016lx\n", rcr2());
558 	db_printf("cr3\t0x%016lx\n", rcr3());
559 	db_printf("cr4\t0x%016lx\n", rcr4());
560 	if (rcr4() & CR4_XSAVE)
561 		db_printf("xcr0\t0x%016lx\n", rxcr(0));
562 	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
563 	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
564 		db_printf("FEATURES_CTL\t%016lx\n",
565 		    rdmsr(MSR_IA32_FEATURE_CONTROL));
566 	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
567 	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
568 	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
569 }
570 
571 DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE)
572 {
573 
574 	db_printf("dr0\t0x%016lx\n", rdr0());
575 	db_printf("dr1\t0x%016lx\n", rdr1());
576 	db_printf("dr2\t0x%016lx\n", rdr2());
577 	db_printf("dr3\t0x%016lx\n", rdr3());
578 	db_printf("dr6\t0x%016lx\n", rdr6());
579 	db_printf("dr7\t0x%016lx\n", rdr7());
580 }
581 #endif
582 
583 void
584 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
585 {
586 
587 	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
588 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
589 	ssd->ssd_type  = sd->sd_type;
590 	ssd->ssd_dpl   = sd->sd_dpl;
591 	ssd->ssd_p     = sd->sd_p;
592 	ssd->ssd_long  = sd->sd_long;
593 	ssd->ssd_def32 = sd->sd_def32;
594 	ssd->ssd_gran  = sd->sd_gran;
595 }
596 
597 void
598 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
599 {
600 
601 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
602 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
603 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
604 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
605 	sd->sd_type  = ssd->ssd_type;
606 	sd->sd_dpl   = ssd->ssd_dpl;
607 	sd->sd_p     = ssd->ssd_p;
608 	sd->sd_long  = ssd->ssd_long;
609 	sd->sd_def32 = ssd->ssd_def32;
610 	sd->sd_gran  = ssd->ssd_gran;
611 }
612 
613 void
614 ssdtosyssd(struct soft_segment_descriptor *ssd, struct system_segment_descriptor *sd)
615 {
616 
617 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
618 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
619 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
620 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
621 	sd->sd_type  = ssd->ssd_type;
622 	sd->sd_dpl   = ssd->ssd_dpl;
623 	sd->sd_p     = ssd->ssd_p;
624 	sd->sd_gran  = ssd->ssd_gran;
625 }
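/*
 * Long-mode system segment descriptors (TSS, LDT) are 16 bytes wide,
 * which is why GPROC0_SEL and GUSERLDT_SEL each claim a second,
 * "double size" slot in gdt_segs[] above.
 */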
626 
627 u_int basemem;
628 
629 static int
630 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
631     int *physmap_idxp)
632 {
633 	int i, insert_idx, physmap_idx;
634 
635 	physmap_idx = *physmap_idxp;
636 
637 	if (length == 0)
638 		return (1);
639 
640 	/*
641 	 * Find insertion point while checking for overlap.  Start off by
642 	 * assuming the new entry will be added to the end.
643 	 *
644 	 * NB: physmap_idx points to the next free slot.
645 	 */
646 	insert_idx = physmap_idx;
647 	for (i = 0; i <= physmap_idx; i += 2) {
648 		if (base < physmap[i + 1]) {
649 			if (base + length <= physmap[i]) {
650 				insert_idx = i;
651 				break;
652 			}
653 			if (boothowto & RB_VERBOSE)
654 				printf(
655 		    "Overlapping memory regions, ignoring second region\n");
656 			return (1);
657 		}
658 	}
659 
660 	/* See if we can prepend to the next entry. */
661 	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
662 		physmap[insert_idx] = base;
663 		return (1);
664 	}
665 
666 	/* See if we can append to the previous entry. */
667 	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
668 		physmap[insert_idx - 1] += length;
669 		return (1);
670 	}
671 
672 	physmap_idx += 2;
673 	*physmap_idxp = physmap_idx;
674 	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
675 		printf(
676 		"Too many segments in the physical address map, giving up\n");
677 		return (0);
678 	}
679 
680 	/*
681 	 * Move the last 'N' entries down to make room for the new
682 	 * entry if needed.
683 	 */
684 	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
685 		physmap[i] = physmap[i - 2];
686 		physmap[i + 1] = physmap[i - 1];
687 	}
688 
689 	/* Insert the new entry. */
690 	physmap[insert_idx] = base;
691 	physmap[insert_idx + 1] = base + length;
692 	return (1);
693 }
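/*
 * physmap[] holds (start, end) pairs in bytes, with *physmap_idxp naming
 * the next free slot.  For a hypothetical machine with 639 kB of base
 * memory and 1 GB starting at 1 MB, the array would read:
 *
 *	physmap[0] = 0x0        physmap[1] = 0x9fc00
 *	physmap[2] = 0x100000   physmap[3] = 0x40100000
 */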
694 
695 void
696 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
697                       vm_paddr_t *physmap, int *physmap_idx)
698 {
699 	struct bios_smap *smap, *smapend;
700 
701 	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
702 
703 	for (smap = smapbase; smap < smapend; smap++) {
704 		if (boothowto & RB_VERBOSE)
705 			printf("SMAP type=%02x base=%016lx len=%016lx\n",
706 			    smap->type, smap->base, smap->length);
707 
708 		if (smap->type != SMAP_TYPE_MEMORY)
709 			continue;
710 
711 		if (!add_physmap_entry(smap->base, smap->length, physmap,
712 		    physmap_idx))
713 			break;
714 	}
715 }
716 
717 static void
718 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
719     int *physmap_idx)
720 {
721 	struct efi_md *map, *p;
722 	const char *type;
723 	size_t efisz;
724 	int ndesc, i;
725 
726 	static const char *types[] = {
727 		"Reserved",
728 		"LoaderCode",
729 		"LoaderData",
730 		"BootServicesCode",
731 		"BootServicesData",
732 		"RuntimeServicesCode",
733 		"RuntimeServicesData",
734 		"ConventionalMemory",
735 		"UnusableMemory",
736 		"ACPIReclaimMemory",
737 		"ACPIMemoryNVS",
738 		"MemoryMappedIO",
739 		"MemoryMappedIOPortSpace",
740 		"PalCode",
741 		"PersistentMemory"
742 	};
743 
744 	/*
745 	 * Memory map data provided by UEFI via the GetMemoryMap
746 	 * Boot Services API.
747 	 */
748 	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
749 	map = (struct efi_md *)((uint8_t *)efihdr + efisz);
750 
751 	if (efihdr->descriptor_size == 0)
752 		return;
753 	ndesc = efihdr->memory_size / efihdr->descriptor_size;
754 
755 	if (boothowto & RB_VERBOSE)
756 		printf("%23s %12s %12s %8s %4s\n",
757 		    "Type", "Physical", "Virtual", "#Pages", "Attr");
758 
759 	for (i = 0, p = map; i < ndesc; i++,
760 	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
761 		if (boothowto & RB_VERBOSE) {
762 			if (p->md_type < nitems(types))
763 				type = types[p->md_type];
764 			else
765 				type = "<INVALID>";
766 			printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
767 			    p->md_virt, p->md_pages);
768 			if (p->md_attr & EFI_MD_ATTR_UC)
769 				printf("UC ");
770 			if (p->md_attr & EFI_MD_ATTR_WC)
771 				printf("WC ");
772 			if (p->md_attr & EFI_MD_ATTR_WT)
773 				printf("WT ");
774 			if (p->md_attr & EFI_MD_ATTR_WB)
775 				printf("WB ");
776 			if (p->md_attr & EFI_MD_ATTR_UCE)
777 				printf("UCE ");
778 			if (p->md_attr & EFI_MD_ATTR_WP)
779 				printf("WP ");
780 			if (p->md_attr & EFI_MD_ATTR_RP)
781 				printf("RP ");
782 			if (p->md_attr & EFI_MD_ATTR_XP)
783 				printf("XP ");
784 			if (p->md_attr & EFI_MD_ATTR_NV)
785 				printf("NV ");
786 			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
787 				printf("MORE_RELIABLE ");
788 			if (p->md_attr & EFI_MD_ATTR_RO)
789 				printf("RO ");
790 			if (p->md_attr & EFI_MD_ATTR_RT)
791 				printf("RUNTIME");
792 			printf("\n");
793 		}
794 
795 		switch (p->md_type) {
796 		case EFI_MD_TYPE_CODE:
797 		case EFI_MD_TYPE_DATA:
798 		case EFI_MD_TYPE_BS_CODE:
799 		case EFI_MD_TYPE_BS_DATA:
800 		case EFI_MD_TYPE_FREE:
801 			/*
802 			 * We're allowed to use any entry with these types.
803 			 */
804 			break;
805 		default:
806 			continue;
807 		}
808 
809 		if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE,
810 		    physmap, physmap_idx))
811 			break;
812 	}
813 }
814 
815 static void
816 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
817 {
818 	struct bios_smap *smap;
819 	struct efi_map_header *efihdr;
820 	u_int32_t size;
821 
822 	/*
823 	 * Memory map from INT 15:E820.
824 	 *
825 	 * subr_module.c says:
826 	 * "Consumer may safely assume that size value precedes data."
827 	 * i.e., a u_int32_t immediately precedes smap.
828 	 */
829 
830 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
831 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
832 	smap = (struct bios_smap *)preload_search_info(kmdp,
833 	    MODINFO_METADATA | MODINFOMD_SMAP);
834 	if (efihdr == NULL && smap == NULL)
835 		panic("No BIOS smap or EFI map info from loader!");
836 
837 	if (efihdr != NULL) {
838 		add_efi_map_entries(efihdr, physmap, physmap_idx);
839 		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
840 	} else {
841 		size = *((u_int32_t *)smap - 1);
842 		bios_add_smap_entries(smap, size, physmap, physmap_idx);
843 		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
844 	}
845 }
846 
847 #define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)
848 
849 /*
850  * Populate the (physmap) array with base/bound pairs describing the
851  * available physical memory in the system, then test this memory and
852  * build the phys_avail array describing the actually-available memory.
853  *
854  * Total memory size may be set by the kernel environment variable
855  * hw.physmem or the compile-time define MAXMEM.
856  *
857  * XXX first should be vm_paddr_t.
858  */
859 static void
860 getmemsize(caddr_t kmdp, u_int64_t first)
861 {
862 	int i, physmap_idx, pa_indx, da_indx;
863 	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
864 	u_long physmem_start, physmem_tunable, memtest;
865 	pt_entry_t *pte;
866 	quad_t dcons_addr, dcons_size;
867 	int page_counter;
868 
869 	TSENTER();
870 	/*
871 	 * Tell the physical memory allocator about pages used to store
872 	 * the kernel and preloaded data.  See kmem_bootstrap_free().
873 	 */
874 	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));
875 
876 	bzero(physmap, sizeof(physmap));
877 	physmap_idx = 0;
878 
879 	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
880 	physmap_idx -= 2;
881 
882 	/*
883 	 * Find the 'base memory' segment for SMP
884 	 */
885 	basemem = 0;
886 	for (i = 0; i <= physmap_idx; i += 2) {
887 		if (physmap[i] <= 0xA0000) {
888 			basemem = physmap[i + 1] / 1024;
889 			break;
890 		}
891 	}
892 	if (basemem == 0 || basemem > 640) {
893 		if (bootverbose)
894 			printf(
895 		"Memory map doesn't contain a basemem segment, faking it");
896 		basemem = 640;
897 	}
898 
899 	/*
900 	 * Maxmem isn't the "maximum memory", it's one larger than the
901 	 * highest page of the physical address space.  It should be
902 	 * called something like "Maxphyspage".  We may adjust this
903 	 * based on ``hw.physmem'' and the results of the memory test.
904 	 */
905 	Maxmem = atop(physmap[physmap_idx + 1]);
906 
907 #ifdef MAXMEM
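	/* The MAXMEM option is given in kilobytes; convert to 4 kB pages. */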
908 	Maxmem = MAXMEM / 4;
909 #endif
910 
911 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
912 		Maxmem = atop(physmem_tunable);
913 
914 	/*
915 	 * The boot memory test is disabled by default, as it takes a
916 	 * significant amount of time on large-memory systems, and is
917 	 * unfriendly to virtual machines as it unnecessarily touches all
918 	 * pages.
919 	 *
920 	 * A general name is used as the code may be extended to support
921 	 * additional tests beyond the current "page present" test.
922 	 */
923 	memtest = 0;
924 	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
925 
926 	/*
927 	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
928 	 * in the system.
929 	 */
930 	if (Maxmem > atop(physmap[physmap_idx + 1]))
931 		Maxmem = atop(physmap[physmap_idx + 1]);
932 
933 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
934 	    (boothowto & RB_VERBOSE))
935 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
936 
937 	/* call pmap initialization to make new kernel address space */
938 	pmap_bootstrap(&first);
939 
940 	/*
941 	 * Size up each available chunk of physical memory.
942 	 *
943 	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
944 	 * By default, mask off the first 16 pages unless we appear to be
945 	 * running in a VM.
946 	 */
947 	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
948 	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
949 	if (physmap[0] < physmem_start) {
950 		if (physmem_start < PAGE_SIZE)
951 			physmap[0] = PAGE_SIZE;
952 		else if (physmem_start >= physmap[1])
953 			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
954 		else
955 			physmap[0] = round_page(physmem_start);
956 	}
957 	pa_indx = 0;
958 	da_indx = 1;
959 	phys_avail[pa_indx++] = physmap[0];
960 	phys_avail[pa_indx] = physmap[0];
961 	dump_avail[da_indx] = physmap[0];
962 	pte = CMAP1;
963 
964 	/*
965 	 * Get dcons buffer address
966 	 */
967 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
968 	    getenv_quad("dcons.size", &dcons_size) == 0)
969 		dcons_addr = 0;
970 
971 	/*
972 	 * physmap is in bytes, so when converting to page boundaries,
973 	 * round up the start address and round down the end address.
974 	 */
975 	page_counter = 0;
976 	if (memtest != 0)
977 		printf("Testing system memory");
978 	for (i = 0; i <= physmap_idx; i += 2) {
979 		vm_paddr_t end;
980 
981 		end = ptoa((vm_paddr_t)Maxmem);
982 		if (physmap[i + 1] < end)
983 			end = trunc_page(physmap[i + 1]);
984 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
985 			int *ptr = (int *)CADDR1;
986 			int tmp;
987 			bool full, page_bad;
988 
989 			full = false;
990 			/*
991 			 * block out kernel memory as not available.
992 			 */
993 			if (pa >= (vm_paddr_t)kernphys && pa < first)
994 				goto do_dump_avail;
995 
996 			/*
997 			 * block out dcons buffer
998 			 */
999 			if (dcons_addr > 0
1000 			    && pa >= trunc_page(dcons_addr)
1001 			    && pa < dcons_addr + dcons_size)
1002 				goto do_dump_avail;
1003 
1004 			page_bad = false;
1005 			if (memtest == 0)
1006 				goto skip_memtest;
1007 
1008 			/*
1009 			 * Print a "." every GB to show we're making
1010 			 * progress.
1011 			 */
1012 			page_counter++;
1013 			if ((page_counter % PAGES_PER_GB) == 0)
1014 				printf(".");
1015 
1016 			/*
1017 			 * map page into kernel: valid, read/write,non-cacheable
1018 			 * map page into kernel: valid, read/write, non-cacheable
1019 			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
1020 			invltlb();
1021 
1022 			tmp = *(int *)ptr;
1023 			/*
1024 			 * Test for alternating 1's and 0's
1025 			 */
1026 			*(volatile int *)ptr = 0xaaaaaaaa;
1027 			if (*(volatile int *)ptr != 0xaaaaaaaa)
1028 				page_bad = true;
1029 			/*
1030 			 * Test for alternating 0's and 1's
1031 			 */
1032 			*(volatile int *)ptr = 0x55555555;
1033 			if (*(volatile int *)ptr != 0x55555555)
1034 				page_bad = true;
1035 			/*
1036 			 * Test for all 1's
1037 			 */
1038 			*(volatile int *)ptr = 0xffffffff;
1039 			if (*(volatile int *)ptr != 0xffffffff)
1040 				page_bad = true;
1041 			/*
1042 			 * Test for all 0's
1043 			 */
1044 			*(volatile int *)ptr = 0x0;
1045 			if (*(volatile int *)ptr != 0x0)
1046 				page_bad = true;
1047 			/*
1048 			 * Restore original value.
1049 			 */
1050 			*(int *)ptr = tmp;
1051 
1052 skip_memtest:
1053 			/*
1054 			 * Adjust array of valid/good pages.
1055 			 */
1056 			if (page_bad == true)
1057 				continue;
1058 			/*
1059 			 * If this good page is a continuation of the
1060 			 * previous set of good pages, then just increase
1061 			 * the end pointer. Otherwise start a new chunk.
1062 			 * Note that "end" points one page past the last
1063 			 * valid page, making the range >= start and < end.
1064 			 * If we're also doing a speculative memory
1065 			 * test and we're at or past the end, bump up Maxmem
1066 			 * so that we keep going. The first bad page
1067 			 * will terminate the loop.
1068 			 */
1069 			if (phys_avail[pa_indx] == pa) {
1070 				phys_avail[pa_indx] += PAGE_SIZE;
1071 			} else {
1072 				pa_indx++;
1073 				if (pa_indx == PHYS_AVAIL_ENTRIES) {
1074 					printf(
1075 		"Too many holes in the physical address space, giving up\n");
1076 					pa_indx--;
1077 					full = true;
1078 					goto do_dump_avail;
1079 				}
1080 				phys_avail[pa_indx++] = pa;	/* start */
1081 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
1082 			}
1083 			physmem++;
1084 do_dump_avail:
1085 			if (dump_avail[da_indx] == pa) {
1086 				dump_avail[da_indx] += PAGE_SIZE;
1087 			} else {
1088 				da_indx++;
1089 				if (da_indx == PHYS_AVAIL_ENTRIES) {
1090 					da_indx--;
1091 					goto do_next;
1092 				}
1093 				dump_avail[da_indx++] = pa; /* start */
1094 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
1095 			}
1096 do_next:
1097 			if (full)
1098 				break;
1099 		}
1100 	}
1101 	*pte = 0;
1102 	invltlb();
1103 	if (memtest != 0)
1104 		printf("\n");
1105 
1106 	/*
1107 	 * XXX
1108 	 * The last chunk must contain at least one page plus the message
1109 	 * buffer to avoid complicating other code (message buffer address
1110 	 * calculation, etc.).
1111 	 */
1112 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
1113 	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
1114 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1115 		phys_avail[pa_indx--] = 0;
1116 		phys_avail[pa_indx--] = 0;
1117 	}
1118 
1119 	Maxmem = atop(phys_avail[pa_indx]);
1120 
1121 	/* Trim off space for the message buffer. */
1122 	phys_avail[pa_indx] -= round_page(msgbufsize);
1123 
1124 	/* Map the message buffer. */
1125 	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
1126 	TSEXIT();
1127 }
1128 
1129 static caddr_t
1130 native_parse_preload_data(u_int64_t modulep)
1131 {
1132 	caddr_t kmdp;
1133 	char *envp;
1134 #ifdef DDB
1135 	vm_offset_t ksym_start;
1136 	vm_offset_t ksym_end;
1137 #endif
1138 
1139 	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
1140 	preload_bootstrap_relocate(KERNBASE);
1141 	kmdp = preload_search_by_type("elf kernel");
1142 	if (kmdp == NULL)
1143 		kmdp = preload_search_by_type("elf64 kernel");
1144 	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
1145 	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
1146 	if (envp != NULL)
1147 		envp += KERNBASE;
1148 	init_static_kenv(envp, 0);
1149 #ifdef DDB
1150 	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
1151 	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
1152 	db_fetch_ksymtab(ksym_start, ksym_end, 0);
1153 #endif
1154 	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
1155 
1156 	return (kmdp);
1157 }
1158 
1159 static void
1160 native_clock_source_init(void)
1161 {
1162 	i8254_init();
1163 }
1164 
1165 static void
1166 amd64_kdb_init(void)
1167 {
1168 	kdb_init();
1169 #ifdef KDB
1170 	if (boothowto & RB_KDB)
1171 		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
1172 #endif
1173 }
1174 
1175 /* Set up the fast syscall stuff */
1176 void
1177 amd64_conf_fast_syscall(void)
1178 {
1179 	uint64_t msr;
1180 
1181 	msr = rdmsr(MSR_EFER) | EFER_SCE;
1182 	wrmsr(MSR_EFER, msr);
1183 	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1184 	    (u_int64_t)IDTVEC(fast_syscall));
1185 	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
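	/*
	 * MSR_STAR, for reference: bits 47:32 hold the kernel %cs loaded
	 * by SYSCALL (%ss is that selector + 8); bits 63:48 hold the base
	 * from which SYSRET builds the user segments (32-bit %cs = base,
	 * %ss = base + 8, 64-bit %cs = base + 16), which is why
	 * GUCODE32_SEL must be followed in the GDT by GUDATA_SEL and
	 * GUCODE_SEL.
	 */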
1186 	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1187 	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1188 	wrmsr(MSR_STAR, msr);
1189 	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
1190 }
1191 
1192 void
1193 amd64_bsp_pcpu_init1(struct pcpu *pc)
1194 {
1195 	struct user_segment_descriptor *gdt;
1196 
1197 	PCPU_SET(prvspace, pc);
1198 	gdt = *PCPU_PTR(gdt);
1199 	PCPU_SET(curthread, &thread0);
1200 	PCPU_SET(tssp, PCPU_PTR(common_tss));
1201 	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1202 	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
1203 	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
1204 	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
1205 	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
1206 	PCPU_SET(smp_tlb_gen, 1);
1207 }
1208 
1209 void
1210 amd64_bsp_pcpu_init2(uint64_t rsp0)
1211 {
1212 
1213 	PCPU_SET(rsp0, rsp0);
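	/* Top of the PTI trampoline stack, aligned down to 16 bytes. */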
1214 	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1215 	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1216 	PCPU_SET(curpcb, thread0.td_pcb);
1217 }
1218 
1219 void
1220 amd64_bsp_ist_init(struct pcpu *pc)
1221 {
1222 	struct nmi_pcpu *np;
1223 	struct amd64tss *tssp;
1224 
1225 	tssp = &pc->pc_common_tss;
1226 
1227 	/* doublefault stack space, runs on ist1 */
1228 	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
1229 	np->np_pcpu = (register_t)pc;
1230 	tssp->tss_ist1 = (long)np;
1231 
1232 	/*
1233 	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
1234 	 * above the start of the ist2 stack.
1235 	 */
1236 	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
1237 	np->np_pcpu = (register_t)pc;
1238 	tssp->tss_ist2 = (long)np;
1239 
1240 	/*
1241 	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
1242 	 * above the start of the ist3 stack.
1243 	 */
1244 	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
1245 	np->np_pcpu = (register_t)pc;
1246 	tssp->tss_ist3 = (long)np;
1247 
1248 	/*
1249 	 * DB# stack, runs on ist4.
1250 	 */
1251 	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
1252 	np->np_pcpu = (register_t)pc;
1253 	tssp->tss_ist4 = (long)np;
1254 }
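/*
 * NMI, MC# and DB# run on dedicated IST stacks because they can fire
 * while the kernel %gs base is not yet valid (e.g. before the swapgs on
 * the syscall entry path); their handlers recover the pcpu pointer from
 * the nmi_pcpu record stored at the top of each stack instead.
 */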
1255 
1256 /*
1257  * Calculate the kernel load address by inspecting page table created by loader.
1258  * The assumptions:
1259  * - kernel is mapped at KERNBASE, backed by contiguous phys memory
1260  *   aligned at 2M, below 4G (the latter is important for AP startup)
1261  * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M)
1262  * - kernel is mapped with 2M superpages
1263  * - all participating memory, i.e. kernel, modules, metadata,
1264  *   page table is accessible by pre-created 1:1 mapping
1265  *   (right now loader creates 1:1 mapping for lower 4G, and all
1266  *   memory is from there)
1267  * - there is a usable memory block right after the end of the
1268  *   mapped kernel and all modules/metadata, pointed to by
1269  *   physfree, for early allocations
1270  */
1271 vm_paddr_t __nosanitizeaddress __nosanitizememory
1272 amd64_loadaddr(void)
1273 {
1274 	pml4_entry_t *pml4e;
1275 	pdp_entry_t *pdpe;
1276 	pd_entry_t *pde;
1277 	uint64_t cr3;
1278 
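	/* Walk the loader's page tables: %cr3 -> PML4E -> PDPE -> 2M PDE. */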
1279 	cr3 = rcr3();
1280 	pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART);
1281 	pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART);
1282 	pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART);
1283 	return (*pde & PG_FRAME);
1284 }
1285 
1286 u_int64_t
1287 hammer_time(u_int64_t modulep, u_int64_t physfree)
1288 {
1289 	caddr_t kmdp;
1290 	int gsel_tss, x;
1291 	struct pcpu *pc;
1292 	uint64_t rsp0;
1293 	char *env;
1294 	struct user_segment_descriptor *gdt;
1295 	struct region_descriptor r_gdt;
1296 	size_t kstack0_sz;
1297 
1298 	TSRAW(&thread0, TS_ENTER, __func__, NULL);
1299 
1300 	kernphys = amd64_loadaddr();
1301 
1302 	physfree += kernphys;
1303 
1304 	kmdp = init_ops.parse_preload_data(modulep);
1305 
1306 	efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
1307 	    MODINFOMD_EFI_MAP) != NULL;
1308 
1309 	if (!efi_boot) {
1310 		/* Tell the bios to warmboot next time */
1311 		atomic_store_short((u_short *)0x472, 0x1234);
1312 	}
1313 
1314 	physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
1315 	physfree = roundup2(physfree, PAGE_SIZE);
1316 
1317 	identify_cpu1();
1318 	identify_hypervisor();
1319 	identify_hypervisor_smbios();
1320 	identify_cpu_fixup_bsp();
1321 	identify_cpu2();
1322 	initializecpucache();
1323 
1324 	/*
1325 	 * Check for pti, pcid, and invpcid before ifuncs are
1326 	 * resolved, to correctly select the implementation for
1327 	 * pmap_activate_sw_mode().
1328 	 */
1329 	pti = pti_get_default();
1330 	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1331 	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
1332 	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
1333 		invpcid_works = (cpu_stdext_feature &
1334 		    CPUID_STDEXT_INVPCID) != 0;
1335 	} else {
1336 		pmap_pcid_enabled = 0;
1337 	}
1338 
1339 	/*
1340 	 * Now we can do small core initialization, after the PCID
1341 	 * CPU features and user knobs are evaluated.
1342 	 */
1343 	TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
1344 	    &pmap_pcid_invlpg_workaround_uena);
1345 	cpu_init_small_core();
1346 
1347 	if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
1348 		use_xsave = 1;
1349 		TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave);
1350 	}
1351 
1352 	link_elf_ireloc(kmdp);
1353 
1354 	/*
1355 	 * This may be done better later if it gets more high level
1356 	 * components in it. If so just link td->td_proc here.
1357 	 */
1358 	proc_linkup0(&proc0, &thread0);
1359 
1360 	/* Init basic tunables, hz etc */
1361 	init_param1();
1362 
1363 	thread0.td_kstack = physfree - kernphys + KERNSTART;
1364 	thread0.td_kstack_pages = kstack_pages;
1365 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1366 	bzero((void *)thread0.td_kstack, kstack0_sz);
1367 	physfree += kstack0_sz;
1368 
1369 	/*
1370 	 * Initialize enough of thread0 for delayed invalidation to
1371 	 * work very early.  Rely on thread0.td_base_pri
1372 	 * zero-initialization, it is reset to PVM at proc0_init().
1373 	 */
1374 	pmap_thread_init_invl_gen(&thread0);
1375 
1376 	pc = &temp_bsp_pcpu;
1377 	pcpu_init(pc, 0, sizeof(struct pcpu));
1378 	gdt = &temp_bsp_pcpu.pc_gdt[0];
1379 
1380 	/*
1381 	 * make gdt memory segments
1382 	 */
1383 	for (x = 0; x < NGDT; x++) {
1384 		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1385 		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
1386 			ssdtosd(&gdt_segs[x], &gdt[x]);
1387 	}
1388 	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
1389 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
1390 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1391 
1392 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1393 	r_gdt.rd_base = (long)gdt;
1394 	lgdt(&r_gdt);
1395 
1396 	wrmsr(MSR_FSBASE, 0);		/* User value */
1397 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
1398 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
1399 
1400 	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
1401 	physfree += DPCPU_SIZE;
1402 	amd64_bsp_pcpu_init1(pc);
1403 	/* Non-late cninit() and printf() can be moved up to here. */
1404 
1405 	/*
1406 	 * Initialize mutexes.
1407 	 *
1408 	 * icu_lock: in order to allow an interrupt to occur in a critical
1409 	 * 	     section, to set pcpu->ipending (etc...) properly, we
1410 	 *	     must be able to get the icu lock, so it can't be
1411 	 *	     under witness.
1412 	 */
1413 	mutex_init();
1414 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1415 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1416 
1417 	/* exceptions */
1418 	for (x = 0; x < NIDT; x++)
1419 		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1420 		    SEL_KPL, 0);
1421 	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1422 	    SEL_KPL, 0);
1423 	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1424 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
1425 	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1426 	    SEL_UPL, 0);
1427 	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1428 	    SEL_UPL, 0);
1429 	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1430 	    SEL_KPL, 0);
1431 	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1432 	    SEL_KPL, 0);
1433 	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1434 	    SEL_KPL, 0);
1435 	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1436 	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1437 	    SDT_SYSIGT, SEL_KPL, 0);
1438 	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1439 	    SEL_KPL, 0);
1440 	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1441 	    SDT_SYSIGT, SEL_KPL, 0);
1442 	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1443 	    SEL_KPL, 0);
1444 	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1445 	    SEL_KPL, 0);
1446 	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1447 	    SEL_KPL, 0);
1448 	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1449 	    SEL_KPL, 0);
1450 	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1451 	    SEL_KPL, 0);
1452 	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1453 	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1454 	    SEL_KPL, 0);
1455 #ifdef KDTRACE_HOOKS
1456 	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1457 	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1458 #endif
1459 #ifdef XENHVM
1460 	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1461 	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1462 #endif
1463 	r_idt.rd_limit = sizeof(idt0) - 1;
1464 	r_idt.rd_base = (long) idt;
1465 	lidt(&r_idt);
1466 
1467 	/*
1468 	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
1469 	 * transition).
1470 	 * Once bootblocks have updated, we can test directly for
1471 	 * efi_systbl != NULL here...
1472 	 */
1473 	if (efi_boot)
1474 		vty_set_preferred(VTY_VT);
1475 
1476 	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1477 	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);
1478 
1479 	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1480 	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);
1481 
1482 	TUNABLE_INT_FETCH("machdep.syscall_ret_flush_l1d",
1483 	    &syscall_ret_l1d_flush_mode);
1484 
1485 	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1486 	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);
1487 
1488 	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
1489 
1490 	TUNABLE_INT_FETCH("machdep.mitigations.rngds.enable",
1491 	    &x86_rngds_mitg_enable);
1492 
1493 	TUNABLE_INT_FETCH("machdep.mitigations.zenbleed.enable",
1494 	    &zenbleed_enable);
1495 	zenbleed_sanitize_enable();
1496 
1497 	finishidentcpu();	/* Final stage of CPU initialization */
1498 
1499 	/*
1500 	 * Initialize the clock before the console so that console
1501 	 * initialization can use DELAY().
1502 	 */
1503 	clock_init();
1504 
1505 	initializecpu();	/* Initialize CPU registers */
1506 
1507 	amd64_bsp_ist_init(pc);
1508 
1509 	/* Set the IO permission bitmap (empty due to tss seg limit) */
1510 	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
1511 	    IOPERM_BITMAP_SIZE;
1512 
1513 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1514 	ltr(gsel_tss);
1515 
1516 	amd64_conf_fast_syscall();
1517 
1518 	/*
1519 	 * We initialize the PCB pointer early so that exception
1520 	 * handlers will work.  Also set up td_critnest to short-cut
1521 	 * the page fault handler.
1522 	 */
1523 	cpu_max_ext_state_size = sizeof(struct savefpu);
1524 	set_top_of_stack_td(&thread0);
1525 	thread0.td_pcb = get_pcb_td(&thread0);
1526 	thread0.td_critnest = 1;
1527 
1528 	/*
1529 	 * The console and kdb should be initialized even earlier than here,
1530 	 * but some console drivers don't work until after getmemsize().
1531 	 * Default to late console initialization to support these drivers.
1532 	 * This loses mainly printf()s in getmemsize() and early debugging.
1533 	 */
1534 	TUNABLE_INT_FETCH("debug.late_console", &late_console);
1535 	if (!late_console) {
1536 		cninit();
1537 		amd64_kdb_init();
1538 	}
1539 
1540 	getmemsize(kmdp, physfree);
1541 	init_param2(physmem);
1542 
1543 	/* now running on new page tables, configured, and u/iom is accessible */
1544 
1545 #ifdef DEV_PCI
1546 	/* This call might adjust phys_avail[]. */
1547 	pci_early_quirks();
1548 #endif
1549 
1550 	if (late_console)
1551 		cninit();
1552 
1553 	/*
1554 	 * Dump the boot metadata. We have to wait for cninit() since console
1555 	 * output is required. If it's grossly incorrect the kernel will never
1556 	 * make it this far.
1557 	 */
1558 	if (getenv_is_true("debug.dump_modinfo_at_boot"))
1559 		preload_dump();
1560 
1561 #ifdef DEV_ISA
1562 #ifdef DEV_ATPIC
1563 	elcr_probe();
1564 	atpic_startup();
1565 #else
1566 	/* Reset and mask the atpics and leave them shut down. */
1567 	atpic_reset();
1568 
1569 	/*
1570 	 * Point the ICU spurious interrupt vectors at the APIC spurious
1571 	 * interrupt handler.
1572 	 */
1573 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1574 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1575 #endif
1576 #else
1577 #error "have you forgotten the isa device?"
1578 #endif
1579 
1580 	if (late_console)
1581 		amd64_kdb_init();
1582 
1583 	msgbufinit(msgbufp, msgbufsize);
1584 	fpuinit();
1585 
1586 	/* make an initial tss so cpu can get interrupt stack on syscall! */
1587 	rsp0 = thread0.td_md.md_stack_base;
1588 	/* Ensure the stack is aligned to 16 bytes */
1589 	rsp0 &= ~0xFul;
1590 	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
1591 	amd64_bsp_pcpu_init2(rsp0);
1592 
1593 	/* transfer to user mode */
1594 
1595 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1596 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1597 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1598 	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1599 	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1600 
1601 	load_ds(_udatasel);
1602 	load_es(_udatasel);
1603 	load_fs(_ufssel);
1604 
1605 	/* setup proc 0's pcb */
1606 	thread0.td_pcb->pcb_flags = 0;
1607 
1608 	env = kern_getenv("kernelname");
1609 	if (env != NULL)
1610 		strlcpy(kernelname, env, sizeof(kernelname));
1611 
1612 	kcsan_cpu_init(0);
1613 
1614 #ifdef FDT
1615 	x86_init_fdt();
1616 #endif
1617 	thread0.td_critnest = 0;
1618 
1619 	kasan_init();
1620 	kmsan_init();
1621 
1622 	TSEXIT();
1623 
1624 	/* Location of kernel stack for locore */
1625 	return (thread0.td_md.md_stack_base);
1626 }
1627 
1628 void
1629 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1630 {
1631 
1632 	pcpu->pc_acpi_id = 0xffffffff;
1633 }
1634 
1635 static int
1636 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1637 {
1638 	struct bios_smap *smapbase;
1639 	struct bios_smap_xattr smap;
1640 	caddr_t kmdp;
1641 	uint32_t *smapattr;
1642 	int count, error, i;
1643 
1644 	/* Retrieve the system memory map from the loader. */
1645 	kmdp = preload_search_by_type("elf kernel");
1646 	if (kmdp == NULL)
1647 		kmdp = preload_search_by_type("elf64 kernel");
1648 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
1649 	    MODINFO_METADATA | MODINFOMD_SMAP);
1650 	if (smapbase == NULL)
1651 		return (0);
1652 	smapattr = (uint32_t *)preload_search_info(kmdp,
1653 	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1654 	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1655 	error = 0;
1656 	for (i = 0; i < count; i++) {
1657 		smap.base = smapbase[i].base;
1658 		smap.length = smapbase[i].length;
1659 		smap.type = smapbase[i].type;
1660 		if (smapattr != NULL)
1661 			smap.xattr = smapattr[i];
1662 		else
1663 			smap.xattr = 0;
1664 		error = SYSCTL_OUT(req, &smap, sizeof(smap));
1665 	}
1666 	return (error);
1667 }
1668 SYSCTL_PROC(_machdep, OID_AUTO, smap,
1669     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1670     smap_sysctl_handler, "S,bios_smap_xattr",
1671     "Raw BIOS SMAP data");
1672 
1673 static int
1674 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1675 {
1676 	struct efi_map_header *efihdr;
1677 	caddr_t kmdp;
1678 	uint32_t efisize;
1679 
1680 	kmdp = preload_search_by_type("elf kernel");
1681 	if (kmdp == NULL)
1682 		kmdp = preload_search_by_type("elf64 kernel");
1683 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1684 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1685 	if (efihdr == NULL)
1686 		return (0);
1687 	efisize = *((uint32_t *)efihdr - 1);
1688 	return (SYSCTL_OUT(req, efihdr, efisize));
1689 }
1690 SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
1691     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1692     efi_map_sysctl_handler, "S,efi_map_header",
1693     "Raw EFI Memory Map");
1694 
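/*
 * Spinlock sections nest: interrupts are disabled and the original
 * %rflags saved only by the outermost spinlock_enter(), and restored
 * only by the matching outermost spinlock_exit().
 */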
1695 void
1696 spinlock_enter(void)
1697 {
1698 	struct thread *td;
1699 	register_t flags;
1700 
1701 	td = curthread;
1702 	if (td->td_md.md_spinlock_count == 0) {
1703 		flags = intr_disable();
1704 		td->td_md.md_spinlock_count = 1;
1705 		td->td_md.md_saved_flags = flags;
1706 		critical_enter();
1707 	} else
1708 		td->td_md.md_spinlock_count++;
1709 }
1710 
1711 void
1712 spinlock_exit(void)
1713 {
1714 	struct thread *td;
1715 	register_t flags;
1716 
1717 	td = curthread;
1718 	flags = td->td_md.md_saved_flags;
1719 	td->td_md.md_spinlock_count--;
1720 	if (td->td_md.md_spinlock_count == 0) {
1721 		critical_exit();
1722 		intr_restore(flags);
1723 	}
1724 }
1725 
1726 /*
1727  * Construct a PCB from a trapframe. This is called from kdb_trap() where
1728  * we want to start a backtrace from the function that caused us to enter
1729  * the debugger. We have the context in the trapframe, but base the trace
1730  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1731  * enough for a backtrace.
1732  */
1733 void
1734 makectx(struct trapframe *tf, struct pcb *pcb)
1735 {
1736 
1737 	pcb->pcb_r12 = tf->tf_r12;
1738 	pcb->pcb_r13 = tf->tf_r13;
1739 	pcb->pcb_r14 = tf->tf_r14;
1740 	pcb->pcb_r15 = tf->tf_r15;
1741 	pcb->pcb_rbp = tf->tf_rbp;
1742 	pcb->pcb_rbx = tf->tf_rbx;
1743 	pcb->pcb_rip = tf->tf_rip;
1744 	pcb->pcb_rsp = tf->tf_rsp;
1745 }
1746 
1747 /*
1748  * pcb_flags is only modified by the current thread, or by other threads
1749  * when the current thread is stopped.  However, the current thread may
1750  * change it from interrupt context in cpu_switch(), or in the trap handler.
1751  * When we read-modify-write pcb_flags from C sources, the compiler may
1752  * generate code that is not atomic with respect to the interrupt handler.
1753  * If a trap or interrupt happens and any flag is modified from the handler,
1754  * it can be clobbered with the cached value later.  Therefore, we implement
1755  * setting and clearing flags with single-instruction functions, which do not
1756  * race with modification of the flags from the trap or interrupt context,
1757  * because traps and interrupts are delivered only on instruction boundaries.
1758  */
1759 void
1760 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
1761 {
1762 
1763 	__asm __volatile("orl %1,%0"
1764 	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
1765 	    : "cc", "memory");
1766 
1767 }
1768 
1769 /*
1770  * Support for RDFSBASE, WRFSBASE and similar instructions for the %fs and
1771  * %gs bases requires that the kernel save MSR_FSBASE and MSR_{K,}GSBASE
1772  * into the pcb if user space modified the bases.  We must save on the
1773  * context switch or if the return to usermode happens through doreti.
1774  *
1775  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
1776  * which has the consequence that the base MSRs must be saved each time
1777  * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
1778  * context switches.
1779  */
1780 static void
1781 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
1782 {
1783 	register_t r;
1784 
1785 	if (curpcb == pcb &&
1786 	    (flags & PCB_FULL_IRET) != 0 &&
1787 	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1788 		r = intr_disable();
1789 		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1790 			if (rfs() == _ufssel)
1791 				pcb->pcb_fsbase = rdfsbase();
1792 			if (rgs() == _ugssel)
1793 				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
1794 		}
1795 		set_pcb_flags_raw(pcb, flags);
1796 		intr_restore(r);
1797 	} else {
1798 		set_pcb_flags_raw(pcb, flags);
1799 	}
1800 }
1801 
1802 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
1803 {
1804 
1805 	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
1806 	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
1807 }
1808 
1809 void
1810 clear_pcb_flags(struct pcb *pcb, const u_int flags)
1811 {
1812 
1813 	__asm __volatile("andl %1,%0"
1814 	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
1815 	    : "cc", "memory");
1816 }
1817 
1818 #ifdef KDB
1819 
1820 /*
1821  * Provide inb() and outb() as functions.  They are normally only available as
1822  * inline functions, thus cannot be called from the debugger.
1823  */
1824 
1825 /* silence compiler warnings */
1826 u_char inb_(u_short);
1827 void outb_(u_short, u_char);
1828 
1829 u_char
1830 inb_(u_short port)
1831 {
1832 	return inb(port);
1833 }
1834 
1835 void
1836 outb_(u_short port, u_char data)
1837 {
1838 	outb(port, data);
1839 }
1840 
1841 #endif /* KDB */
1842 
1843 #undef memset
1844 #undef memmove
1845 #undef memcpy
1846 
1847 void	*memset_std(void *buf, int c, size_t len);
1848 void	*memset_erms(void *buf, int c, size_t len);
1849 void    *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
1850 	    size_t len);
1851 void    *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
1852 	    size_t len);
1853 void    *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
1854 	    size_t len);
1855 void    *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
1856 	    size_t len);
1857 
1858 #ifdef KCSAN
1859 /*
1860  * These fail to build as ifuncs when used with KCSAN.
1861  */
1862 void *
1863 memset(void *buf, int c, size_t len)
1864 {
1865 
1866 	return (memset_std(buf, c, len));
1867 }
1868 
1869 void *
1870 memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1871 {
1872 
1873 	return (memmove_std(dst, src, len));
1874 }
1875 
1876 void *
1877 memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1878 {
1879 
1880 	return (memcpy_std(dst, src, len));
1881 }
1882 #else
1883 DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
1884 {
1885 
1886 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1887 	    memset_erms : memset_std);
1888 }
1889 
1890 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
1891     size_t))
1892 {
1893 
1894 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1895 	    memmove_erms : memmove_std);
1896 }
1897 
1898 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
1899 {
1900 
1901 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1902 	    memcpy_erms : memcpy_std);
1903 }
1904 #endif
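/*
 * CPUID_STDEXT_ERMS advertises Enhanced REP MOVSB/STOSB, on which a
 * plain "rep movsb"/"rep stosb" is expected to be at least as fast as
 * an unrolled copy loop, so the _erms variants are selected at boot.
 */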
1905 
1906 void	pagezero_std(void *addr);
1907 void	pagezero_erms(void *addr);
1908 DEFINE_IFUNC(, void , pagezero, (void *))
1909 {
1910 
1911 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1912 	    pagezero_erms : pagezero_std);
1913 }
1914