/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_atpic.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_pci.h"
#include "opt_platform.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/asan.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/csan.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msan.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/reg.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_dumpset.h>

#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif

#include <net/netisr.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/frame.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#include <machine/trap.h>
#include <machine/tss.h>
#include <x86/ucode.h>
#include <x86/ifunc.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef FDT
#include <x86/fdt.h>
#endif

#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <x86/apicvar.h>
#endif

#include <isa/isareg.h>
#include <isa/rtc.h>
#include <x86/init.h>

/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);

/*
 * The PTI trampoline stack needs enough space for a hardware trapframe and a
 * couple of scratch registers, as well as the trapframe left behind after an
 * iret fault.
 */
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
    offsetof(struct pti_frame, pti_rip));

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

static void cpu_startup(void *);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Probe 8254 PIT and TSC. */
static void native_clock_source_init(void);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/* Default init_ops implementation. */
struct init_ops init_ops = {
	.parse_preload_data =		native_parse_preload_data,
	.early_clock_source_init =	native_clock_source_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
};

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;

/* Intel ICH registers */
#define ICH_PMBASE	0x400
#define ICH_SMI_EN	(ICH_PMBASE + 0x30)

int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

int cold = 1;

long Maxmem = 0;
long realmem = 0;
int late_console = 1;

struct kva_md_info kmi;

struct region_descriptor r_idt;

struct pcpu *__pcpu;
struct pcpu temp_bsp_pcpu;

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

void (*vmm_resume_p)(void);

bool efi_boot;

static void
cpu_startup(void *dummy)
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to disallow the legacy USB circuit to
	 * generate an SMI# because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();

	/*
	 * Display physical memory if SMBIOS reports reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	if (memsize < ptoa((uintmax_t)vm_free_count()))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_free_count()),
	    ptoa((uintmax_t)vm_free_count()) / 1048576);
#ifdef DEV_PCI
	if (bootverbose && intel_graphics_stolen_base != 0)
		printf("intel stolen mem: base %#jx size %ju MB\n",
		    (uintmax_t)intel_graphics_stolen_base,
		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
#endif

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

static void
late_ifunc_resolve(void *dummy __unused)
{
	link_elf_late_ireloc();
}
SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP.  See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Actually, the TSS is a system descriptor which is double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");

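/*
 * Install a gate descriptor for vector 'idx' in the IDT: 'func' is the
 * handler entry point, 'typ' and 'dpl' set the gate type and privilege
 * level, and a non-zero 'ist' selects an Interrupt Stack Table stack.
 */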
void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func) >> 16;
}

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif

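/* Unpack a hardware segment descriptor into its software form. */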
void
sdtossd(struct user_segment_descriptor *sd,
    struct soft_segment_descriptor *ssd)
{

	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_long  = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

void
ssdtosd(struct soft_segment_descriptor *ssd,
    struct user_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;
}

void
ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
}

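/* Amount of "base" (conventional) memory below 640K, in kilobytes. */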
u_int basemem;

static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
    vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE,
		    physmap, physmap_idx))
			break;
	}
}

static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it\n");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,
			 * non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one page beyond the last
			 * valid page, making the range >= start and < end.
			 * If we're also doing a speculative memory test and
			 * we're at or past the end, bump up Maxmem so that
			 * we keep going.  The first bad page will terminate
			 * the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ENTRIES) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == PHYS_AVAIL_ENTRIES) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}

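/*
 * Locate the metadata that the loader preloaded with the kernel, wire up
 * the static kernel environment and (with DDB) the symbol table, and
 * return a pointer to the kernel's module metadata.
 */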
static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end, 0);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

static void
native_clock_source_init(void)
{
	i8254_init();
}

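/* Initialize the kernel debugger, entering it at boot if RB_KDB is set. */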
static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/* Set up the fast syscall stuff */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
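	/*
	 * MSR_STAR bits 47:32 hold the kernel %cs selector loaded on
	 * SYSCALL; bits 63:48 hold the selector base used by SYSRET.
	 */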
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}

void
amd64_bsp_pcpu_init1(struct pcpu *pc)
{
	struct user_segment_descriptor *gdt;

	PCPU_SET(prvspace, pc);
	gdt = *PCPU_PTR(gdt);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(tssp, PCPU_PTR(common_tss));
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
	PCPU_SET(smp_tlb_gen, 1);
}

void
amd64_bsp_pcpu_init2(uint64_t rsp0)
{

	PCPU_SET(rsp0, rsp0);
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);
}

void
amd64_bsp_ist_init(struct pcpu *pc)
{
	struct nmi_pcpu *np;
	struct amd64tss *tssp;

	tssp = &pc->pc_common_tss;

	/* doublefault stack space, runs on ist1 */
	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist1 = (long)np;

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist2 = (long)np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist3 = (long)np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist4 = (long)np;
}

/*
 * Calculate the kernel load address by inspecting page table created by loader.
 * The assumptions:
 * - kernel is mapped at KERNBASE, backed by contiguous phys memory
 *   aligned at 2M, below 4G (the latter is important for AP startup)
 * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M)
 * - kernel is mapped with 2M superpages
 * - all participating memory, i.e. kernel, modules, metadata,
 *   page table is accessible by pre-created 1:1 mapping
 *   (right now loader creates 1:1 mapping for lower 4G, and all
 *   memory is from there)
 * - there is a usable memory block right after the end of the
 *   mapped kernel and all modules/metadata, pointed to by
 *   physfree, for early allocations
 */
vm_paddr_t __nosanitizeaddress __nosanitizememory
amd64_loadaddr(void)
{
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	uint64_t cr3;

	cr3 = rcr3();
	pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART);
	pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART);
	pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART);
	return (*pde & PG_FRAME);
}

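/*
 * hammer_time() is the machine-dependent boot entry point, called from
 * the assembly startup code with the module pointer and the first free
 * physical address.  It bootstraps the CPU, descriptor tables, per-CPU
 * data and the VM system, and returns the address of thread0's kernel
 * stack for the caller to switch onto.
 */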
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	uint64_t rsp0;
	char *env;
	struct user_segment_descriptor *gdt;
	struct region_descriptor r_gdt;
	size_t kstack0_sz;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	kernphys = amd64_loadaddr();

	physfree += kernphys;

	kmdp = init_ops.parse_preload_data(modulep);

	efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
	    MODINFOMD_EFI_MAP) != NULL;

	if (!efi_boot) {
		/* Tell the bios to warmboot next time */
		atomic_store_short((u_short *)0x472, 0x1234);
	}

	physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	identify_cpu_fixup_bsp();
	identify_cpu2();
	initializecpucache();

	/*
	 * Check for pti, pcid, and invpcid before ifuncs are
	 * resolved, to correctly select the implementation for
	 * pmap_activate_sw_mode().
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		invpcid_works = (cpu_stdext_feature &
		    CPUID_STDEXT_INVPCID) != 0;
	} else {
		pmap_pcid_enabled = 0;
	}

	/*
	 * Now we can do small core initialization, after the PCID
	 * CPU features and user knobs are evaluated.
	 */
	TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
	    &pmap_pcid_invlpg_workaround_uena);
	cpu_init_small_core();

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	thread0.td_kstack = physfree - kernphys + KERNSTART;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * Initialize enough of thread0 for delayed invalidation to
	 * work very early.  Rely on thread0.td_base_pri
	 * zero-initialization, it is reset to PVM at proc0_init().
	 */
	pmap_thread_init_invl_gen(&thread0);

	pc = &temp_bsp_pcpu;
	pcpu_init(pc, 0, sizeof(struct pcpu));
	gdt = &temp_bsp_pcpu.pc_gdt[0];

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long)gdt;
	lgdt(&r_gdt);

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
	physfree += DPCPU_SIZE;
	amd64_bsp_pcpu_init1(pc);
	/* Non-late cninit() and printf() can be moved up to here. */

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (efi_boot)
		vty_set_preferred(VTY_VT);

	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);

	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);

	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
	    &syscall_ret_l1d_flush_mode);

	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);

	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);

	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
	    &x86_rngds_mitg_enable);

	finishidentcpu();	/* Final stage of CPU initialization */

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	initializecpu();	/* Initialize CPU registers */

	amd64_bsp_ist_init(pc);

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
	    IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * We initialize the PCB pointer early so that exception
	 * handlers will work.  Also set up td_critnest to short-cut
	 * the page fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	set_top_of_stack_td(&thread0);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

#ifdef DEV_PCI
	/* This call might adjust phys_avail[]. */
	pci_early_quirks();
#endif

	if (late_console)
		cninit();

	/*
	 * Dump the boot metadata. We have to wait for cninit() since console
	 * output is required. If it's grossly incorrect the kernel will never
	 * make it this far.
	 */
	if (getenv_is_true("debug.dump_modinfo_at_boot"))
		preload_dump();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?"
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = thread0.td_md.md_stack_base;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
	amd64_bsp_pcpu_init2(rsp0);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	kcsan_cpu_init(0);

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	kasan_init();
	kmsan_init();

	TSEXIT();

	/* Location of kernel stack for locore */
	return (thread0.td_md.md_stack_base);
}

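/*
 * MD bits of per-CPU structure initialization.  The ACPI id starts out
 * as an invalid sentinel; it is expected to be filled in later, once the
 * platform enumeration code discovers the CPU's actual id.
 */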
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

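/*
 * Export the raw BIOS SMAP (INT 15:E820) entries via the machdep.smap
 * sysctl, merging in extended attributes when the loader provided them.
 */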
static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader. */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr",
    "Raw BIOS SMAP data");

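/*
 * Export the raw EFI memory map via machdep.efi_map.  As with the SMAP,
 * the loader stores the size of the blob immediately before the data.
 */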
static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header",
    "Raw EFI Memory Map");

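/*
 * Spinlocks disable interrupts on first entry, saving the flags so that
 * they are restored only when the outermost spinlock is released; nested
 * acquisitions just bump the per-thread count.
 */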
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
		critical_enter();
	} else
		td->td_md.md_spinlock_count++;
}

void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0) {
		critical_exit();
		intr_restore(flags);
	}
}

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
}

/*
 * pcb_flags is only modified by the current thread, or by other threads
 * when the current thread is stopped.  However, the current thread may
 * change it from the interrupt context in cpu_switch(), or in the trap
 * handler.  When we read-modify-write pcb_flags from C sources, the
 * compiler may generate code that is not atomic with respect to the
 * interrupt handler.  If a trap or interrupt happens and any flag is
 * modified from the handler, it can be clobbered with the cached value
 * later.  Therefore, we implement setting and clearing flags with
 * single-instruction functions, which do not race with possible
 * modification of the flags from the trap or interrupt context, because
 * traps and interrupts are executed only on instruction boundaries.
 */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("orl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

/*
 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
 * base requires that the kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
 * the pcb if user space modified the bases.  We must save on the context
 * switch or if the return to usermode happens through the doreti.
 *
 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 * which has the consequence that the base MSRs must be saved each time
 * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
 * context switches.
 */
static void
set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
{
	register_t r;

	if (curpcb == pcb &&
	    (flags & PCB_FULL_IRET) != 0 &&
	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
		r = intr_disable();
		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
			if (rfs() == _ufssel)
				pcb->pcb_fsbase = rdfsbase();
			if (rgs() == _ugssel)
				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
		}
		set_pcb_flags_raw(pcb, flags);
		intr_restore(r);
	} else {
		set_pcb_flags_raw(pcb, flags);
	}
}

DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
}

void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

#ifdef KDB

/*
 * Provide inb() and outb() as functions.  They are normally only available as
 * inline functions, thus cannot be called from the debugger.
 */

/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);

u_char
inb_(u_short port)
{
	return inb(port);
}

void
outb_(u_short port, u_char data)
{
	outb(port, data);
}

#endif /* KDB */

#undef memset
#undef memmove
#undef memcpy

void	*memset_std(void *buf, int c, size_t len);
void	*memset_erms(void *buf, int c, size_t len);
void	*memmove_std(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);

#ifdef KCSAN
/*
 * These fail to build as ifuncs when used with KCSAN.
 */
void *
memset(void *buf, int c, size_t len)
{

	return (memset_std(buf, c, len));
}

void *
memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memmove_std(dst, src, len));
}

void *
memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memcpy_std(dst, src, len));
}
#else
DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memset_erms : memset_std);
}

DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
    size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memmove_erms : memmove_std);
}

DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,
    size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memcpy_erms : memcpy_std);
}
#endif

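/*
 * pagezero() is likewise resolved at boot: the ERMS (rep stosb) variant
 * is used when CPUID advertises Enhanced REP MOVSB/STOSB, and the
 * standard implementation otherwise.
 */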
void	pagezero_std(void *addr);
void	pagezero_erms(void *addr);
DEFINE_IFUNC(, void, pagezero, (void *))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    pagezero_erms : pagezero_std);
}