xref: /freebsd/sys/amd64/amd64/machdep.c (revision 2a58b312)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 2003 Peter Wemm.
5  * Copyright (c) 1992 Terrence R. Lambert.
6  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
7  * All rights reserved.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * William Jolitz.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. All advertising materials mentioning features or use of this software
21  *    must display the following acknowledgement:
22  *	This product includes software developed by the University of
23  *	California, Berkeley and its contributors.
24  * 4. Neither the name of the University nor the names of its contributors
25  *    may be used to endorse or promote products derived from this software
26  *    without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  *
40  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
41  */
42 
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD$");
45 
46 #include "opt_atpic.h"
47 #include "opt_cpu.h"
48 #include "opt_ddb.h"
49 #include "opt_inet.h"
50 #include "opt_isa.h"
51 #include "opt_kstack_pages.h"
52 #include "opt_maxmem.h"
53 #include "opt_pci.h"
54 #include "opt_platform.h"
55 #include "opt_sched.h"
56 
57 #include <sys/param.h>
58 #include <sys/proc.h>
59 #include <sys/systm.h>
60 #include <sys/asan.h>
61 #include <sys/bio.h>
62 #include <sys/buf.h>
63 #include <sys/bus.h>
64 #include <sys/callout.h>
65 #include <sys/cons.h>
66 #include <sys/cpu.h>
67 #include <sys/csan.h>
68 #include <sys/efi.h>
69 #include <sys/eventhandler.h>
70 #include <sys/exec.h>
71 #include <sys/imgact.h>
72 #include <sys/kdb.h>
73 #include <sys/kernel.h>
74 #include <sys/ktr.h>
75 #include <sys/linker.h>
76 #include <sys/lock.h>
77 #include <sys/malloc.h>
78 #include <sys/memrange.h>
79 #include <sys/msan.h>
80 #include <sys/msgbuf.h>
81 #include <sys/mutex.h>
82 #include <sys/pcpu.h>
83 #include <sys/ptrace.h>
84 #include <sys/reboot.h>
85 #include <sys/reg.h>
86 #include <sys/rwlock.h>
87 #include <sys/sched.h>
88 #include <sys/signalvar.h>
89 #ifdef SMP
90 #include <sys/smp.h>
91 #endif
92 #include <sys/syscallsubr.h>
93 #include <sys/sysctl.h>
94 #include <sys/sysent.h>
95 #include <sys/sysproto.h>
96 #include <sys/ucontext.h>
97 #include <sys/vmmeter.h>
98 
99 #include <vm/vm.h>
100 #include <vm/vm_param.h>
101 #include <vm/vm_extern.h>
102 #include <vm/vm_kern.h>
103 #include <vm/vm_page.h>
104 #include <vm/vm_map.h>
105 #include <vm/vm_object.h>
106 #include <vm/vm_pager.h>
107 #include <vm/vm_phys.h>
108 #include <vm/vm_dumpset.h>
109 
110 #ifdef DDB
111 #ifndef KDB
112 #error KDB must be enabled in order for DDB to work!
113 #endif
114 #include <ddb/ddb.h>
115 #include <ddb/db_sym.h>
116 #endif
117 
118 #include <net/netisr.h>
119 
120 #include <dev/smbios/smbios.h>
121 
122 #include <machine/clock.h>
123 #include <machine/cpu.h>
124 #include <machine/cputypes.h>
125 #include <machine/frame.h>
126 #include <machine/intr_machdep.h>
127 #include <x86/mca.h>
128 #include <machine/md_var.h>
129 #include <machine/metadata.h>
130 #include <machine/pc/bios.h>
131 #include <machine/pcb.h>
132 #include <machine/proc.h>
133 #include <machine/sigframe.h>
134 #include <machine/specialreg.h>
135 #include <machine/trap.h>
136 #include <machine/tss.h>
137 #include <x86/ucode.h>
138 #include <x86/ifunc.h>
139 #ifdef SMP
140 #include <machine/smp.h>
141 #endif
142 #ifdef FDT
143 #include <x86/fdt.h>
144 #endif
145 
146 #ifdef DEV_ATPIC
147 #include <x86/isa/icu.h>
148 #else
149 #include <x86/apicvar.h>
150 #endif
151 
152 #include <isa/isareg.h>
153 #include <isa/rtc.h>
154 #include <x86/init.h>
155 
156 /* Sanity check for __curthread() */
157 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
158 
159 /*
160  * The PTI trampoline stack needs enough space for a hardware trapframe and a
161  * couple of scratch registers, as well as the trapframe left behind after an
162  * iret fault.
163  */
164 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
165     offsetof(struct pti_frame, pti_rip));
166 
167 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
168 
169 static void cpu_startup(void *);
170 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
171 
172 /* Probe 8254 PIT and TSC. */
173 static void native_clock_source_init(void);
174 
175 /* Preload data parse function */
176 static caddr_t native_parse_preload_data(u_int64_t);
177 
178 /* Native function to fetch and parse the e820 map */
179 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
180 
181 /* Default init_ops implementation. */
182 struct init_ops init_ops = {
183 	.parse_preload_data =		native_parse_preload_data,
184 	.early_clock_source_init =	native_clock_source_init,
185 	.early_delay =			i8254_delay,
186 	.parse_memmap =			native_parse_memmap,
187 };
188 
189 /*
190  * Physical address of the EFI System Table. Stashed from the metadata hints
191  * passed into the kernel and used by the EFI code to call runtime services.
192  */
193 vm_paddr_t efi_systbl_phys;
194 
195 /* Intel ICH registers */
196 #define ICH_PMBASE	0x400
197 #define ICH_SMI_EN	(ICH_PMBASE + 0x30)
198 
199 int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
200 
201 int cold = 1;
202 
203 long Maxmem = 0;
204 long realmem = 0;
205 int late_console = 1;
206 
207 struct kva_md_info kmi;
208 
209 struct region_descriptor r_idt;
210 
211 struct pcpu *__pcpu;
212 struct pcpu temp_bsp_pcpu;
213 
214 struct mtx icu_lock;
215 
216 struct mem_range_softc mem_range_softc;
217 
218 struct mtx dt_lock;	/* lock for GDT and LDT */
219 
220 void (*vmm_resume_p)(void);
221 
222 bool efi_boot;
223 
224 static void
225 cpu_startup(void *dummy)
226 {
227 	uintmax_t memsize;
228 	char *sysenv;
229 
230 	/*
231 	 * On MacBooks, we need to prevent the legacy USB circuit from
232 	 * generating an SMI# because this can cause several problems,
233 	 * namely: incorrect CPU frequency detection and failure to
234 	 * start the APs.
235 	 * We do this by disabling a bit in the SMI_EN (SMI Control and
236 	 * Enable register) of the Intel ICH LPC Interface Bridge.
237 	 */
238 	sysenv = kern_getenv("smbios.system.product");
239 	if (sysenv != NULL) {
240 		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
241 		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
242 		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
243 		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
244 		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
245 		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
246 		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
247 		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
248 			if (bootverbose)
249 				printf("Disabling LEGACY_USB_EN bit on "
250 				    "Intel ICH.\n");
251 			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
252 		}
253 		freeenv(sysenv);
254 	}
255 
256 	/*
257 	 * Good {morning,afternoon,evening,night}.
258 	 */
259 	startrtclock();
260 	printcpuinfo();
261 
262 	/*
263 	 * Display physical memory if SMBIOS reports reasonable amount.
264 	 */
265 	memsize = 0;
266 	sysenv = kern_getenv("smbios.memory.enabled");
267 	if (sysenv != NULL) {
268 		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
269 		freeenv(sysenv);
270 	}
271 	if (memsize < ptoa((uintmax_t)vm_free_count()))
272 		memsize = ptoa((uintmax_t)Maxmem);
273 	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
274 	realmem = atop(memsize);
275 
276 	/*
277 	 * Display any holes after the first chunk of extended memory.
278 	 */
279 	if (bootverbose) {
280 		int indx;
281 
282 		printf("Physical memory chunk(s):\n");
283 		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
284 			vm_paddr_t size;
285 
286 			size = phys_avail[indx + 1] - phys_avail[indx];
287 			printf(
288 			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
289 			    (uintmax_t)phys_avail[indx],
290 			    (uintmax_t)phys_avail[indx + 1] - 1,
291 			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
292 		}
293 	}
294 
295 	vm_ksubmap_init(&kmi);
296 
297 	printf("avail memory = %ju (%ju MB)\n",
298 	    ptoa((uintmax_t)vm_free_count()),
299 	    ptoa((uintmax_t)vm_free_count()) / 1048576);
300 #ifdef DEV_PCI
301 	if (bootverbose && intel_graphics_stolen_base != 0)
302 		printf("intel stolen mem: base %#jx size %ju MB\n",
303 		    (uintmax_t)intel_graphics_stolen_base,
304 		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
305 #endif
306 
307 	/*
308 	 * Set up buffers, so they can be used to read disk labels.
309 	 */
310 	bufinit();
311 	vm_pager_bufferinit();
312 
313 	cpu_setregs();
314 }
315 
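/*
 * Late pass over ifunc relocations (e.g. in preloaded modules), run once
 * CPU feature identification is complete.
 */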
316 static void
317 late_ifunc_resolve(void *dummy __unused)
318 {
319 	link_elf_late_ireloc();
320 }
321 SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);
322 
323 
324 void
325 cpu_setregs(void)
326 {
327 	register_t cr0;
328 
329 	cr0 = rcr0();
330 	/*
331 	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
332 	 * BSP.  See the comments there about why we set them.
333 	 */
334 	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
335 	load_cr0(cr0);
336 }
337 
338 /*
339  * Initialize amd64 and configure to run kernel
340  */
341 
342 /*
343  * Initialize segments & interrupt table
344  */
345 static struct gate_descriptor idt0[NIDT];
346 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
347 
348 static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
349 static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
350 static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
351 static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
352 CTASSERT(sizeof(struct nmi_pcpu) == 16);
353 
354 /*
355  * Software prototypes -- in more palatable form.
356  *
357  * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
358  * slots as corresponding segments for i386 kernel.
359  */
360 struct soft_segment_descriptor gdt_segs[] = {
361 /* GNULL_SEL	0 Null Descriptor */
362 {	.ssd_base = 0x0,
363 	.ssd_limit = 0x0,
364 	.ssd_type = 0,
365 	.ssd_dpl = 0,
366 	.ssd_p = 0,
367 	.ssd_long = 0,
368 	.ssd_def32 = 0,
369 	.ssd_gran = 0		},
370 /* GNULL2_SEL	1 Null Descriptor */
371 {	.ssd_base = 0x0,
372 	.ssd_limit = 0x0,
373 	.ssd_type = 0,
374 	.ssd_dpl = 0,
375 	.ssd_p = 0,
376 	.ssd_long = 0,
377 	.ssd_def32 = 0,
378 	.ssd_gran = 0		},
379 /* GUFS32_SEL	2 32 bit %fs Descriptor for user */
380 {	.ssd_base = 0x0,
381 	.ssd_limit = 0xfffff,
382 	.ssd_type = SDT_MEMRWA,
383 	.ssd_dpl = SEL_UPL,
384 	.ssd_p = 1,
385 	.ssd_long = 0,
386 	.ssd_def32 = 1,
387 	.ssd_gran = 1		},
388 /* GUGS32_SEL	3 32 bit %gs Descriptor for user */
389 {	.ssd_base = 0x0,
390 	.ssd_limit = 0xfffff,
391 	.ssd_type = SDT_MEMRWA,
392 	.ssd_dpl = SEL_UPL,
393 	.ssd_p = 1,
394 	.ssd_long = 0,
395 	.ssd_def32 = 1,
396 	.ssd_gran = 1		},
397 /* GCODE_SEL	4 Code Descriptor for kernel */
398 {	.ssd_base = 0x0,
399 	.ssd_limit = 0xfffff,
400 	.ssd_type = SDT_MEMERA,
401 	.ssd_dpl = SEL_KPL,
402 	.ssd_p = 1,
403 	.ssd_long = 1,
404 	.ssd_def32 = 0,
405 	.ssd_gran = 1		},
406 /* GDATA_SEL	5 Data Descriptor for kernel */
407 {	.ssd_base = 0x0,
408 	.ssd_limit = 0xfffff,
409 	.ssd_type = SDT_MEMRWA,
410 	.ssd_dpl = SEL_KPL,
411 	.ssd_p = 1,
412 	.ssd_long = 1,
413 	.ssd_def32 = 0,
414 	.ssd_gran = 1		},
415 /* GUCODE32_SEL	6 32 bit Code Descriptor for user */
416 {	.ssd_base = 0x0,
417 	.ssd_limit = 0xfffff,
418 	.ssd_type = SDT_MEMERA,
419 	.ssd_dpl = SEL_UPL,
420 	.ssd_p = 1,
421 	.ssd_long = 0,
422 	.ssd_def32 = 1,
423 	.ssd_gran = 1		},
424 /* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
425 {	.ssd_base = 0x0,
426 	.ssd_limit = 0xfffff,
427 	.ssd_type = SDT_MEMRWA,
428 	.ssd_dpl = SEL_UPL,
429 	.ssd_p = 1,
430 	.ssd_long = 0,
431 	.ssd_def32 = 1,
432 	.ssd_gran = 1		},
433 /* GUCODE_SEL	8 64 bit Code Descriptor for user */
434 {	.ssd_base = 0x0,
435 	.ssd_limit = 0xfffff,
436 	.ssd_type = SDT_MEMERA,
437 	.ssd_dpl = SEL_UPL,
438 	.ssd_p = 1,
439 	.ssd_long = 1,
440 	.ssd_def32 = 0,
441 	.ssd_gran = 1		},
442 /* GPROC0_SEL	9 Proc 0 Tss Descriptor */
443 {	.ssd_base = 0x0,
444 	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
445 	.ssd_type = SDT_SYSTSS,
446 	.ssd_dpl = SEL_KPL,
447 	.ssd_p = 1,
448 	.ssd_long = 0,
449 	.ssd_def32 = 0,
450 	.ssd_gran = 0		},
451 /* GPROC0_SEL+1	10 TSS continuation: system descriptors are double size */
452 {	.ssd_base = 0x0,
453 	.ssd_limit = 0x0,
454 	.ssd_type = 0,
455 	.ssd_dpl = 0,
456 	.ssd_p = 0,
457 	.ssd_long = 0,
458 	.ssd_def32 = 0,
459 	.ssd_gran = 0		},
460 /* GUSERLDT_SEL	11 LDT Descriptor */
461 {	.ssd_base = 0x0,
462 	.ssd_limit = 0x0,
463 	.ssd_type = 0,
464 	.ssd_dpl = 0,
465 	.ssd_p = 0,
466 	.ssd_long = 0,
467 	.ssd_def32 = 0,
468 	.ssd_gran = 0		},
469 /* GUSERLDT_SEL	12 LDT Descriptor, double size */
470 {	.ssd_base = 0x0,
471 	.ssd_limit = 0x0,
472 	.ssd_type = 0,
473 	.ssd_dpl = 0,
474 	.ssd_p = 0,
475 	.ssd_long = 0,
476 	.ssd_def32 = 0,
477 	.ssd_gran = 0		},
478 };
479 _Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");
480 
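/*
 * Install a handler into IDT slot idx: func is the entry point, typ the
 * gate type (e.g. SDT_SYSIGT), dpl the privilege level required to raise
 * the vector from software, and ist the interrupt stack table index
 * (0 selects the legacy stack switching mechanism).
 */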
481 void
482 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
483 {
484 	struct gate_descriptor *ip;
485 
486 	ip = idt + idx;
487 	ip->gd_looffset = (uintptr_t)func;
488 	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
489 	ip->gd_ist = ist;
490 	ip->gd_xx = 0;
491 	ip->gd_type = typ;
492 	ip->gd_dpl = dpl;
493 	ip->gd_p = 1;
494 	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
495 }
496 
497 extern inthand_t
498 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
499 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
500 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
501 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
502 	IDTVEC(xmm), IDTVEC(dblfault),
503 	IDTVEC(div_pti), IDTVEC(bpt_pti),
504 	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
505 	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
506 	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
507 	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
508 	IDTVEC(xmm_pti),
509 #ifdef KDTRACE_HOOKS
510 	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
511 #endif
512 #ifdef XENHVM
513 	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
514 #endif
515 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
516 	IDTVEC(fast_syscall_pti);
517 
518 #ifdef DDB
519 /*
520  * Display the index and function name of any IDT entries that don't use
521  * the default 'rsvd' entry point.
522  */
523 DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE)
524 {
525 	struct gate_descriptor *ip;
526 	int idx;
527 	uintptr_t func;
528 
529 	ip = idt;
530 	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
531 		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
532 		if (func != (uintptr_t)&IDTVEC(rsvd)) {
533 			db_printf("%3d\t", idx);
534 			db_printsym(func, DB_STGY_PROC);
535 			db_printf("\n");
536 		}
537 		ip++;
538 	}
539 }
540 
541 /* Show privileged registers. */
542 DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE)
543 {
544 	struct {
545 		uint16_t limit;
546 		uint64_t base;
547 	} __packed idtr, gdtr;
548 	uint16_t ldt, tr;
549 
550 	__asm __volatile("sidt %0" : "=m" (idtr));
551 	db_printf("idtr\t0x%016lx/%04x\n",
552 	    (u_long)idtr.base, (u_int)idtr.limit);
553 	__asm __volatile("sgdt %0" : "=m" (gdtr));
554 	db_printf("gdtr\t0x%016lx/%04x\n",
555 	    (u_long)gdtr.base, (u_int)gdtr.limit);
556 	__asm __volatile("sldt %0" : "=r" (ldt));
557 	db_printf("ldtr\t0x%04x\n", ldt);
558 	__asm __volatile("str %0" : "=r" (tr));
559 	db_printf("tr\t0x%04x\n", tr);
560 	db_printf("cr0\t0x%016lx\n", rcr0());
561 	db_printf("cr2\t0x%016lx\n", rcr2());
562 	db_printf("cr3\t0x%016lx\n", rcr3());
563 	db_printf("cr4\t0x%016lx\n", rcr4());
564 	if (rcr4() & CR4_XSAVE)
565 		db_printf("xcr0\t0x%016lx\n", rxcr(0));
566 	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
567 	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
568 		db_printf("FEATURES_CTL\t%016lx\n",
569 		    rdmsr(MSR_IA32_FEATURE_CONTROL));
570 	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
571 	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
572 	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
573 }
574 
575 DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE)
576 {
577 
578 	db_printf("dr0\t0x%016lx\n", rdr0());
579 	db_printf("dr1\t0x%016lx\n", rdr1());
580 	db_printf("dr2\t0x%016lx\n", rdr2());
581 	db_printf("dr3\t0x%016lx\n", rdr3());
582 	db_printf("dr6\t0x%016lx\n", rdr6());
583 	db_printf("dr7\t0x%016lx\n", rdr7());
584 }
585 #endif
586 
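/* Unpack a hardware user segment descriptor into the software form. */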
587 void
588 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
589 {
590 
591 	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
592 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
593 	ssd->ssd_type  = sd->sd_type;
594 	ssd->ssd_dpl   = sd->sd_dpl;
595 	ssd->ssd_p     = sd->sd_p;
596 	ssd->ssd_long  = sd->sd_long;
597 	ssd->ssd_def32 = sd->sd_def32;
598 	ssd->ssd_gran  = sd->sd_gran;
599 }
600 
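/* Pack a software descriptor into the hardware user segment format. */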
601 void
602 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
603 {
604 
605 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
606 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
607 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
608 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
609 	sd->sd_type  = ssd->ssd_type;
610 	sd->sd_dpl   = ssd->ssd_dpl;
611 	sd->sd_p     = ssd->ssd_p;
612 	sd->sd_long  = ssd->ssd_long;
613 	sd->sd_def32 = ssd->ssd_def32;
614 	sd->sd_gran  = ssd->ssd_gran;
615 }
616 
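/*
 * Pack a software descriptor into the 16-byte hardware system segment
 * (TSS/LDT) format, which carries a full 64-bit base address.
 */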
617 void
618 ssdtosyssd(struct soft_segment_descriptor *ssd, struct system_segment_descriptor *sd)
619 {
620 
621 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
622 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
623 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
624 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
625 	sd->sd_type  = ssd->ssd_type;
626 	sd->sd_dpl   = ssd->ssd_dpl;
627 	sd->sd_p     = ssd->ssd_p;
628 	sd->sd_gran  = ssd->ssd_gran;
629 }
630 
631 u_int basemem;
632 
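/*
 * Insert the range [base, base + length) into the sorted physmap table,
 * coalescing it with an adjacent entry when possible.  Returns 0 if the
 * table is full and 1 otherwise, including when the range is dropped
 * because it overlaps an existing entry.
 */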
633 static int
634 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
635     int *physmap_idxp)
636 {
637 	int i, insert_idx, physmap_idx;
638 
639 	physmap_idx = *physmap_idxp;
640 
641 	if (length == 0)
642 		return (1);
643 
644 	/*
645 	 * Find insertion point while checking for overlap.  Start off by
646 	 * assuming the new entry will be added to the end.
647 	 *
648 	 * NB: physmap_idx points to the next free slot.
649 	 */
650 	insert_idx = physmap_idx;
651 	for (i = 0; i <= physmap_idx; i += 2) {
652 		if (base < physmap[i + 1]) {
653 			if (base + length <= physmap[i]) {
654 				insert_idx = i;
655 				break;
656 			}
657 			if (boothowto & RB_VERBOSE)
658 				printf(
659 		    "Overlapping memory regions, ignoring second region\n");
660 			return (1);
661 		}
662 	}
663 
664 	/* See if we can prepend to the next entry. */
665 	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
666 		physmap[insert_idx] = base;
667 		return (1);
668 	}
669 
670 	/* See if we can append to the previous entry. */
671 	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
672 		physmap[insert_idx - 1] += length;
673 		return (1);
674 	}
675 
676 	physmap_idx += 2;
677 	*physmap_idxp = physmap_idx;
678 	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
679 		printf(
680 		"Too many segments in the physical address map, giving up\n");
681 		return (0);
682 	}
683 
684 	/*
685 	 * Move the last 'N' entries down to make room for the new
686 	 * entry if needed.
687 	 */
688 	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
689 		physmap[i] = physmap[i - 2];
690 		physmap[i + 1] = physmap[i - 1];
691 	}
692 
693 	/* Insert the new entry. */
694 	physmap[insert_idx] = base;
695 	physmap[insert_idx + 1] = base + length;
696 	return (1);
697 }
698 
699 void
700 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
701                       vm_paddr_t *physmap, int *physmap_idx)
702 {
703 	struct bios_smap *smap, *smapend;
704 
705 	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
706 
707 	for (smap = smapbase; smap < smapend; smap++) {
708 		if (boothowto & RB_VERBOSE)
709 			printf("SMAP type=%02x base=%016lx len=%016lx\n",
710 			    smap->type, smap->base, smap->length);
711 
712 		if (smap->type != SMAP_TYPE_MEMORY)
713 			continue;
714 
715 		if (!add_physmap_entry(smap->base, smap->length, physmap,
716 		    physmap_idx))
717 			break;
718 	}
719 }
720 
721 static void
722 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
723     int *physmap_idx)
724 {
725 	struct efi_md *map, *p;
726 	const char *type;
727 	size_t efisz;
728 	int ndesc, i;
729 
730 	static const char *types[] = {
731 		"Reserved",
732 		"LoaderCode",
733 		"LoaderData",
734 		"BootServicesCode",
735 		"BootServicesData",
736 		"RuntimeServicesCode",
737 		"RuntimeServicesData",
738 		"ConventionalMemory",
739 		"UnusableMemory",
740 		"ACPIReclaimMemory",
741 		"ACPIMemoryNVS",
742 		"MemoryMappedIO",
743 		"MemoryMappedIOPortSpace",
744 		"PalCode",
745 		"PersistentMemory"
746 	};
747 
748 	/*
749 	 * Memory map data provided by UEFI via the GetMemoryMap
750 	 * Boot Services API.
751 	 */
752 	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
753 	map = (struct efi_md *)((uint8_t *)efihdr + efisz);
754 
755 	if (efihdr->descriptor_size == 0)
756 		return;
757 	ndesc = efihdr->memory_size / efihdr->descriptor_size;
758 
759 	if (boothowto & RB_VERBOSE)
760 		printf("%23s %12s %12s %8s %4s\n",
761 		    "Type", "Physical", "Virtual", "#Pages", "Attr");
762 
763 	for (i = 0, p = map; i < ndesc; i++,
764 	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
765 		if (boothowto & RB_VERBOSE) {
766 			if (p->md_type < nitems(types))
767 				type = types[p->md_type];
768 			else
769 				type = "<INVALID>";
770 			printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
771 			    p->md_virt, p->md_pages);
772 			if (p->md_attr & EFI_MD_ATTR_UC)
773 				printf("UC ");
774 			if (p->md_attr & EFI_MD_ATTR_WC)
775 				printf("WC ");
776 			if (p->md_attr & EFI_MD_ATTR_WT)
777 				printf("WT ");
778 			if (p->md_attr & EFI_MD_ATTR_WB)
779 				printf("WB ");
780 			if (p->md_attr & EFI_MD_ATTR_UCE)
781 				printf("UCE ");
782 			if (p->md_attr & EFI_MD_ATTR_WP)
783 				printf("WP ");
784 			if (p->md_attr & EFI_MD_ATTR_RP)
785 				printf("RP ");
786 			if (p->md_attr & EFI_MD_ATTR_XP)
787 				printf("XP ");
788 			if (p->md_attr & EFI_MD_ATTR_NV)
789 				printf("NV ");
790 			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
791 				printf("MORE_RELIABLE ");
792 			if (p->md_attr & EFI_MD_ATTR_RO)
793 				printf("RO ");
794 			if (p->md_attr & EFI_MD_ATTR_RT)
795 				printf("RUNTIME");
796 			printf("\n");
797 		}
798 
799 		switch (p->md_type) {
800 		case EFI_MD_TYPE_CODE:
801 		case EFI_MD_TYPE_DATA:
802 		case EFI_MD_TYPE_BS_CODE:
803 		case EFI_MD_TYPE_BS_DATA:
804 		case EFI_MD_TYPE_FREE:
805 			/*
806 			 * We're allowed to use any entry with these types.
807 			 */
808 			break;
809 		default:
810 			continue;
811 		}
812 
813 		if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE,
814 		    physmap, physmap_idx))
815 			break;
816 	}
817 }
818 
819 static void
820 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
821 {
822 	struct bios_smap *smap;
823 	struct efi_map_header *efihdr;
824 	u_int32_t size;
825 
826 	/*
827 	 * Memory map from INT 15:E820.
828 	 *
829 	 * subr_module.c says:
830 	 * "Consumer may safely assume that size value precedes data."
831 	 * i.e., a u_int32_t immediately precedes smap.
832 	 */
833 
834 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
835 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
836 	smap = (struct bios_smap *)preload_search_info(kmdp,
837 	    MODINFO_METADATA | MODINFOMD_SMAP);
838 	if (efihdr == NULL && smap == NULL)
839 		panic("No BIOS smap or EFI map info from loader!");
840 
841 	if (efihdr != NULL) {
842 		add_efi_map_entries(efihdr, physmap, physmap_idx);
843 		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
844 	} else {
845 		size = *((u_int32_t *)smap - 1);
846 		bios_add_smap_entries(smap, size, physmap, physmap_idx);
847 		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
848 	}
849 }
850 
851 #define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)
852 
853 /*
854  * Populate the (physmap) array with base/bound pairs describing the
855  * available physical memory in the system, then test this memory and
856  * build the phys_avail array describing the actually-available memory.
857  *
858  * Total memory size may be set by the kernel environment variable
859  * hw.physmem or the compile-time define MAXMEM.
860  *
861  * XXX first should be vm_paddr_t.
862  */
863 static void
864 getmemsize(caddr_t kmdp, u_int64_t first)
865 {
866 	int i, physmap_idx, pa_indx, da_indx;
867 	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
868 	u_long physmem_start, physmem_tunable, memtest;
869 	pt_entry_t *pte;
870 	quad_t dcons_addr, dcons_size;
871 	int page_counter;
872 
873 	/*
874 	 * Tell the physical memory allocator about pages used to store
875 	 * the kernel and preloaded data.  See kmem_bootstrap_free().
876 	 */
877 	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));
878 
879 	bzero(physmap, sizeof(physmap));
880 	physmap_idx = 0;
881 
882 	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
883 	physmap_idx -= 2;
884 
885 	/*
886 	 * Find the 'base memory' segment for SMP
887 	 */
888 	basemem = 0;
889 	for (i = 0; i <= physmap_idx; i += 2) {
890 		if (physmap[i] <= 0xA0000) {
891 			basemem = physmap[i + 1] / 1024;
892 			break;
893 		}
894 	}
895 	if (basemem == 0 || basemem > 640) {
896 		if (bootverbose)
897 			printf(
898 		"Memory map doesn't contain a basemem segment, faking it\n");
899 		basemem = 640;
900 	}
901 
902 	/*
903 	 * Maxmem isn't the "maximum memory", it's one larger than the
904 	 * highest page of the physical address space.  It should be
905 	 * called something like "Maxphyspage".  We may adjust this
906 	 * based on ``hw.physmem'' and the results of the memory test.
907 	 */
908 	Maxmem = atop(physmap[physmap_idx + 1]);
909 
910 #ifdef MAXMEM
911 	Maxmem = MAXMEM / 4;
912 #endif
913 
914 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
915 		Maxmem = atop(physmem_tunable);
916 
917 	/*
918 	 * The boot memory test is disabled by default, as it takes a
919 	 * significant amount of time on large-memory systems, and is
920 	 * unfriendly to virtual machines as it unnecessarily touches all
921 	 * pages.
922 	 *
923 	 * A general name is used as the code may be extended to support
924 	 * additional tests beyond the current "page present" test.
925 	 */
926 	memtest = 0;
927 	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
928 
929 	/*
930 	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
931 	 * in the system.
932 	 */
933 	if (Maxmem > atop(physmap[physmap_idx + 1]))
934 		Maxmem = atop(physmap[physmap_idx + 1]);
935 
936 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
937 	    (boothowto & RB_VERBOSE))
938 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
939 
940 	/* call pmap initialization to make new kernel address space */
941 	pmap_bootstrap(&first);
942 
943 	/*
944 	 * Size up each available chunk of physical memory.
945 	 *
946 	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
947 	 * By default, mask off the first 16 pages unless we appear to be
948 	 * running in a VM.
949 	 */
950 	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
951 	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
952 	if (physmap[0] < physmem_start) {
953 		if (physmem_start < PAGE_SIZE)
954 			physmap[0] = PAGE_SIZE;
955 		else if (physmem_start >= physmap[1])
956 			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
957 		else
958 			physmap[0] = round_page(physmem_start);
959 	}
960 	pa_indx = 0;
961 	da_indx = 1;
962 	phys_avail[pa_indx++] = physmap[0];
963 	phys_avail[pa_indx] = physmap[0];
964 	dump_avail[da_indx] = physmap[0];
965 	pte = CMAP1;
966 
967 	/*
968 	 * Get dcons buffer address
969 	 */
970 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
971 	    getenv_quad("dcons.size", &dcons_size) == 0)
972 		dcons_addr = 0;
973 
974 	/*
975 	 * physmap is in bytes, so when converting to page boundaries,
976 	 * round up the start address and round down the end address.
977 	 */
978 	page_counter = 0;
979 	if (memtest != 0)
980 		printf("Testing system memory");
981 	for (i = 0; i <= physmap_idx; i += 2) {
982 		vm_paddr_t end;
983 
984 		end = ptoa((vm_paddr_t)Maxmem);
985 		if (physmap[i + 1] < end)
986 			end = trunc_page(physmap[i + 1]);
987 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
988 			int tmp, page_bad, full;
989 			int *ptr = (int *)CADDR1;
990 
991 			full = FALSE;
992 			/*
993 			 * block out kernel memory as not available.
994 			 */
995 			if (pa >= (vm_paddr_t)kernphys && pa < first)
996 				goto do_dump_avail;
997 
998 			/*
999 			 * block out dcons buffer
1000 			 */
1001 			if (dcons_addr > 0
1002 			    && pa >= trunc_page(dcons_addr)
1003 			    && pa < dcons_addr + dcons_size)
1004 				goto do_dump_avail;
1005 
1006 			page_bad = FALSE;
1007 			if (memtest == 0)
1008 				goto skip_memtest;
1009 
1010 			/*
1011 			 * Print a "." every GB to show we're making
1012 			 * progress.
1013 			 */
1014 			page_counter++;
1015 			if ((page_counter % PAGES_PER_GB) == 0)
1016 				printf(".");
1017 
1018 			/*
1019 			 * map page into kernel: valid, read/write,non-cacheable
1020 			 */
1021 			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
1022 			invltlb();
1023 
1024 			tmp = *(int *)ptr;
1025 			/*
1026 			 * Test for alternating 1's and 0's
1027 			 */
1028 			*(volatile int *)ptr = 0xaaaaaaaa;
1029 			if (*(volatile int *)ptr != 0xaaaaaaaa)
1030 				page_bad = TRUE;
1031 			/*
1032 			 * Test for alternating 0's and 1's
1033 			 */
1034 			*(volatile int *)ptr = 0x55555555;
1035 			if (*(volatile int *)ptr != 0x55555555)
1036 				page_bad = TRUE;
1037 			/*
1038 			 * Test for all 1's
1039 			 */
1040 			*(volatile int *)ptr = 0xffffffff;
1041 			if (*(volatile int *)ptr != 0xffffffff)
1042 				page_bad = TRUE;
1043 			/*
1044 			 * Test for all 0's
1045 			 */
1046 			*(volatile int *)ptr = 0x0;
1047 			if (*(volatile int *)ptr != 0x0)
1048 				page_bad = TRUE;
1049 			/*
1050 			 * Restore original value.
1051 			 */
1052 			*(int *)ptr = tmp;
1053 
1054 skip_memtest:
1055 			/*
1056 			 * Adjust array of valid/good pages.
1057 			 */
1058 			if (page_bad == TRUE)
1059 				continue;
1060 			/*
1061 			 * If this good page is a continuation of the
1062 			 * previous set of good pages, then just increase
1063 			 * the end pointer. Otherwise start a new chunk.
1064 			 * Note that "end" points one past the last valid
1065 			 * address, making the range >= start and < end.
1066 			 * If we're also doing a speculative memory
1067 			 * test and we're at or past the end, bump up Maxmem
1068 			 * so that we keep going. The first bad page
1069 			 * will terminate the loop.
1070 			 */
1071 			if (phys_avail[pa_indx] == pa) {
1072 				phys_avail[pa_indx] += PAGE_SIZE;
1073 			} else {
1074 				pa_indx++;
1075 				if (pa_indx == PHYS_AVAIL_ENTRIES) {
1076 					printf(
1077 		"Too many holes in the physical address space, giving up\n");
1078 					pa_indx--;
1079 					full = TRUE;
1080 					goto do_dump_avail;
1081 				}
1082 				phys_avail[pa_indx++] = pa;	/* start */
1083 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
1084 			}
1085 			physmem++;
1086 do_dump_avail:
1087 			if (dump_avail[da_indx] == pa) {
1088 				dump_avail[da_indx] += PAGE_SIZE;
1089 			} else {
1090 				da_indx++;
1091 				if (da_indx == PHYS_AVAIL_ENTRIES) {
1092 					da_indx--;
1093 					goto do_next;
1094 				}
1095 				dump_avail[da_indx++] = pa; /* start */
1096 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
1097 			}
1098 do_next:
1099 			if (full)
1100 				break;
1101 		}
1102 	}
1103 	*pte = 0;
1104 	invltlb();
1105 	if (memtest != 0)
1106 		printf("\n");
1107 
1108 	/*
1109 	 * XXX
1110 	 * The last chunk must contain at least one page plus the message
1111 	 * buffer to avoid complicating other code (message buffer address
1112 	 * calculation, etc.).
1113 	 */
1114 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
1115 	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
1116 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1117 		phys_avail[pa_indx--] = 0;
1118 		phys_avail[pa_indx--] = 0;
1119 	}
1120 
1121 	Maxmem = atop(phys_avail[pa_indx]);
1122 
1123 	/* Trim off space for the message buffer. */
1124 	phys_avail[pa_indx] -= round_page(msgbufsize);
1125 
1126 	/* Map the message buffer. */
1127 	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
1128 }
1129 
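/*
 * Relocate the loader-provided preload metadata, then pull boothowto,
 * the static kernel environment, the debugger symbol table and the EFI
 * system table address out of it.
 */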
1130 static caddr_t
1131 native_parse_preload_data(u_int64_t modulep)
1132 {
1133 	caddr_t kmdp;
1134 	char *envp;
1135 #ifdef DDB
1136 	vm_offset_t ksym_start;
1137 	vm_offset_t ksym_end;
1138 #endif
1139 
1140 	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
1141 	preload_bootstrap_relocate(KERNBASE);
1142 	kmdp = preload_search_by_type("elf kernel");
1143 	if (kmdp == NULL)
1144 		kmdp = preload_search_by_type("elf64 kernel");
1145 	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
1146 	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
1147 	if (envp != NULL)
1148 		envp += KERNBASE;
1149 	init_static_kenv(envp, 0);
1150 #ifdef DDB
1151 	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
1152 	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
1153 	db_fetch_ksymtab(ksym_start, ksym_end, 0);
1154 #endif
1155 	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
1156 
1157 	return (kmdp);
1158 }
1159 
1160 static void
1161 native_clock_source_init(void)
1162 {
1163 	i8254_init();
1164 }
1165 
1166 static void
1167 amd64_kdb_init(void)
1168 {
1169 	kdb_init();
1170 #ifdef KDB
1171 	if (boothowto & RB_KDB)
1172 		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
1173 #endif
1174 }
1175 
1176 /* Set up the fast syscall stuff */
1177 void
1178 amd64_conf_fast_syscall(void)
1179 {
1180 	uint64_t msr;
1181 
1182 	msr = rdmsr(MSR_EFER) | EFER_SCE;
1183 	wrmsr(MSR_EFER, msr);
1184 	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1185 	    (u_int64_t)IDTVEC(fast_syscall));
1186 	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
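	/*
	 * MSR_STAR layout: bits 47:32 seed the kernel CS/SS selectors
	 * loaded by SYSCALL, bits 63:48 the selectors rebuilt by SYSRET.
	 */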
1187 	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1188 	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1189 	wrmsr(MSR_STAR, msr);
1190 	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
1191 }
1192 
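/*
 * First stage of BSP per-CPU initialization: point curthread at thread0
 * and hook up the TSS, LDT and 32-bit %fs/%gs descriptor pointers in the
 * supplied pcpu area.
 */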
1193 void
1194 amd64_bsp_pcpu_init1(struct pcpu *pc)
1195 {
1196 	struct user_segment_descriptor *gdt;
1197 
1198 	PCPU_SET(prvspace, pc);
1199 	gdt = *PCPU_PTR(gdt);
1200 	PCPU_SET(curthread, &thread0);
1201 	PCPU_SET(tssp, PCPU_PTR(common_tss));
1202 	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1203 	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
1204 	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
1205 	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
1206 	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
1207 	PCPU_SET(smp_tlb_gen, 1);
1208 }
1209 
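/*
 * Second stage, run once thread0's stack is set up: record the kernel
 * stack top and the (16-byte aligned) PTI trampoline stack top.
 */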
1210 void
1211 amd64_bsp_pcpu_init2(uint64_t rsp0)
1212 {
1213 
1214 	PCPU_SET(rsp0, rsp0);
1215 	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1216 	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1217 	PCPU_SET(curpcb, thread0.td_pcb);
1218 }
1219 
1220 void
1221 amd64_bsp_ist_init(struct pcpu *pc)
1222 {
1223 	struct nmi_pcpu *np;
1224 	struct amd64tss *tssp;
1225 
1226 	tssp = &pc->pc_common_tss;
1227 
1228 	/* doublefault stack space, runs on ist1 */
1229 	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
1230 	np->np_pcpu = (register_t)pc;
1231 	tssp->tss_ist1 = (long)np;
1232 
1233 	/*
1234 	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
1235 	 * above the start of the ist2 stack.
1236 	 */
1237 	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
1238 	np->np_pcpu = (register_t)pc;
1239 	tssp->tss_ist2 = (long)np;
1240 
1241 	/*
1242 	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
1243 	 * above the start of the ist3 stack.
1244 	 */
1245 	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
1246 	np->np_pcpu = (register_t)pc;
1247 	tssp->tss_ist3 = (long)np;
1248 
1249 	/*
1250 	 * DB# stack, runs on ist4.
1251 	 */
1252 	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
1253 	np->np_pcpu = (register_t)pc;
1254 	tssp->tss_ist4 = (long)np;
1255 }
1256 
1257 /*
1258  * Calculate the kernel load address by inspecting the loader-created page table.
1259  * The assumptions:
1260  * - kernel is mapped at KERNBASE, backed by contiguous phys memory
1261  *   aligned at 2M, below 4G (the latter is important for AP startup)
1262  * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M)
1263  * - kernel is mapped with 2M superpages
1264  * - all participating memory, i.e. kernel, modules, metadata,
1265  *   page table is accessible by pre-created 1:1 mapping
1266  *   (right now loader creates 1:1 mapping for lower 4G, and all
1267  *   memory is from there)
1268  * - there is a usable memory block right after the end of the
1269  *   mapped kernel and all modules/metadata, pointed to by
1270  *   physfree, for early allocations
1271  */
1272 vm_paddr_t __nosanitizeaddress __nosanitizememory
1273 amd64_loadaddr(void)
1274 {
1275 	pml4_entry_t *pml4e;
1276 	pdp_entry_t *pdpe;
1277 	pd_entry_t *pde;
1278 	uint64_t cr3;
1279 
1280 	cr3 = rcr3();
1281 	pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART);
1282 	pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART);
1283 	pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART);
1284 	return (*pde & PG_FRAME);
1285 }
1286 
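/*
 * amd64 early initialization, called from locore with the loader's module
 * pointer and the first free physical address.  Returns the top of
 * thread0's kernel stack for locore to switch to.
 */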
1287 u_int64_t
1288 hammer_time(u_int64_t modulep, u_int64_t physfree)
1289 {
1290 	caddr_t kmdp;
1291 	int gsel_tss, x;
1292 	struct pcpu *pc;
1293 	uint64_t rsp0;
1294 	char *env;
1295 	struct user_segment_descriptor *gdt;
1296 	struct region_descriptor r_gdt;
1297 	size_t kstack0_sz;
1298 
1299 	TSRAW(&thread0, TS_ENTER, __func__, NULL);
1300 
1301 	kernphys = amd64_loadaddr();
1302 
1303 	physfree += kernphys;
1304 
1305 	kmdp = init_ops.parse_preload_data(modulep);
1306 
1307 	efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
1308 	    MODINFOMD_EFI_MAP) != NULL;
1309 
1310 	if (!efi_boot) {
1311 		/* Tell the bios to warmboot next time */
1312 		atomic_store_short((u_short *)0x472, 0x1234);
1313 	}
1314 
1315 	physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
1316 	physfree = roundup2(physfree, PAGE_SIZE);
1317 
1318 	identify_cpu1();
1319 	identify_hypervisor();
1320 	identify_hypervisor_smbios();
1321 	identify_cpu_fixup_bsp();
1322 	identify_cpu2();
1323 	initializecpucache();
1324 
1325 	/*
1326 	 * Check for pti, pcid, and invpcid before ifuncs are
1327 	 * resolved, to correctly select the implementation for
1328 	 * pmap_activate_sw_mode().
1329 	 */
1330 	pti = pti_get_default();
1331 	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1332 	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
1333 	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
1334 		invpcid_works = (cpu_stdext_feature &
1335 		    CPUID_STDEXT_INVPCID) != 0;
1336 	} else {
1337 		pmap_pcid_enabled = 0;
1338 	}
1339 
1340 	/*
1341 	 * Now we can do small core initialization, after the PCID
1342 	 * CPU features and user knobs are evaluated.
1343 	 */
1344 	TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
1345 	    &pmap_pcid_invlpg_workaround_uena);
1346 	cpu_init_small_core();
1347 
1348 	link_elf_ireloc(kmdp);
1349 
1350 	/*
1351 	 * This may be done better later if more high level components
1352 	 * are added. If so, just link td->td_proc here.
1353 	 */
1354 	proc_linkup0(&proc0, &thread0);
1355 
1356 	/* Init basic tunables, hz etc */
1357 	init_param1();
1358 
1359 	thread0.td_kstack = physfree - kernphys + KERNSTART;
1360 	thread0.td_kstack_pages = kstack_pages;
1361 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1362 	bzero((void *)thread0.td_kstack, kstack0_sz);
1363 	physfree += kstack0_sz;
1364 
1365 	/*
1366 	 * Initialize enough of thread0 for delayed invalidation to
1367 	 * work very early.  Rely on thread0.td_base_pri
1368 	 * zero-initialization, it is reset to PVM at proc0_init().
1369 	 */
1370 	pmap_thread_init_invl_gen(&thread0);
1371 
1372 	pc = &temp_bsp_pcpu;
1373 	pcpu_init(pc, 0, sizeof(struct pcpu));
1374 	gdt = &temp_bsp_pcpu.pc_gdt[0];
1375 
1376 	/*
1377 	 * make gdt memory segments
1378 	 */
1379 	for (x = 0; x < NGDT; x++) {
1380 		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1381 		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
1382 			ssdtosd(&gdt_segs[x], &gdt[x]);
1383 	}
1384 	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
1385 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
1386 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1387 
1388 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1389 	r_gdt.rd_base = (long)gdt;
1390 	lgdt(&r_gdt);
1391 
1392 	wrmsr(MSR_FSBASE, 0);		/* User value */
1393 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
1394 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
1395 
1396 	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
1397 	physfree += DPCPU_SIZE;
1398 	amd64_bsp_pcpu_init1(pc);
1399 	/* Non-late cninit() and printf() can be moved up to here. */
1400 
1401 	/*
1402 	 * Initialize mutexes.
1403 	 *
1404 	 * icu_lock: in order to allow an interrupt to occur in a critical
1405 	 * 	     section, to set pcpu->ipending (etc...) properly, we
1406 	 *	     must be able to get the icu lock, so it can't be
1407 	 *	     under witness.
1408 	 */
1409 	mutex_init();
1410 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1411 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1412 
1413 	/* exceptions */
1414 	for (x = 0; x < NIDT; x++)
1415 		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1416 		    SEL_KPL, 0);
1417 	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1418 	    SEL_KPL, 0);
1419 	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1420 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
1421 	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1422 	    SEL_UPL, 0);
1423 	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1424 	    SEL_UPL, 0);
1425 	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1426 	    SEL_KPL, 0);
1427 	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1428 	    SEL_KPL, 0);
1429 	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1430 	    SEL_KPL, 0);
1431 	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1432 	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1433 	    SDT_SYSIGT, SEL_KPL, 0);
1434 	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1435 	    SEL_KPL, 0);
1436 	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1437 	    SDT_SYSIGT, SEL_KPL, 0);
1438 	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1439 	    SEL_KPL, 0);
1440 	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1441 	    SEL_KPL, 0);
1442 	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1443 	    SEL_KPL, 0);
1444 	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1445 	    SEL_KPL, 0);
1446 	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1447 	    SEL_KPL, 0);
1448 	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1449 	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1450 	    SEL_KPL, 0);
1451 #ifdef KDTRACE_HOOKS
1452 	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1453 	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1454 #endif
1455 #ifdef XENHVM
1456 	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1457 	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1458 #endif
1459 	r_idt.rd_limit = sizeof(idt0) - 1;
1460 	r_idt.rd_base = (long) idt;
1461 	lidt(&r_idt);
1462 
1463 	/*
1464 	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
1465 	 * transition).
1466 	 * Once bootblocks have been updated, we can test directly for
1467 	 * efi_systbl != NULL here...
1468 	 */
1469 	if (efi_boot)
1470 		vty_set_preferred(VTY_VT);
1471 
1472 	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1473 	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);
1474 
1475 	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1476 	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);
1477 
1478 	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
1479 	    &syscall_ret_l1d_flush_mode);
1480 
1481 	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1482 	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);
1483 
1484 	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
1485 
1486 	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
1487 	    &x86_rngds_mitg_enable);
1488 
1489 	finishidentcpu();	/* Final stage of CPU initialization */
1490 
1491 	/*
1492 	 * Initialize the clock before the console so that console
1493 	 * initialization can use DELAY().
1494 	 */
1495 	clock_init();
1496 
1497 	initializecpu();	/* Initialize CPU registers */
1498 
1499 	amd64_bsp_ist_init(pc);
1500 
1501 	/* Set the IO permission bitmap (empty due to tss seg limit) */
1502 	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
1503 	    IOPERM_BITMAP_SIZE;
1504 
1505 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1506 	ltr(gsel_tss);
1507 
1508 	amd64_conf_fast_syscall();
1509 
1510 	/*
1511 	 * We initialize the PCB pointer early so that exception
1512 	 * handlers will work.  Also set up td_critnest to short-cut
1513 	 * the page fault handler.
1514 	 */
1515 	cpu_max_ext_state_size = sizeof(struct savefpu);
1516 	set_top_of_stack_td(&thread0);
1517 	thread0.td_pcb = get_pcb_td(&thread0);
1518 	thread0.td_critnest = 1;
1519 
1520 	/*
1521 	 * The console and kdb should be initialized even earlier than here,
1522 	 * but some console drivers don't work until after getmemsize().
1523 	 * Default to late console initialization to support these drivers.
1524 	 * This loses mainly printf()s in getmemsize() and early debugging.
1525 	 */
1526 	TUNABLE_INT_FETCH("debug.late_console", &late_console);
1527 	if (!late_console) {
1528 		cninit();
1529 		amd64_kdb_init();
1530 	}
1531 
1532 	getmemsize(kmdp, physfree);
1533 	init_param2(physmem);
1534 
1535 	/* Now running on new page tables, configured, and u/iom is accessible. */
1536 
1537 #ifdef DEV_PCI
1538 	/* This call might adjust phys_avail[]. */
1539 	pci_early_quirks();
1540 #endif
1541 
1542 	if (late_console)
1543 		cninit();
1544 
1545 	/*
1546 	 * Dump the boot metadata. We have to wait for cninit() since console
1547 	 * output is required. If it's grossly incorrect the kernel will never
1548 	 * make it this far.
1549 	 */
1550 	if (getenv_is_true("debug.dump_modinfo_at_boot"))
1551 		preload_dump();
1552 
1553 #ifdef DEV_ISA
1554 #ifdef DEV_ATPIC
1555 	elcr_probe();
1556 	atpic_startup();
1557 #else
1558 	/* Reset and mask the atpics and leave them shut down. */
1559 	atpic_reset();
1560 
1561 	/*
1562 	 * Point the ICU spurious interrupt vectors at the APIC spurious
1563 	 * interrupt handler.
1564 	 */
1565 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1566 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1567 #endif
1568 #else
1569 #error "have you forgotten the isa device?"
1570 #endif
1571 
1572 	if (late_console)
1573 		amd64_kdb_init();
1574 
1575 	msgbufinit(msgbufp, msgbufsize);
1576 	fpuinit();
1577 
1578 	/* make an initial tss so cpu can get interrupt stack on syscall! */
1579 	rsp0 = thread0.td_md.md_stack_base;
1580 	/* Ensure the stack is aligned to 16 bytes */
1581 	rsp0 &= ~0xFul;
1582 	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
1583 	amd64_bsp_pcpu_init2(rsp0);
1584 
1585 	/* transfer to user mode */
1586 
1587 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1588 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1589 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1590 	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1591 	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1592 
1593 	load_ds(_udatasel);
1594 	load_es(_udatasel);
1595 	load_fs(_ufssel);
1596 
1597 	/* setup proc 0's pcb */
1598 	thread0.td_pcb->pcb_flags = 0;
1599 
1600 	env = kern_getenv("kernelname");
1601 	if (env != NULL)
1602 		strlcpy(kernelname, env, sizeof(kernelname));
1603 
1604 	kcsan_cpu_init(0);
1605 
1606 #ifdef FDT
1607 	x86_init_fdt();
1608 #endif
1609 	thread0.td_critnest = 0;
1610 
1611 	kasan_init();
1612 	kmsan_init();
1613 
1614 	TSEXIT();
1615 
1616 	/* Location of kernel stack for locore */
1617 	return (thread0.td_md.md_stack_base);
1618 }
1619 
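/*
 * MD per-CPU initialization; the ACPI id starts out as an invalid
 * sentinel and is assigned later by the platform enumeration code.
 */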
1620 void
1621 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1622 {
1623 
1624 	pcpu->pc_acpi_id = 0xffffffff;
1625 }
1626 
1627 static int
1628 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1629 {
1630 	struct bios_smap *smapbase;
1631 	struct bios_smap_xattr smap;
1632 	caddr_t kmdp;
1633 	uint32_t *smapattr;
1634 	int count, error, i;
1635 
1636 	/* Retrieve the system memory map from the loader. */
1637 	kmdp = preload_search_by_type("elf kernel");
1638 	if (kmdp == NULL)
1639 		kmdp = preload_search_by_type("elf64 kernel");
1640 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
1641 	    MODINFO_METADATA | MODINFOMD_SMAP);
1642 	if (smapbase == NULL)
1643 		return (0);
1644 	smapattr = (uint32_t *)preload_search_info(kmdp,
1645 	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1646 	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1647 	error = 0;
1648 	for (i = 0; i < count; i++) {
1649 		smap.base = smapbase[i].base;
1650 		smap.length = smapbase[i].length;
1651 		smap.type = smapbase[i].type;
1652 		if (smapattr != NULL)
1653 			smap.xattr = smapattr[i];
1654 		else
1655 			smap.xattr = 0;
1656 		error = SYSCTL_OUT(req, &smap, sizeof(smap));
1657 	}
1658 	return (error);
1659 }
1660 SYSCTL_PROC(_machdep, OID_AUTO, smap,
1661     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1662     smap_sysctl_handler, "S,bios_smap_xattr",
1663     "Raw BIOS SMAP data");
1664 
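/* Export the raw EFI memory map handed over by the loader. */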
1665 static int
1666 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1667 {
1668 	struct efi_map_header *efihdr;
1669 	caddr_t kmdp;
1670 	uint32_t efisize;
1671 
1672 	kmdp = preload_search_by_type("elf kernel");
1673 	if (kmdp == NULL)
1674 		kmdp = preload_search_by_type("elf64 kernel");
1675 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1676 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1677 	if (efihdr == NULL)
1678 		return (0);
1679 	efisize = *((uint32_t *)efihdr - 1);
1680 	return (SYSCTL_OUT(req, efihdr, efisize));
1681 }
1682 SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
1683     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1684     efi_map_sysctl_handler, "S,efi_map_header",
1685     "Raw EFI Memory Map");
1686 
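/*
 * Spin lock entry disables interrupts on the local CPU; the nesting
 * count and the original interrupt state are kept per-thread so that
 * the outermost spinlock_exit() restores the saved flags.
 */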
1687 void
1688 spinlock_enter(void)
1689 {
1690 	struct thread *td;
1691 	register_t flags;
1692 
1693 	td = curthread;
1694 	if (td->td_md.md_spinlock_count == 0) {
1695 		flags = intr_disable();
1696 		td->td_md.md_spinlock_count = 1;
1697 		td->td_md.md_saved_flags = flags;
1698 		critical_enter();
1699 	} else
1700 		td->td_md.md_spinlock_count++;
1701 }
1702 
1703 void
1704 spinlock_exit(void)
1705 {
1706 	struct thread *td;
1707 	register_t flags;
1708 
1709 	td = curthread;
1710 	flags = td->td_md.md_saved_flags;
1711 	td->td_md.md_spinlock_count--;
1712 	if (td->td_md.md_spinlock_count == 0) {
1713 		critical_exit();
1714 		intr_restore(flags);
1715 	}
1716 }
1717 
1718 /*
1719  * Construct a PCB from a trapframe. This is called from kdb_trap() where
1720  * we want to start a backtrace from the function that caused us to enter
1721  * the debugger. We have the context in the trapframe, but base the trace
1722  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1723  * enough for a backtrace.
1724  */
1725 void
1726 makectx(struct trapframe *tf, struct pcb *pcb)
1727 {
1728 
1729 	pcb->pcb_r12 = tf->tf_r12;
1730 	pcb->pcb_r13 = tf->tf_r13;
1731 	pcb->pcb_r14 = tf->tf_r14;
1732 	pcb->pcb_r15 = tf->tf_r15;
1733 	pcb->pcb_rbp = tf->tf_rbp;
1734 	pcb->pcb_rbx = tf->tf_rbx;
1735 	pcb->pcb_rip = tf->tf_rip;
1736 	pcb->pcb_rsp = tf->tf_rsp;
1737 }
1738 
1739 /*
1740  * pcb_flags is only modified by the current thread, or by other threads
1741  * when the current thread is stopped.  However, the current thread may
1742  * change it from interrupt context in cpu_switch(), or in the trap handler.
1743  * When we read-modify-write pcb_flags from C sources, the compiler may
1744  * generate code that is not atomic with respect to the interrupt handler.
1745  * If a trap or interrupt happens and any flag is modified from the handler,
1746  * it can be clobbered with the cached value later.  We therefore implement
1747  * setting and clearing flags with single-instruction functions, which do
1748  * not race with modification of the flags from trap or interrupt context,
1749  * because traps and interrupts are taken only on instruction boundaries.
1750  */
1751 void
1752 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
1753 {
1754 
1755 	__asm __volatile("orl %1,%0"
1756 	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
1757 	    : "cc", "memory");
1758 
1759 }
1760 
1761 /*
1762  * Support for the RDFSBASE, WRFSBASE and similar instructions for the
1763  * %fs and %gs bases requires that the kernel save MSR_FSBASE and
1764  * MSR_{K,}GSBASE into the pcb if user space modified the bases.  We must
1765  * save them on a context switch or when the return to usermode happens
1766  * through doreti.
1767  *
1768  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
1769  * which has the consequence that the base MSRs must be saved each time
1770  * context switches.
1771  */
1772 static void
1773 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
1774 {
1775 	register_t r;
1776 
1777 	if (curpcb == pcb &&
1778 	    (flags & PCB_FULL_IRET) != 0 &&
1779 	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1780 		r = intr_disable();
1781 		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1782 			if (rfs() == _ufssel)
1783 				pcb->pcb_fsbase = rdfsbase();
1784 			if (rgs() == _ugssel)
1785 				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
1786 		}
1787 		set_pcb_flags_raw(pcb, flags);
1788 		intr_restore(r);
1789 	} else {
1790 		set_pcb_flags_raw(pcb, flags);
1791 	}
1792 }
1793 
1794 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
1795 {
1796 
1797 	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
1798 	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
1799 }
1800 
1801 void
1802 clear_pcb_flags(struct pcb *pcb, const u_int flags)
1803 {
1804 
1805 	__asm __volatile("andl %1,%0"
1806 	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
1807 	    : "cc", "memory");
1808 }
1809 
1810 #ifdef KDB
1811 
1812 /*
1813  * Provide inb() and outb() as functions.  They are normally only available as
1814  * inline functions, thus cannot be called from the debugger.
1815  */
1816 
1817 /* silence compiler warnings */
1818 u_char inb_(u_short);
1819 void outb_(u_short, u_char);
1820 
1821 u_char
1822 inb_(u_short port)
1823 {
1824 	return inb(port);
1825 }
1826 
1827 void
1828 outb_(u_short port, u_char data)
1829 {
1830 	outb(port, data);
1831 }
1832 
1833 #endif /* KDB */
1834 
1835 #undef memset
1836 #undef memmove
1837 #undef memcpy
1838 
1839 void	*memset_std(void *buf, int c, size_t len);
1840 void	*memset_erms(void *buf, int c, size_t len);
1841 void    *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
1842 	    size_t len);
1843 void    *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
1844 	    size_t len);
1845 void    *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
1846 	    size_t len);
1847 void    *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
1848 	    size_t len);
1849 
1850 #ifdef KCSAN
1851 /*
1852  * These fail to build as ifuncs when used with KCSAN.
1853  */
1854 void *
1855 memset(void *buf, int c, size_t len)
1856 {
1857 
1858 	return (memset_std(buf, c, len));
1859 }
1860 
1861 void *
1862 memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1863 {
1864 
1865 	return (memmove_std(dst, src, len));
1866 }
1867 
1868 void *
1869 memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1870 {
1871 
1872 	return (memcpy_std(dst, src, len));
1873 }
1874 #else
1875 DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
1876 {
1877 
1878 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1879 	    memset_erms : memset_std);
1880 }
1881 
1882 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
1883     size_t))
1884 {
1885 
1886 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1887 	    memmove_erms : memmove_std);
1888 }
1889 
1890 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
1891 {
1892 
1893 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1894 	    memcpy_erms : memcpy_std);
1895 }
1896 #endif
1897 
1898 void	pagezero_std(void *addr);
1899 void	pagezero_erms(void *addr);
1900 DEFINE_IFUNC(, void , pagezero, (void *))
1901 {
1902 
1903 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1904 	    pagezero_erms : pagezero_std);
1905 }
1906