xref: /freebsd/sys/amd64/amd64/machdep.c (revision 3494f7c0)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 2003 Peter Wemm.
5  * Copyright (c) 1992 Terrence R. Lambert.
6  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
7  * All rights reserved.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * William Jolitz.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. All advertising materials mentioning features or use of this software
21  *    must display the following acknowledgement:
22  *	This product includes software developed by the University of
23  *	California, Berkeley and its contributors.
24  * 4. Neither the name of the University nor the names of its contributors
25  *    may be used to endorse or promote products derived from this software
26  *    without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  */
40 
41 #include <sys/cdefs.h>
42 #include "opt_atpic.h"
43 #include "opt_cpu.h"
44 #include "opt_ddb.h"
45 #include "opt_inet.h"
46 #include "opt_isa.h"
47 #include "opt_kstack_pages.h"
48 #include "opt_maxmem.h"
49 #include "opt_pci.h"
50 #include "opt_platform.h"
51 #include "opt_sched.h"
52 
53 #include <sys/param.h>
54 #include <sys/proc.h>
55 #include <sys/systm.h>
56 #include <sys/asan.h>
57 #include <sys/bio.h>
58 #include <sys/buf.h>
59 #include <sys/bus.h>
60 #include <sys/callout.h>
61 #include <sys/cons.h>
62 #include <sys/cpu.h>
63 #include <sys/csan.h>
64 #include <sys/efi.h>
65 #include <sys/eventhandler.h>
66 #include <sys/exec.h>
67 #include <sys/imgact.h>
68 #include <sys/kdb.h>
69 #include <sys/kernel.h>
70 #include <sys/ktr.h>
71 #include <sys/linker.h>
72 #include <sys/lock.h>
73 #include <sys/malloc.h>
74 #include <sys/memrange.h>
75 #include <sys/msan.h>
76 #include <sys/msgbuf.h>
77 #include <sys/mutex.h>
78 #include <sys/pcpu.h>
79 #include <sys/ptrace.h>
80 #include <sys/reboot.h>
81 #include <sys/reg.h>
82 #include <sys/rwlock.h>
83 #include <sys/sched.h>
84 #include <sys/signalvar.h>
85 #ifdef SMP
86 #include <sys/smp.h>
87 #endif
88 #include <sys/syscallsubr.h>
89 #include <sys/sysctl.h>
90 #include <sys/sysent.h>
91 #include <sys/sysproto.h>
92 #include <sys/ucontext.h>
93 #include <sys/vmmeter.h>
94 
95 #include <vm/vm.h>
96 #include <vm/vm_param.h>
97 #include <vm/vm_extern.h>
98 #include <vm/vm_kern.h>
99 #include <vm/vm_page.h>
100 #include <vm/vm_map.h>
101 #include <vm/vm_object.h>
102 #include <vm/vm_pager.h>
103 #include <vm/vm_phys.h>
104 #include <vm/vm_dumpset.h>
105 
106 #ifdef DDB
107 #ifndef KDB
108 #error KDB must be enabled in order for DDB to work!
109 #endif
110 #include <ddb/ddb.h>
111 #include <ddb/db_sym.h>
112 #endif
113 
114 #include <net/netisr.h>
115 
116 #include <dev/smbios/smbios.h>
117 
118 #include <machine/clock.h>
119 #include <machine/cpu.h>
120 #include <machine/cputypes.h>
121 #include <machine/frame.h>
122 #include <machine/intr_machdep.h>
123 #include <x86/mca.h>
124 #include <machine/md_var.h>
125 #include <machine/metadata.h>
126 #include <machine/pc/bios.h>
127 #include <machine/pcb.h>
128 #include <machine/proc.h>
129 #include <machine/sigframe.h>
130 #include <machine/specialreg.h>
131 #include <machine/trap.h>
132 #include <machine/tss.h>
133 #include <x86/ucode.h>
134 #include <x86/ifunc.h>
135 #ifdef SMP
136 #include <machine/smp.h>
137 #endif
138 #ifdef FDT
139 #include <x86/fdt.h>
140 #endif
141 
142 #ifdef DEV_ATPIC
143 #include <x86/isa/icu.h>
144 #else
145 #include <x86/apicvar.h>
146 #endif
147 
148 #include <isa/isareg.h>
149 #include <isa/rtc.h>
150 #include <x86/init.h>
151 
152 /* Sanity check for __curthread() */
153 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
154 
155 /*
156  * The PTI trampoline stack needs enough space for a hardware trapframe and a
157  * couple of scratch registers, as well as the trapframe left behind after an
158  * iret fault.
159  */
160 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
161     offsetof(struct pti_frame, pti_rip));
162 
163 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
164 
165 static void cpu_startup(void *);
166 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
167 
168 /* Probe 8254 PIT and TSC. */
169 static void native_clock_source_init(void);
170 
171 /* Preload data parse function */
172 static caddr_t native_parse_preload_data(u_int64_t);
173 
174 /* Native function to fetch and parse the e820 map */
175 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
176 
177 /* Default init_ops implementation. */
178 struct init_ops init_ops = {
179 	.parse_preload_data =		native_parse_preload_data,
180 	.early_clock_source_init =	native_clock_source_init,
181 	.early_delay =			i8254_delay,
182 	.parse_memmap =			native_parse_memmap,
183 };
184 
185 /*
186  * Physical address of the EFI System Table. Stashed from the metadata hints
187  * passed into the kernel and used by the EFI code to call runtime services.
188  */
189 vm_paddr_t efi_systbl_phys;
190 
191 /* Intel ICH registers */
192 #define ICH_PMBASE	0x400
193 #define ICH_SMI_EN	(ICH_PMBASE + 0x30)
194 
195 int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
196 
197 int cold = 1;
198 
199 long Maxmem = 0;
200 long realmem = 0;
201 int late_console = 1;
202 
203 struct kva_md_info kmi;
204 
205 struct region_descriptor r_idt;
206 
207 struct pcpu *__pcpu;
208 struct pcpu temp_bsp_pcpu;
209 
210 struct mtx icu_lock;
211 
212 struct mem_range_softc mem_range_softc;
213 
214 struct mtx dt_lock;	/* lock for GDT and LDT */
215 
216 void (*vmm_resume_p)(void);
217 
218 bool efi_boot;
219 
220 static void
221 cpu_startup(void *dummy)
222 {
223 	uintmax_t memsize;
224 	char *sysenv;
225 
226 	/*
227 	 * On MacBooks, we need to prevent the legacy USB circuit from
228 	 * generating an SMI# because this can cause several problems,
229 	 * namely: incorrect CPU frequency detection and failure to
230 	 * start the APs.
231 	 * We do this by disabling a bit in the SMI_EN (SMI Control and
232 	 * Enable register) of the Intel ICH LPC Interface Bridge.
233 	 */
234 	sysenv = kern_getenv("smbios.system.product");
235 	if (sysenv != NULL) {
236 		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
237 		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
238 		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
239 		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
240 		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
241 		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
242 		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
243 		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
244 			if (bootverbose)
245 				printf("Disabling LEGACY_USB_EN bit on "
246 				    "Intel ICH.\n");
247 			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
248 		}
249 		freeenv(sysenv);
250 	}
251 
252 	/*
253 	 * Good {morning,afternoon,evening,night}.
254 	 */
255 	startrtclock();
256 	printcpuinfo();
257 
258 	/*
259 	 * Display physical memory if SMBIOS reports reasonable amount.
260 	 */
261 	memsize = 0;
262 	sysenv = kern_getenv("smbios.memory.enabled");
263 	if (sysenv != NULL) {
264 		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
265 		freeenv(sysenv);
266 	}
267 	if (memsize < ptoa((uintmax_t)vm_free_count()))
268 		memsize = ptoa((uintmax_t)Maxmem);
269 	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
270 	realmem = atop(memsize);
271 
272 	/*
273 	 * Display any holes after the first chunk of extended memory.
274 	 */
275 	if (bootverbose) {
276 		int indx;
277 
278 		printf("Physical memory chunk(s):\n");
279 		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
280 			vm_paddr_t size;
281 
282 			size = phys_avail[indx + 1] - phys_avail[indx];
283 			printf(
284 			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
285 			    (uintmax_t)phys_avail[indx],
286 			    (uintmax_t)phys_avail[indx + 1] - 1,
287 			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
288 		}
289 	}
290 
291 	vm_ksubmap_init(&kmi);
292 
293 	printf("avail memory = %ju (%ju MB)\n",
294 	    ptoa((uintmax_t)vm_free_count()),
295 	    ptoa((uintmax_t)vm_free_count()) / 1048576);
296 #ifdef DEV_PCI
297 	if (bootverbose && intel_graphics_stolen_base != 0)
298 		printf("intel stolen mem: base %#jx size %ju MB\n",
299 		    (uintmax_t)intel_graphics_stolen_base,
300 		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
301 #endif
302 
303 	/*
304 	 * Set up buffers, so they can be used to read disk labels.
305 	 */
306 	bufinit();
307 	vm_pager_bufferinit();
308 
309 	cpu_setregs();
310 }
311 
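/*
 * Late ifunc relocation pass, run near the end of SI_SUB_CPU once CPU
 * initialization has finished.
 */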
312 static void
313 late_ifunc_resolve(void *dummy __unused)
314 {
315 	link_elf_late_ireloc();
316 }
317 SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);
318 
319 
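/*
 * Set the %cr0 bits the kernel depends on: MP, NE (native FPU error
 * reporting), TS, WP (kernel write protection) and AM (alignment mask).
 */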
320 void
321 cpu_setregs(void)
322 {
323 	register_t cr0;
324 
325 	TSENTER();
326 	cr0 = rcr0();
327 	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
328 	TSENTER2("load_cr0");
329 	load_cr0(cr0);
330 	TSEXIT2("load_cr0");
331 	TSEXIT();
332 }
333 
334 /*
335  * Initialize amd64 and configure to run kernel
336  */
337 
338 /*
339  * Initialize segments & interrupt table
340  */
341 static struct gate_descriptor idt0[NIDT];
342 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
343 
344 static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
345 static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
346 static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
347 static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
348 CTASSERT(sizeof(struct nmi_pcpu) == 16);
349 
350 /*
351  * Software prototypes -- in more palatable form.
352  *
353  * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
354  * slots as the corresponding segments in the i386 kernel.
355  */
356 struct soft_segment_descriptor gdt_segs[] = {
357 /* GNULL_SEL	0 Null Descriptor */
358 {	.ssd_base = 0x0,
359 	.ssd_limit = 0x0,
360 	.ssd_type = 0,
361 	.ssd_dpl = 0,
362 	.ssd_p = 0,
363 	.ssd_long = 0,
364 	.ssd_def32 = 0,
365 	.ssd_gran = 0		},
366 /* GNULL2_SEL	1 Null Descriptor */
367 {	.ssd_base = 0x0,
368 	.ssd_limit = 0x0,
369 	.ssd_type = 0,
370 	.ssd_dpl = 0,
371 	.ssd_p = 0,
372 	.ssd_long = 0,
373 	.ssd_def32 = 0,
374 	.ssd_gran = 0		},
375 /* GUFS32_SEL	2 32 bit %fs Descriptor for user */
376 {	.ssd_base = 0x0,
377 	.ssd_limit = 0xfffff,
378 	.ssd_type = SDT_MEMRWA,
379 	.ssd_dpl = SEL_UPL,
380 	.ssd_p = 1,
381 	.ssd_long = 0,
382 	.ssd_def32 = 1,
383 	.ssd_gran = 1		},
384 /* GUGS32_SEL	3 32 bit %gs Descriptor for user */
385 {	.ssd_base = 0x0,
386 	.ssd_limit = 0xfffff,
387 	.ssd_type = SDT_MEMRWA,
388 	.ssd_dpl = SEL_UPL,
389 	.ssd_p = 1,
390 	.ssd_long = 0,
391 	.ssd_def32 = 1,
392 	.ssd_gran = 1		},
393 /* GCODE_SEL	4 Code Descriptor for kernel */
394 {	.ssd_base = 0x0,
395 	.ssd_limit = 0xfffff,
396 	.ssd_type = SDT_MEMERA,
397 	.ssd_dpl = SEL_KPL,
398 	.ssd_p = 1,
399 	.ssd_long = 1,
400 	.ssd_def32 = 0,
401 	.ssd_gran = 1		},
402 /* GDATA_SEL	5 Data Descriptor for kernel */
403 {	.ssd_base = 0x0,
404 	.ssd_limit = 0xfffff,
405 	.ssd_type = SDT_MEMRWA,
406 	.ssd_dpl = SEL_KPL,
407 	.ssd_p = 1,
408 	.ssd_long = 1,
409 	.ssd_def32 = 0,
410 	.ssd_gran = 1		},
411 /* GUCODE32_SEL	6 32 bit Code Descriptor for user */
412 {	.ssd_base = 0x0,
413 	.ssd_limit = 0xfffff,
414 	.ssd_type = SDT_MEMERA,
415 	.ssd_dpl = SEL_UPL,
416 	.ssd_p = 1,
417 	.ssd_long = 0,
418 	.ssd_def32 = 1,
419 	.ssd_gran = 1		},
420 /* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
421 {	.ssd_base = 0x0,
422 	.ssd_limit = 0xfffff,
423 	.ssd_type = SDT_MEMRWA,
424 	.ssd_dpl = SEL_UPL,
425 	.ssd_p = 1,
426 	.ssd_long = 0,
427 	.ssd_def32 = 1,
428 	.ssd_gran = 1		},
429 /* GUCODE_SEL	8 64 bit Code Descriptor for user */
430 {	.ssd_base = 0x0,
431 	.ssd_limit = 0xfffff,
432 	.ssd_type = SDT_MEMERA,
433 	.ssd_dpl = SEL_UPL,
434 	.ssd_p = 1,
435 	.ssd_long = 1,
436 	.ssd_def32 = 0,
437 	.ssd_gran = 1		},
438 /* GPROC0_SEL	9 Proc 0 Tss Descriptor */
439 {	.ssd_base = 0x0,
440 	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
441 	.ssd_type = SDT_SYSTSS,
442 	.ssd_dpl = SEL_KPL,
443 	.ssd_p = 1,
444 	.ssd_long = 0,
445 	.ssd_def32 = 0,
446 	.ssd_gran = 0		},
447 /* Actually, the TSS is a system descriptor which is double size */
448 {	.ssd_base = 0x0,
449 	.ssd_limit = 0x0,
450 	.ssd_type = 0,
451 	.ssd_dpl = 0,
452 	.ssd_p = 0,
453 	.ssd_long = 0,
454 	.ssd_def32 = 0,
455 	.ssd_gran = 0		},
456 /* GUSERLDT_SEL	11 LDT Descriptor */
457 {	.ssd_base = 0x0,
458 	.ssd_limit = 0x0,
459 	.ssd_type = 0,
460 	.ssd_dpl = 0,
461 	.ssd_p = 0,
462 	.ssd_long = 0,
463 	.ssd_def32 = 0,
464 	.ssd_gran = 0		},
465 /* GUSERLDT_SEL	12 LDT Descriptor, double size */
466 {	.ssd_base = 0x0,
467 	.ssd_limit = 0x0,
468 	.ssd_type = 0,
469 	.ssd_dpl = 0,
470 	.ssd_p = 0,
471 	.ssd_long = 0,
472 	.ssd_def32 = 0,
473 	.ssd_gran = 0		},
474 };
475 _Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");
476 
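/*
 * Install handler 'func' in IDT slot 'idx' with gate type 'typ', privilege
 * level 'dpl' and interrupt stack table index 'ist' (0 selects the normal
 * kernel stack).
 */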
477 void
478 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
479 {
480 	struct gate_descriptor *ip;
481 
482 	ip = idt + idx;
483 	ip->gd_looffset = (uintptr_t)func;
484 	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
485 	ip->gd_ist = ist;
486 	ip->gd_xx = 0;
487 	ip->gd_type = typ;
488 	ip->gd_dpl = dpl;
489 	ip->gd_p = 1;
490 	ip->gd_hioffset = ((uintptr_t)func) >> 16;
491 }
492 
493 extern inthand_t
494 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
495 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
496 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
497 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
498 	IDTVEC(xmm), IDTVEC(dblfault),
499 	IDTVEC(div_pti), IDTVEC(bpt_pti),
500 	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
501 	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
502 	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
503 	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
504 	IDTVEC(xmm_pti),
505 #ifdef KDTRACE_HOOKS
506 	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
507 #endif
508 #ifdef XENHVM
509 	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
510 #endif
511 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
512 	IDTVEC(fast_syscall_pti);
513 
514 #ifdef DDB
515 /*
516  * Display the index and function name of any IDT entries that don't use
517  * the default 'rsvd' entry point.
518  */
519 DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE)
520 {
521 	struct gate_descriptor *ip;
522 	int idx;
523 	uintptr_t func;
524 
525 	ip = idt;
526 	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
527 		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
528 		if (func != (uintptr_t)&IDTVEC(rsvd)) {
529 			db_printf("%3d\t", idx);
530 			db_printsym(func, DB_STGY_PROC);
531 			db_printf("\n");
532 		}
533 		ip++;
534 	}
535 }
536 
537 /* Show privileged registers. */
538 DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE)
539 {
540 	struct {
541 		uint16_t limit;
542 		uint64_t base;
543 	} __packed idtr, gdtr;
544 	uint16_t ldt, tr;
545 
546 	__asm __volatile("sidt %0" : "=m" (idtr));
547 	db_printf("idtr\t0x%016lx/%04x\n",
548 	    (u_long)idtr.base, (u_int)idtr.limit);
549 	__asm __volatile("sgdt %0" : "=m" (gdtr));
550 	db_printf("gdtr\t0x%016lx/%04x\n",
551 	    (u_long)gdtr.base, (u_int)gdtr.limit);
552 	__asm __volatile("sldt %0" : "=r" (ldt));
553 	db_printf("ldtr\t0x%04x\n", ldt);
554 	__asm __volatile("str %0" : "=r" (tr));
555 	db_printf("tr\t0x%04x\n", tr);
556 	db_printf("cr0\t0x%016lx\n", rcr0());
557 	db_printf("cr2\t0x%016lx\n", rcr2());
558 	db_printf("cr3\t0x%016lx\n", rcr3());
559 	db_printf("cr4\t0x%016lx\n", rcr4());
560 	if (rcr4() & CR4_XSAVE)
561 		db_printf("xcr0\t0x%016lx\n", rxcr(0));
562 	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
563 	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
564 		db_printf("FEATURES_CTL\t%016lx\n",
565 		    rdmsr(MSR_IA32_FEATURE_CONTROL));
566 	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
567 	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
568 	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
569 }
570 
571 DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE)
572 {
573 
574 	db_printf("dr0\t0x%016lx\n", rdr0());
575 	db_printf("dr1\t0x%016lx\n", rdr1());
576 	db_printf("dr2\t0x%016lx\n", rdr2());
577 	db_printf("dr3\t0x%016lx\n", rdr3());
578 	db_printf("dr6\t0x%016lx\n", rdr6());
579 	db_printf("dr7\t0x%016lx\n", rdr7());
580 }
581 #endif
582 
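/* Convert a hardware user segment descriptor to its software representation. */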
583 void
584 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
585 {
586 
587 	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
588 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
589 	ssd->ssd_type  = sd->sd_type;
590 	ssd->ssd_dpl   = sd->sd_dpl;
591 	ssd->ssd_p     = sd->sd_p;
592 	ssd->ssd_long  = sd->sd_long;
593 	ssd->ssd_def32 = sd->sd_def32;
594 	ssd->ssd_gran  = sd->sd_gran;
595 }
596 
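/* Convert a software segment descriptor to the hardware user segment format. */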
597 void
598 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
599 {
600 
601 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
602 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
603 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
604 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
605 	sd->sd_type  = ssd->ssd_type;
606 	sd->sd_dpl   = ssd->ssd_dpl;
607 	sd->sd_p     = ssd->ssd_p;
608 	sd->sd_long  = ssd->ssd_long;
609 	sd->sd_def32 = ssd->ssd_def32;
610 	sd->sd_gran  = ssd->ssd_gran;
611 }
612 
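/*
 * Convert a software segment descriptor to the 16-byte hardware format used
 * for long mode system segments (TSS and LDT descriptors).
 */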
613 void
614 ssdtosyssd(struct soft_segment_descriptor *ssd, struct system_segment_descriptor *sd)
615 {
616 
617 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
618 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
619 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
620 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
621 	sd->sd_type  = ssd->ssd_type;
622 	sd->sd_dpl   = ssd->ssd_dpl;
623 	sd->sd_p     = ssd->ssd_p;
624 	sd->sd_gran  = ssd->ssd_gran;
625 }
626 
627 u_int basemem;
628 
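/*
 * Insert the range [base, base + length) into the physmap array, merging it
 * with an adjacent entry when possible and ignoring ranges that overlap an
 * existing entry.  Returns 0 if the array is full, non-zero otherwise.
 */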
629 static int
630 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
631     int *physmap_idxp)
632 {
633 	int i, insert_idx, physmap_idx;
634 
635 	physmap_idx = *physmap_idxp;
636 
637 	if (length == 0)
638 		return (1);
639 
640 	/*
641 	 * Find insertion point while checking for overlap.  Start off by
642 	 * assuming the new entry will be added to the end.
643 	 *
644 	 * NB: physmap_idx points to the next free slot.
645 	 */
646 	insert_idx = physmap_idx;
647 	for (i = 0; i <= physmap_idx; i += 2) {
648 		if (base < physmap[i + 1]) {
649 			if (base + length <= physmap[i]) {
650 				insert_idx = i;
651 				break;
652 			}
653 			if (boothowto & RB_VERBOSE)
654 				printf(
655 		    "Overlapping memory regions, ignoring second region\n");
656 			return (1);
657 		}
658 	}
659 
660 	/* See if we can prepend to the next entry. */
661 	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
662 		physmap[insert_idx] = base;
663 		return (1);
664 	}
665 
666 	/* See if we can append to the previous entry. */
667 	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
668 		physmap[insert_idx - 1] += length;
669 		return (1);
670 	}
671 
672 	physmap_idx += 2;
673 	*physmap_idxp = physmap_idx;
674 	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
675 		printf(
676 		"Too many segments in the physical address map, giving up\n");
677 		return (0);
678 	}
679 
680 	/*
681 	 * Move the last 'N' entries down to make room for the new
682 	 * entry if needed.
683 	 */
684 	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
685 		physmap[i] = physmap[i - 2];
686 		physmap[i + 1] = physmap[i - 1];
687 	}
688 
689 	/* Insert the new entry. */
690 	physmap[insert_idx] = base;
691 	physmap[insert_idx + 1] = base + length;
692 	return (1);
693 }
694 
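/*
 * Walk the BIOS SMAP provided by the loader and add each entry of type
 * SMAP_TYPE_MEMORY to the physmap array.
 */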
695 void
696 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
697                       vm_paddr_t *physmap, int *physmap_idx)
698 {
699 	struct bios_smap *smap, *smapend;
700 
701 	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
702 
703 	for (smap = smapbase; smap < smapend; smap++) {
704 		if (boothowto & RB_VERBOSE)
705 			printf("SMAP type=%02x base=%016lx len=%016lx\n",
706 			    smap->type, smap->base, smap->length);
707 
708 		if (smap->type != SMAP_TYPE_MEMORY)
709 			continue;
710 
711 		if (!add_physmap_entry(smap->base, smap->length, physmap,
712 		    physmap_idx))
713 			break;
714 	}
715 }
716 
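/*
 * Walk the UEFI memory map and add the descriptors that represent usable
 * RAM to the physmap array, printing the full map when booting verbose.
 */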
717 static void
718 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
719     int *physmap_idx)
720 {
721 	struct efi_md *map, *p;
722 	const char *type;
723 	size_t efisz;
724 	int ndesc, i;
725 
726 	static const char *types[] = {
727 		"Reserved",
728 		"LoaderCode",
729 		"LoaderData",
730 		"BootServicesCode",
731 		"BootServicesData",
732 		"RuntimeServicesCode",
733 		"RuntimeServicesData",
734 		"ConventionalMemory",
735 		"UnusableMemory",
736 		"ACPIReclaimMemory",
737 		"ACPIMemoryNVS",
738 		"MemoryMappedIO",
739 		"MemoryMappedIOPortSpace",
740 		"PalCode",
741 		"PersistentMemory"
742 	};
743 
744 	/*
745 	 * Memory map data provided by UEFI via the GetMemoryMap
746 	 * Boot Services API.
747 	 */
748 	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
749 	map = (struct efi_md *)((uint8_t *)efihdr + efisz);
750 
751 	if (efihdr->descriptor_size == 0)
752 		return;
753 	ndesc = efihdr->memory_size / efihdr->descriptor_size;
754 
755 	if (boothowto & RB_VERBOSE)
756 		printf("%23s %12s %12s %8s %4s\n",
757 		    "Type", "Physical", "Virtual", "#Pages", "Attr");
758 
759 	for (i = 0, p = map; i < ndesc; i++,
760 	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
761 		if (boothowto & RB_VERBOSE) {
762 			if (p->md_type < nitems(types))
763 				type = types[p->md_type];
764 			else
765 				type = "<INVALID>";
766 			printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
767 			    p->md_virt, p->md_pages);
768 			if (p->md_attr & EFI_MD_ATTR_UC)
769 				printf("UC ");
770 			if (p->md_attr & EFI_MD_ATTR_WC)
771 				printf("WC ");
772 			if (p->md_attr & EFI_MD_ATTR_WT)
773 				printf("WT ");
774 			if (p->md_attr & EFI_MD_ATTR_WB)
775 				printf("WB ");
776 			if (p->md_attr & EFI_MD_ATTR_UCE)
777 				printf("UCE ");
778 			if (p->md_attr & EFI_MD_ATTR_WP)
779 				printf("WP ");
780 			if (p->md_attr & EFI_MD_ATTR_RP)
781 				printf("RP ");
782 			if (p->md_attr & EFI_MD_ATTR_XP)
783 				printf("XP ");
784 			if (p->md_attr & EFI_MD_ATTR_NV)
785 				printf("NV ");
786 			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
787 				printf("MORE_RELIABLE ");
788 			if (p->md_attr & EFI_MD_ATTR_RO)
789 				printf("RO ");
790 			if (p->md_attr & EFI_MD_ATTR_RT)
791 				printf("RUNTIME");
792 			printf("\n");
793 		}
794 
795 		switch (p->md_type) {
796 		case EFI_MD_TYPE_CODE:
797 		case EFI_MD_TYPE_DATA:
798 		case EFI_MD_TYPE_BS_CODE:
799 		case EFI_MD_TYPE_BS_DATA:
800 		case EFI_MD_TYPE_FREE:
801 			/*
802 			 * We're allowed to use any entry with these types.
803 			 */
804 			break;
805 		default:
806 			continue;
807 		}
808 
809 		if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE,
810 		    physmap, physmap_idx))
811 			break;
812 	}
813 }
814 
815 static void
816 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
817 {
818 	struct bios_smap *smap;
819 	struct efi_map_header *efihdr;
820 	u_int32_t size;
821 
822 	/*
823 	 * Memory map from INT 15:E820.
824 	 *
825 	 * subr_module.c says:
826 	 * "Consumer may safely assume that size value precedes data."
827 	 * ie: an int32_t immediately precedes smap.
828 	 */
829 
830 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
831 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
832 	smap = (struct bios_smap *)preload_search_info(kmdp,
833 	    MODINFO_METADATA | MODINFOMD_SMAP);
834 	if (efihdr == NULL && smap == NULL)
835 		panic("No BIOS smap or EFI map info from loader!");
836 
837 	if (efihdr != NULL) {
838 		add_efi_map_entries(efihdr, physmap, physmap_idx);
839 		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
840 	} else {
841 		size = *((u_int32_t *)smap - 1);
842 		bios_add_smap_entries(smap, size, physmap, physmap_idx);
843 		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
844 	}
845 }
846 
847 #define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)
848 
849 /*
850  * Populate the (physmap) array with base/bound pairs describing the
851  * available physical memory in the system, then test this memory and
852  * build the phys_avail array describing the actually-available memory.
853  *
854  * Total memory size may be set by the kernel environment variable
855  * hw.physmem or the compile-time define MAXMEM.
856  *
857  * XXX first should be vm_paddr_t.
858  */
859 static void
860 getmemsize(caddr_t kmdp, u_int64_t first)
861 {
862 	int i, physmap_idx, pa_indx, da_indx;
863 	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
864 	u_long physmem_start, physmem_tunable, memtest;
865 	pt_entry_t *pte;
866 	quad_t dcons_addr, dcons_size;
867 	int page_counter;
868 
869 	TSENTER();
870 	/*
871 	 * Tell the physical memory allocator about pages used to store
872 	 * the kernel and preloaded data.  See kmem_bootstrap_free().
873 	 */
874 	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));
875 
876 	bzero(physmap, sizeof(physmap));
877 	physmap_idx = 0;
878 
879 	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
880 	physmap_idx -= 2;
881 
882 	/*
883 	 * Find the 'base memory' segment for SMP
884 	 */
885 	basemem = 0;
886 	for (i = 0; i <= physmap_idx; i += 2) {
887 		if (physmap[i] <= 0xA0000) {
888 			basemem = physmap[i + 1] / 1024;
889 			break;
890 		}
891 	}
892 	if (basemem == 0 || basemem > 640) {
893 		if (bootverbose)
894 			printf(
895 		"Memory map doesn't contain a basemem segment, faking it");
896 		basemem = 640;
897 	}
898 
899 	/*
900 	 * Maxmem isn't the "maximum memory", it's one larger than the
901 	 * highest page of the physical address space.  It should be
902 	 * called something like "Maxphyspage".  We may adjust this
903 	 * based on ``hw.physmem'' and the results of the memory test.
904 	 */
905 	Maxmem = atop(physmap[physmap_idx + 1]);
906 
907 #ifdef MAXMEM
908 	Maxmem = MAXMEM / 4;
909 #endif
910 
911 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
912 		Maxmem = atop(physmem_tunable);
913 
914 	/*
915 	 * The boot memory test is disabled by default, as it takes a
916 	 * significant amount of time on large-memory systems, and is
917 	 * unfriendly to virtual machines as it unnecessarily touches all
918 	 * pages.
919 	 *
920 	 * A general name is used as the code may be extended to support
921 	 * additional tests beyond the current "page present" test.
922 	 */
923 	memtest = 0;
924 	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
925 
926 	/*
927 	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
928 	 * in the system.
929 	 */
930 	if (Maxmem > atop(physmap[physmap_idx + 1]))
931 		Maxmem = atop(physmap[physmap_idx + 1]);
932 
933 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
934 	    (boothowto & RB_VERBOSE))
935 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
936 
937 	/* call pmap initialization to make new kernel address space */
938 	pmap_bootstrap(&first);
939 
940 	/*
941 	 * Size up each available chunk of physical memory.
942 	 *
943 	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
944 	 * By default, mask off the first 16 pages unless we appear to be
945 	 * running in a VM.
946 	 */
947 	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
948 	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
949 	if (physmap[0] < physmem_start) {
950 		if (physmem_start < PAGE_SIZE)
951 			physmap[0] = PAGE_SIZE;
952 		else if (physmem_start >= physmap[1])
953 			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
954 		else
955 			physmap[0] = round_page(physmem_start);
956 	}
957 	pa_indx = 0;
958 	da_indx = 1;
959 	phys_avail[pa_indx++] = physmap[0];
960 	phys_avail[pa_indx] = physmap[0];
961 	dump_avail[da_indx] = physmap[0];
962 	pte = CMAP1;
963 
964 	/*
965 	 * Get dcons buffer address
966 	 */
967 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
968 	    getenv_quad("dcons.size", &dcons_size) == 0)
969 		dcons_addr = 0;
970 
971 	/*
972 	 * physmap is in bytes, so when converting to page boundaries,
973 	 * round up the start address and round down the end address.
974 	 */
975 	page_counter = 0;
976 	if (memtest != 0)
977 		printf("Testing system memory");
978 	for (i = 0; i <= physmap_idx; i += 2) {
979 		vm_paddr_t end;
980 
981 		end = ptoa((vm_paddr_t)Maxmem);
982 		if (physmap[i + 1] < end)
983 			end = trunc_page(physmap[i + 1]);
984 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
985 			int tmp, page_bad, full;
986 			int *ptr = (int *)CADDR1;
987 
988 			full = FALSE;
989 			/*
990 			 * block out kernel memory as not available.
991 			 */
992 			if (pa >= (vm_paddr_t)kernphys && pa < first)
993 				goto do_dump_avail;
994 
995 			/*
996 			 * block out dcons buffer
997 			 */
998 			if (dcons_addr > 0
999 			    && pa >= trunc_page(dcons_addr)
1000 			    && pa < dcons_addr + dcons_size)
1001 				goto do_dump_avail;
1002 
1003 			page_bad = FALSE;
1004 			if (memtest == 0)
1005 				goto skip_memtest;
1006 
1007 			/*
1008 			 * Print a "." every GB to show we're making
1009 			 * progress.
1010 			 */
1011 			page_counter++;
1012 			if ((page_counter % PAGES_PER_GB) == 0)
1013 				printf(".");
1014 
1015 			/*
1016 			 * map page into kernel: valid, read/write,non-cacheable
1017 			 */
1018 			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
1019 			invltlb();
1020 
1021 			tmp = *(int *)ptr;
1022 			/*
1023 			 * Test for alternating 1's and 0's
1024 			 */
1025 			*(volatile int *)ptr = 0xaaaaaaaa;
1026 			if (*(volatile int *)ptr != 0xaaaaaaaa)
1027 				page_bad = TRUE;
1028 			/*
1029 			 * Test for alternating 0's and 1's
1030 			 */
1031 			*(volatile int *)ptr = 0x55555555;
1032 			if (*(volatile int *)ptr != 0x55555555)
1033 				page_bad = TRUE;
1034 			/*
1035 			 * Test for all 1's
1036 			 */
1037 			*(volatile int *)ptr = 0xffffffff;
1038 			if (*(volatile int *)ptr != 0xffffffff)
1039 				page_bad = TRUE;
1040 			/*
1041 			 * Test for all 0's
1042 			 */
1043 			*(volatile int *)ptr = 0x0;
1044 			if (*(volatile int *)ptr != 0x0)
1045 				page_bad = TRUE;
1046 			/*
1047 			 * Restore original value.
1048 			 */
1049 			*(int *)ptr = tmp;
1050 
1051 skip_memtest:
1052 			/*
1053 			 * Adjust array of valid/good pages.
1054 			 */
1055 			if (page_bad == TRUE)
1056 				continue;
1057 			/*
1058 			 * If this good page is a continuation of the
1059 			 * previous set of good pages, then just increase
1060 			 * the end pointer. Otherwise start a new chunk.
1061 			 * Note that "end" points one byte past the last valid
1062 			 * address, making the range >= start and < end.
1063 			 * If we're also doing a speculative memory
1064 			 * test and we are at or past the end, bump up Maxmem
1065 			 * so that we keep going. The first bad page
1066 			 * will terminate the loop.
1067 			 */
1068 			if (phys_avail[pa_indx] == pa) {
1069 				phys_avail[pa_indx] += PAGE_SIZE;
1070 			} else {
1071 				pa_indx++;
1072 				if (pa_indx == PHYS_AVAIL_ENTRIES) {
1073 					printf(
1074 		"Too many holes in the physical address space, giving up\n");
1075 					pa_indx--;
1076 					full = TRUE;
1077 					goto do_dump_avail;
1078 				}
1079 				phys_avail[pa_indx++] = pa;	/* start */
1080 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
1081 			}
1082 			physmem++;
1083 do_dump_avail:
1084 			if (dump_avail[da_indx] == pa) {
1085 				dump_avail[da_indx] += PAGE_SIZE;
1086 			} else {
1087 				da_indx++;
1088 				if (da_indx == PHYS_AVAIL_ENTRIES) {
1089 					da_indx--;
1090 					goto do_next;
1091 				}
1092 				dump_avail[da_indx++] = pa; /* start */
1093 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
1094 			}
1095 do_next:
1096 			if (full)
1097 				break;
1098 		}
1099 	}
1100 	*pte = 0;
1101 	invltlb();
1102 	if (memtest != 0)
1103 		printf("\n");
1104 
1105 	/*
1106 	 * XXX
1107 	 * The last chunk must contain at least one page plus the message
1108 	 * buffer to avoid complicating other code (message buffer address
1109 	 * calculation, etc.).
1110 	 */
1111 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
1112 	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
1113 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1114 		phys_avail[pa_indx--] = 0;
1115 		phys_avail[pa_indx--] = 0;
1116 	}
1117 
1118 	Maxmem = atop(phys_avail[pa_indx]);
1119 
1120 	/* Trim off space for the message buffer. */
1121 	phys_avail[pa_indx] -= round_page(msgbufsize);
1122 
1123 	/* Map the message buffer. */
1124 	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
1125 	TSEXIT();
1126 }
1127 
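/*
 * Locate the metadata supplied by the loader, initialize boothowto, the
 * static kernel environment and (with DDB) the kernel symbol table from it,
 * and return the kernel's preload metadata pointer.
 */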
1128 static caddr_t
1129 native_parse_preload_data(u_int64_t modulep)
1130 {
1131 	caddr_t kmdp;
1132 	char *envp;
1133 #ifdef DDB
1134 	vm_offset_t ksym_start;
1135 	vm_offset_t ksym_end;
1136 #endif
1137 
1138 	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
1139 	preload_bootstrap_relocate(KERNBASE);
1140 	kmdp = preload_search_by_type("elf kernel");
1141 	if (kmdp == NULL)
1142 		kmdp = preload_search_by_type("elf64 kernel");
1143 	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
1144 	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
1145 	if (envp != NULL)
1146 		envp += KERNBASE;
1147 	init_static_kenv(envp, 0);
1148 #ifdef DDB
1149 	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
1150 	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
1151 	db_fetch_ksymtab(ksym_start, ksym_end, 0);
1152 #endif
1153 	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
1154 
1155 	return (kmdp);
1156 }
1157 
1158 static void
1159 native_clock_source_init(void)
1160 {
1161 	i8254_init();
1162 }
1163 
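/* Initialize the debugger and enter it now if the boot flags request it. */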
1164 static void
1165 amd64_kdb_init(void)
1166 {
1167 	kdb_init();
1168 #ifdef KDB
1169 	if (boothowto & RB_KDB)
1170 		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
1171 #endif
1172 }
1173 
1174 /* Set up the fast syscall stuff */
1175 void
1176 amd64_conf_fast_syscall(void)
1177 {
1178 	uint64_t msr;
1179 
1180 	msr = rdmsr(MSR_EFER) | EFER_SCE;
1181 	wrmsr(MSR_EFER, msr);
1182 	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1183 	    (u_int64_t)IDTVEC(fast_syscall));
1184 	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
1185 	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1186 	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1187 	wrmsr(MSR_STAR, msr);
1188 	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
1189 }
1190 
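/*
 * First stage of BSP pcpu initialization: record the pcpu area, curthread and
 * the per-CPU TSS, LDT and 32-bit %fs/%gs descriptor pointers.
 */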
1191 void
1192 amd64_bsp_pcpu_init1(struct pcpu *pc)
1193 {
1194 	struct user_segment_descriptor *gdt;
1195 
1196 	PCPU_SET(prvspace, pc);
1197 	gdt = *PCPU_PTR(gdt);
1198 	PCPU_SET(curthread, &thread0);
1199 	PCPU_SET(tssp, PCPU_PTR(common_tss));
1200 	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1201 	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
1202 	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
1203 	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
1204 	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
1205 	PCPU_SET(smp_tlb_gen, 1);
1206 }
1207 
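/*
 * Second stage of BSP pcpu initialization: record the kernel stack top, the
 * PTI trampoline stack and thread0's pcb.
 */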
1208 void
1209 amd64_bsp_pcpu_init2(uint64_t rsp0)
1210 {
1211 
1212 	PCPU_SET(rsp0, rsp0);
1213 	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1214 	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1215 	PCPU_SET(curpcb, thread0.td_pcb);
1216 }
1217 
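/*
 * Assign dedicated interrupt-stack-table stacks on the BSP: IST1 for double
 * faults, IST2 for NMI, IST3 for machine checks and IST4 for debug traps,
 * each with the pcpu pointer stored at its top.
 */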
1218 void
1219 amd64_bsp_ist_init(struct pcpu *pc)
1220 {
1221 	struct nmi_pcpu *np;
1222 	struct amd64tss *tssp;
1223 
1224 	tssp = &pc->pc_common_tss;
1225 
1226 	/* doublefault stack space, runs on ist1 */
1227 	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
1228 	np->np_pcpu = (register_t)pc;
1229 	tssp->tss_ist1 = (long)np;
1230 
1231 	/*
1232 	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
1233 	 * above the start of the ist2 stack.
1234 	 */
1235 	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
1236 	np->np_pcpu = (register_t)pc;
1237 	tssp->tss_ist2 = (long)np;
1238 
1239 	/*
1240 	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
1241 	 * above the start of the ist3 stack.
1242 	 */
1243 	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
1244 	np->np_pcpu = (register_t)pc;
1245 	tssp->tss_ist3 = (long)np;
1246 
1247 	/*
1248 	 * DB# stack, runs on ist4.
1249 	 */
1250 	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
1251 	np->np_pcpu = (register_t)pc;
1252 	tssp->tss_ist4 = (long)np;
1253 }
1254 
1255 /*
1256  * Calculate the kernel load address from the page table created by the loader.
1257  * The assumptions:
1258  * - kernel is mapped at KERNBASE, backed by contiguous phys memory
1259  *   aligned at 2M, below 4G (the latter is important for AP startup)
1260  * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M)
1261  * - kernel is mapped with 2M superpages
1262  * - all participating memory, i.e. kernel, modules, metadata,
1263  *   page table is accessible by pre-created 1:1 mapping
1264  *   (right now loader creates 1:1 mapping for lower 4G, and all
1265  *   memory is from there)
1266  * - there is a usable memory block right after the end of the
1267  *   mapped kernel and all modules/metadata, pointed to by
1268  *   physfree, for early allocations
1269  */
1270 vm_paddr_t __nosanitizeaddress __nosanitizememory
1271 amd64_loadaddr(void)
1272 {
1273 	pml4_entry_t *pml4e;
1274 	pdp_entry_t *pdpe;
1275 	pd_entry_t *pde;
1276 	uint64_t cr3;
1277 
1278 	cr3 = rcr3();
1279 	pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART);
1280 	pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART);
1281 	pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART);
1282 	return (*pde & PG_FRAME);
1283 }
1284 
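/*
 * hammer_time() is the first C code run on the BSP.  It is entered from
 * locore with the preloaded module pointer and the first free physical
 * address, performs early machine-dependent initialization, and returns the
 * address of thread0's kernel stack for locore to switch onto.
 */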
1285 u_int64_t
1286 hammer_time(u_int64_t modulep, u_int64_t physfree)
1287 {
1288 	caddr_t kmdp;
1289 	int gsel_tss, x;
1290 	struct pcpu *pc;
1291 	uint64_t rsp0;
1292 	char *env;
1293 	struct user_segment_descriptor *gdt;
1294 	struct region_descriptor r_gdt;
1295 	size_t kstack0_sz;
1296 
1297 	TSRAW(&thread0, TS_ENTER, __func__, NULL);
1298 
1299 	kernphys = amd64_loadaddr();
1300 
1301 	physfree += kernphys;
1302 
1303 	kmdp = init_ops.parse_preload_data(modulep);
1304 
1305 	efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
1306 	    MODINFOMD_EFI_MAP) != NULL;
1307 
1308 	if (!efi_boot) {
1309 		/* Tell the bios to warmboot next time */
1310 		atomic_store_short((u_short *)0x472, 0x1234);
1311 	}
1312 
1313 	physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
1314 	physfree = roundup2(physfree, PAGE_SIZE);
1315 
1316 	identify_cpu1();
1317 	identify_hypervisor();
1318 	identify_hypervisor_smbios();
1319 	identify_cpu_fixup_bsp();
1320 	identify_cpu2();
1321 	initializecpucache();
1322 
1323 	/*
1324 	 * Check for pti, pcid, and invpcid before ifuncs are
1325 	 * resolved, to correctly select the implementation for
1326 	 * pmap_activate_sw_mode().
1327 	 */
1328 	pti = pti_get_default();
1329 	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1330 	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
1331 	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
1332 		invpcid_works = (cpu_stdext_feature &
1333 		    CPUID_STDEXT_INVPCID) != 0;
1334 	} else {
1335 		pmap_pcid_enabled = 0;
1336 	}
1337 
1338 	/*
1339 	 * Now we can do small core initialization, after the PCID
1340 	 * CPU features and user knobs are evaluated.
1341 	 */
1342 	TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
1343 	    &pmap_pcid_invlpg_workaround_uena);
1344 	cpu_init_small_core();
1345 
1346 	if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
1347 		use_xsave = 1;
1348 		TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave);
1349 	}
1350 
1351 	link_elf_ireloc(kmdp);
1352 
1353 	/*
1354 	 * This may be done better later if it gets more high level
1355 	 * components in it. If so just link td->td_proc here.
1356 	 */
1357 	proc_linkup0(&proc0, &thread0);
1358 
1359 	/* Init basic tunables, hz etc */
1360 	init_param1();
1361 
1362 	thread0.td_kstack = physfree - kernphys + KERNSTART;
1363 	thread0.td_kstack_pages = kstack_pages;
1364 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1365 	bzero((void *)thread0.td_kstack, kstack0_sz);
1366 	physfree += kstack0_sz;
1367 
1368 	/*
1369 	 * Initialize enough of thread0 for delayed invalidation to
1370 	 * work very early.  Rely on thread0.td_base_pri
1371 	 * zero-initialization, it is reset to PVM at proc0_init().
1372 	 */
1373 	pmap_thread_init_invl_gen(&thread0);
1374 
1375 	pc = &temp_bsp_pcpu;
1376 	pcpu_init(pc, 0, sizeof(struct pcpu));
1377 	gdt = &temp_bsp_pcpu.pc_gdt[0];
1378 
1379 	/*
1380 	 * make gdt memory segments
1381 	 */
1382 	for (x = 0; x < NGDT; x++) {
1383 		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1384 		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
1385 			ssdtosd(&gdt_segs[x], &gdt[x]);
1386 	}
1387 	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
1388 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
1389 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1390 
1391 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1392 	r_gdt.rd_base = (long)gdt;
1393 	lgdt(&r_gdt);
1394 
1395 	wrmsr(MSR_FSBASE, 0);		/* User value */
1396 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
1397 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
1398 
1399 	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
1400 	physfree += DPCPU_SIZE;
1401 	amd64_bsp_pcpu_init1(pc);
1402 	/* Non-late cninit() and printf() can be moved up to here. */
1403 
1404 	/*
1405 	 * Initialize mutexes.
1406 	 *
1407 	 * icu_lock: in order to allow an interrupt to occur in a critical
1408 	 * 	     section, to set pcpu->ipending (etc...) properly, we
1409 	 *	     must be able to get the icu lock, so it can't be
1410 	 *	     under witness.
1411 	 */
1412 	mutex_init();
1413 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1414 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1415 
1416 	/* exceptions */
1417 	for (x = 0; x < NIDT; x++)
1418 		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1419 		    SEL_KPL, 0);
1420 	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1421 	    SEL_KPL, 0);
1422 	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1423 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
1424 	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1425 	    SEL_UPL, 0);
1426 	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1427 	    SEL_UPL, 0);
1428 	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1429 	    SEL_KPL, 0);
1430 	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1431 	    SEL_KPL, 0);
1432 	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1433 	    SEL_KPL, 0);
1434 	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1435 	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1436 	    SDT_SYSIGT, SEL_KPL, 0);
1437 	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1438 	    SEL_KPL, 0);
1439 	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1440 	    SDT_SYSIGT, SEL_KPL, 0);
1441 	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1442 	    SEL_KPL, 0);
1443 	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1444 	    SEL_KPL, 0);
1445 	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1446 	    SEL_KPL, 0);
1447 	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1448 	    SEL_KPL, 0);
1449 	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1450 	    SEL_KPL, 0);
1451 	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1452 	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1453 	    SEL_KPL, 0);
1454 #ifdef KDTRACE_HOOKS
1455 	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1456 	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1457 #endif
1458 #ifdef XENHVM
1459 	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1460 	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1461 #endif
1462 	r_idt.rd_limit = sizeof(idt0) - 1;
1463 	r_idt.rd_base = (long) idt;
1464 	lidt(&r_idt);
1465 
1466 	/*
1467 	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
1468 	 * transition).
1469 	 * Once the bootblocks have been updated, we can test directly for
1470 	 * efi_systbl != NULL here...
1471 	 */
1472 	if (efi_boot)
1473 		vty_set_preferred(VTY_VT);
1474 
1475 	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1476 	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);
1477 
1478 	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1479 	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);
1480 
1481 	TUNABLE_INT_FETCH("machdep.syscall_ret_flush_l1d",
1482 	    &syscall_ret_l1d_flush_mode);
1483 
1484 	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1485 	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);
1486 
1487 	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
1488 
1489 	TUNABLE_INT_FETCH("machdep.mitigations.rngds.enable",
1490 	    &x86_rngds_mitg_enable);
1491 
1492 	TUNABLE_INT_FETCH("machdep.mitigations.zenbleed.enable",
1493 	    &zenbleed_enable);
1494 	zenbleed_sanitize_enable();
1495 
1496 	finishidentcpu();	/* Final stage of CPU initialization */
1497 
1498 	/*
1499 	 * Initialize the clock before the console so that console
1500 	 * initialization can use DELAY().
1501 	 */
1502 	clock_init();
1503 
1504 	initializecpu();	/* Initialize CPU registers */
1505 
1506 	amd64_bsp_ist_init(pc);
1507 
1508 	/* Set the IO permission bitmap (empty due to tss seg limit) */
1509 	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
1510 	    IOPERM_BITMAP_SIZE;
1511 
1512 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1513 	ltr(gsel_tss);
1514 
1515 	amd64_conf_fast_syscall();
1516 
1517 	/*
1518 	 * We initialize the PCB pointer early so that exception
1519 	 * handlers will work.  Also set up td_critnest to short-cut
1520 	 * the page fault handler.
1521 	 */
1522 	cpu_max_ext_state_size = sizeof(struct savefpu);
1523 	set_top_of_stack_td(&thread0);
1524 	thread0.td_pcb = get_pcb_td(&thread0);
1525 	thread0.td_critnest = 1;
1526 
1527 	/*
1528 	 * The console and kdb should be initialized even earlier than here,
1529 	 * but some console drivers don't work until after getmemsize().
1530 	 * Default to late console initialization to support these drivers.
1531 	 * This loses mainly printf()s in getmemsize() and early debugging.
1532 	 */
1533 	TUNABLE_INT_FETCH("debug.late_console", &late_console);
1534 	if (!late_console) {
1535 		cninit();
1536 		amd64_kdb_init();
1537 	}
1538 
1539 	getmemsize(kmdp, physfree);
1540 	init_param2(physmem);
1541 
1542 	/* now running on new page tables, configured, and u/iom is accessible */
1543 
1544 #ifdef DEV_PCI
1545 	/* This call might adjust phys_avail[]. */
1546 	pci_early_quirks();
1547 #endif
1548 
1549 	if (late_console)
1550 		cninit();
1551 
1552 	/*
1553 	 * Dump the boot metadata. We have to wait for cninit() since console
1554 	 * output is required. If it's grossly incorrect the kernel will never
1555 	 * make it this far.
1556 	 */
1557 	if (getenv_is_true("debug.dump_modinfo_at_boot"))
1558 		preload_dump();
1559 
1560 #ifdef DEV_ISA
1561 #ifdef DEV_ATPIC
1562 	elcr_probe();
1563 	atpic_startup();
1564 #else
1565 	/* Reset and mask the atpics and leave them shut down. */
1566 	atpic_reset();
1567 
1568 	/*
1569 	 * Point the ICU spurious interrupt vectors at the APIC spurious
1570 	 * interrupt handler.
1571 	 */
1572 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1573 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1574 #endif
1575 #else
1576 #error "have you forgotten the isa device?"
1577 #endif
1578 
1579 	if (late_console)
1580 		amd64_kdb_init();
1581 
1582 	msgbufinit(msgbufp, msgbufsize);
1583 	fpuinit();
1584 
1585 	/* make an initial tss so cpu can get interrupt stack on syscall! */
1586 	rsp0 = thread0.td_md.md_stack_base;
1587 	/* Ensure the stack is aligned to 16 bytes */
1588 	rsp0 &= ~0xFul;
1589 	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
1590 	amd64_bsp_pcpu_init2(rsp0);
1591 
1592 	/* transfer to user mode */
1593 
1594 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1595 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1596 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1597 	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1598 	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1599 
1600 	load_ds(_udatasel);
1601 	load_es(_udatasel);
1602 	load_fs(_ufssel);
1603 
1604 	/* setup proc 0's pcb */
1605 	thread0.td_pcb->pcb_flags = 0;
1606 
1607 	env = kern_getenv("kernelname");
1608 	if (env != NULL)
1609 		strlcpy(kernelname, env, sizeof(kernelname));
1610 
1611 	kcsan_cpu_init(0);
1612 
1613 #ifdef FDT
1614 	x86_init_fdt();
1615 #endif
1616 	thread0.td_critnest = 0;
1617 
1618 	kasan_init();
1619 	kmsan_init();
1620 
1621 	TSEXIT();
1622 
1623 	/* Location of kernel stack for locore */
1624 	return (thread0.td_md.md_stack_base);
1625 }
1626 
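/* MD pcpu initialization: mark the ACPI id as not yet known. */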
1627 void
1628 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1629 {
1630 
1631 	pcpu->pc_acpi_id = 0xffffffff;
1632 }
1633 
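/* Export the raw BIOS SMAP, with extended attributes when available, via sysctl. */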
1634 static int
1635 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1636 {
1637 	struct bios_smap *smapbase;
1638 	struct bios_smap_xattr smap;
1639 	caddr_t kmdp;
1640 	uint32_t *smapattr;
1641 	int count, error, i;
1642 
1643 	/* Retrieve the system memory map from the loader. */
1644 	kmdp = preload_search_by_type("elf kernel");
1645 	if (kmdp == NULL)
1646 		kmdp = preload_search_by_type("elf64 kernel");
1647 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
1648 	    MODINFO_METADATA | MODINFOMD_SMAP);
1649 	if (smapbase == NULL)
1650 		return (0);
1651 	smapattr = (uint32_t *)preload_search_info(kmdp,
1652 	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1653 	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1654 	error = 0;
1655 	for (i = 0; i < count; i++) {
1656 		smap.base = smapbase[i].base;
1657 		smap.length = smapbase[i].length;
1658 		smap.type = smapbase[i].type;
1659 		if (smapattr != NULL)
1660 			smap.xattr = smapattr[i];
1661 		else
1662 			smap.xattr = 0;
1663 		error = SYSCTL_OUT(req, &smap, sizeof(smap));
1664 	}
1665 	return (error);
1666 }
1667 SYSCTL_PROC(_machdep, OID_AUTO, smap,
1668     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1669     smap_sysctl_handler, "S,bios_smap_xattr",
1670     "Raw BIOS SMAP data");
1671 
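/* Export the raw UEFI memory map passed in by the loader via sysctl. */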
1672 static int
1673 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1674 {
1675 	struct efi_map_header *efihdr;
1676 	caddr_t kmdp;
1677 	uint32_t efisize;
1678 
1679 	kmdp = preload_search_by_type("elf kernel");
1680 	if (kmdp == NULL)
1681 		kmdp = preload_search_by_type("elf64 kernel");
1682 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1683 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1684 	if (efihdr == NULL)
1685 		return (0);
1686 	efisize = *((uint32_t *)efihdr - 1);
1687 	return (SYSCTL_OUT(req, efihdr, efisize));
1688 }
1689 SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
1690     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1691     efi_map_sysctl_handler, "S,efi_map_header",
1692     "Raw EFI Memory Map");
1693 
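/*
 * Spin lock enter/exit for MD code: disable interrupts on the first entry,
 * track nesting per thread, and restore the saved interrupt state on the
 * final exit.
 */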
1694 void
1695 spinlock_enter(void)
1696 {
1697 	struct thread *td;
1698 	register_t flags;
1699 
1700 	td = curthread;
1701 	if (td->td_md.md_spinlock_count == 0) {
1702 		flags = intr_disable();
1703 		td->td_md.md_spinlock_count = 1;
1704 		td->td_md.md_saved_flags = flags;
1705 		critical_enter();
1706 	} else
1707 		td->td_md.md_spinlock_count++;
1708 }
1709 
1710 void
1711 spinlock_exit(void)
1712 {
1713 	struct thread *td;
1714 	register_t flags;
1715 
1716 	td = curthread;
1717 	flags = td->td_md.md_saved_flags;
1718 	td->td_md.md_spinlock_count--;
1719 	if (td->td_md.md_spinlock_count == 0) {
1720 		critical_exit();
1721 		intr_restore(flags);
1722 	}
1723 }
1724 
1725 /*
1726  * Construct a PCB from a trapframe. This is called from kdb_trap() where
1727  * we want to start a backtrace from the function that caused us to enter
1728  * the debugger. We have the context in the trapframe, but base the trace
1729  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1730  * enough for a backtrace.
1731  */
1732 void
1733 makectx(struct trapframe *tf, struct pcb *pcb)
1734 {
1735 
1736 	pcb->pcb_r12 = tf->tf_r12;
1737 	pcb->pcb_r13 = tf->tf_r13;
1738 	pcb->pcb_r14 = tf->tf_r14;
1739 	pcb->pcb_r15 = tf->tf_r15;
1740 	pcb->pcb_rbp = tf->tf_rbp;
1741 	pcb->pcb_rbx = tf->tf_rbx;
1742 	pcb->pcb_rip = tf->tf_rip;
1743 	pcb->pcb_rsp = tf->tf_rsp;
1744 }
1745 
1746 /*
1747  * The pcb_flags field is only modified by the current thread, or by other
1748  * threads when the current thread is stopped.  However, the current thread
1749  * may change it from interrupt context in cpu_switch(), or in the trap
1750  * handler.  When we read-modify-write pcb_flags from C sources, the compiler
1751  * may generate code that is not atomic regarding the interrupt handler.  If
1752  * a trap or interrupt happens and any flag is modified from the handler, it
1753  * can be clobbered with the cached value later.  Therefore, we implement
1754  * setting and clearing of flags with single-instruction functions, which do
1755  * not race with modification of the flags from the trap or interrupt
1756  * context, since traps and interrupts occur only on instruction boundaries.
1757  */
1758 void
1759 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
1760 {
1761 
1762 	__asm __volatile("orl %1,%0"
1763 	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
1764 	    : "cc", "memory");
1765 
1766 }
1767 
1768 /*
1769  * Support for the RDFSBASE, WRFSBASE and similar instructions for the %fs
1770  * and %gs bases requires that the kernel save MSR_FSBASE and MSR_{K,}GSBASE
1771  * into the pcb if user space modified the bases.  We must save them on a
1772  * context switch or when the return to usermode happens through doreti.
1773  *
1774  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET, which
1775  * has the consequence that the base MSRs must be saved each time
1776  * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
1777  * context switches.
1778  */
1779 static void
1780 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
1781 {
1782 	register_t r;
1783 
1784 	if (curpcb == pcb &&
1785 	    (flags & PCB_FULL_IRET) != 0 &&
1786 	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1787 		r = intr_disable();
1788 		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1789 			if (rfs() == _ufssel)
1790 				pcb->pcb_fsbase = rdfsbase();
1791 			if (rgs() == _ugssel)
1792 				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
1793 		}
1794 		set_pcb_flags_raw(pcb, flags);
1795 		intr_restore(r);
1796 	} else {
1797 		set_pcb_flags_raw(pcb, flags);
1798 	}
1799 }
1800 
1801 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
1802 {
1803 
1804 	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
1805 	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
1806 }
1807 
1808 void
1809 clear_pcb_flags(struct pcb *pcb, const u_int flags)
1810 {
1811 
1812 	__asm __volatile("andl %1,%0"
1813 	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
1814 	    : "cc", "memory");
1815 }
1816 
1817 #ifdef KDB
1818 
1819 /*
1820  * Provide inb() and outb() as functions.  They are normally only available as
1821  * inline functions, and thus cannot be called from the debugger.
1822  */
1823 
1824 /* silence compiler warnings */
1825 u_char inb_(u_short);
1826 void outb_(u_short, u_char);
1827 
1828 u_char
1829 inb_(u_short port)
1830 {
1831 	return inb(port);
1832 }
1833 
1834 void
1835 outb_(u_short port, u_char data)
1836 {
1837 	outb(port, data);
1838 }
1839 
1840 #endif /* KDB */
1841 
1842 #undef memset
1843 #undef memmove
1844 #undef memcpy
1845 
1846 void	*memset_std(void *buf, int c, size_t len);
1847 void	*memset_erms(void *buf, int c, size_t len);
1848 void    *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
1849 	    size_t len);
1850 void    *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
1851 	    size_t len);
1852 void    *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
1853 	    size_t len);
1854 void    *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
1855 	    size_t len);
1856 
1857 #ifdef KCSAN
1858 /*
1859  * These fail to build as ifuncs when used with KCSAN.
1860  */
1861 void *
1862 memset(void *buf, int c, size_t len)
1863 {
1864 
1865 	return (memset_std(buf, c, len));
1866 }
1867 
1868 void *
1869 memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1870 {
1871 
1872 	return (memmove_std(dst, src, len));
1873 }
1874 
1875 void *
1876 memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1877 {
1878 
1879 	return (memcpy_std(dst, src, len));
1880 }
1881 #else
1882 DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
1883 {
1884 
1885 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1886 	    memset_erms : memset_std);
1887 }
1888 
1889 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
1890     size_t))
1891 {
1892 
1893 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1894 	    memmove_erms : memmove_std);
1895 }
1896 
1897 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
1898 {
1899 
1900 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1901 	    memcpy_erms : memcpy_std);
1902 }
1903 #endif
1904 
1905 void	pagezero_std(void *addr);
1906 void	pagezero_erms(void *addr);
1907 DEFINE_IFUNC(, void , pagezero, (void *))
1908 {
1909 
1910 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1911 	    pagezero_erms : pagezero_std);
1912 }
1913