xref: /freebsd/sys/amd64/amd64/machdep.c (revision 4e8d558c)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 2003 Peter Wemm.
5  * Copyright (c) 1992 Terrence R. Lambert.
6  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
7  * All rights reserved.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * William Jolitz.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. All advertising materials mentioning features or use of this software
21  *    must display the following acknowledgement:
22  *	This product includes software developed by the University of
23  *	California, Berkeley and its contributors.
24  * 4. Neither the name of the University nor the names of its contributors
25  *    may be used to endorse or promote products derived from this software
26  *    without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  *
40  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
41  */
42 
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD$");
45 
46 #include "opt_atpic.h"
47 #include "opt_cpu.h"
48 #include "opt_ddb.h"
49 #include "opt_inet.h"
50 #include "opt_isa.h"
51 #include "opt_kstack_pages.h"
52 #include "opt_maxmem.h"
53 #include "opt_pci.h"
54 #include "opt_platform.h"
55 #include "opt_sched.h"
56 
57 #include <sys/param.h>
58 #include <sys/proc.h>
59 #include <sys/systm.h>
60 #include <sys/asan.h>
61 #include <sys/bio.h>
62 #include <sys/buf.h>
63 #include <sys/bus.h>
64 #include <sys/callout.h>
65 #include <sys/cons.h>
66 #include <sys/cpu.h>
67 #include <sys/csan.h>
68 #include <sys/efi.h>
69 #include <sys/eventhandler.h>
70 #include <sys/exec.h>
71 #include <sys/imgact.h>
72 #include <sys/kdb.h>
73 #include <sys/kernel.h>
74 #include <sys/ktr.h>
75 #include <sys/linker.h>
76 #include <sys/lock.h>
77 #include <sys/malloc.h>
78 #include <sys/memrange.h>
79 #include <sys/msan.h>
80 #include <sys/msgbuf.h>
81 #include <sys/mutex.h>
82 #include <sys/pcpu.h>
83 #include <sys/ptrace.h>
84 #include <sys/reboot.h>
85 #include <sys/reg.h>
86 #include <sys/rwlock.h>
87 #include <sys/sched.h>
88 #include <sys/signalvar.h>
89 #ifdef SMP
90 #include <sys/smp.h>
91 #endif
92 #include <sys/syscallsubr.h>
93 #include <sys/sysctl.h>
94 #include <sys/sysent.h>
95 #include <sys/sysproto.h>
96 #include <sys/ucontext.h>
97 #include <sys/vmmeter.h>
98 
99 #include <vm/vm.h>
100 #include <vm/vm_param.h>
101 #include <vm/vm_extern.h>
102 #include <vm/vm_kern.h>
103 #include <vm/vm_page.h>
104 #include <vm/vm_map.h>
105 #include <vm/vm_object.h>
106 #include <vm/vm_pager.h>
107 #include <vm/vm_phys.h>
108 #include <vm/vm_dumpset.h>
109 
110 #ifdef DDB
111 #ifndef KDB
112 #error KDB must be enabled in order for DDB to work!
113 #endif
114 #include <ddb/ddb.h>
115 #include <ddb/db_sym.h>
116 #endif
117 
118 #include <net/netisr.h>
119 
120 #include <dev/smbios/smbios.h>
121 
122 #include <machine/clock.h>
123 #include <machine/cpu.h>
124 #include <machine/cputypes.h>
125 #include <machine/frame.h>
126 #include <machine/intr_machdep.h>
127 #include <x86/mca.h>
128 #include <machine/md_var.h>
129 #include <machine/metadata.h>
130 #include <machine/pc/bios.h>
131 #include <machine/pcb.h>
132 #include <machine/proc.h>
133 #include <machine/sigframe.h>
134 #include <machine/specialreg.h>
135 #include <machine/trap.h>
136 #include <machine/tss.h>
137 #include <x86/ucode.h>
138 #include <x86/ifunc.h>
139 #ifdef SMP
140 #include <machine/smp.h>
141 #endif
142 #ifdef FDT
143 #include <x86/fdt.h>
144 #endif
145 
146 #ifdef DEV_ATPIC
147 #include <x86/isa/icu.h>
148 #else
149 #include <x86/apicvar.h>
150 #endif
151 
152 #include <isa/isareg.h>
153 #include <isa/rtc.h>
154 #include <x86/init.h>
155 
156 /* Sanity check for __curthread() */
157 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
158 
159 /*
160  * The PTI trampoline stack needs enough space for a hardware trapframe and a
161  * couple of scratch registers, as well as the trapframe left behind after an
162  * iret fault.
163  */
164 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
165     offsetof(struct pti_frame, pti_rip));
166 
167 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
168 
169 static void cpu_startup(void *);
170 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
171 
172 /* Probe 8254 PIT and TSC. */
173 static void native_clock_source_init(void);
174 
175 /* Preload data parse function */
176 static caddr_t native_parse_preload_data(u_int64_t);
177 
178 /* Native function to fetch and parse the e820 map */
179 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
180 
181 /* Default init_ops implementation. */
182 struct init_ops init_ops = {
183 	.parse_preload_data =		native_parse_preload_data,
184 	.early_clock_source_init =	native_clock_source_init,
185 	.early_delay =			i8254_delay,
186 	.parse_memmap =			native_parse_memmap,
187 };
188 
189 /*
190  * Physical address of the EFI System Table. Stashed from the metadata hints
191  * passed into the kernel and used by the EFI code to call runtime services.
192  */
193 vm_paddr_t efi_systbl_phys;
194 
195 /* Intel ICH registers */
196 #define ICH_PMBASE	0x400
197 #define ICH_SMI_EN	ICH_PMBASE + 0x30
198 
199 int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
200 
201 int cold = 1;
202 
203 long Maxmem = 0;
204 long realmem = 0;
205 int late_console = 1;
206 
207 struct kva_md_info kmi;
208 
209 struct region_descriptor r_idt;
210 
211 struct pcpu *__pcpu;
212 struct pcpu temp_bsp_pcpu;
213 
214 struct mtx icu_lock;
215 
216 struct mem_range_softc mem_range_softc;
217 
218 struct mtx dt_lock;	/* lock for GDT and LDT */
219 
220 void (*vmm_resume_p)(void);
221 
222 bool efi_boot;
223 
224 static void
225 cpu_startup(void *dummy)
226 {
227 	uintmax_t memsize;
228 	char *sysenv;
229 
230 	/*
231 	 * On MacBooks, we must keep the legacy USB circuitry from
232 	 * generating an SMI#, because that causes several problems,
233 	 * namely incorrect CPU frequency detection and failure to
234 	 * start the APs.  We do this by clearing the LEGACY_USB_EN
235 	 * bit in the SMI_EN (SMI Control and Enable) register of the
236 	 * Intel ICH LPC interface bridge.
237 	 */
238 	sysenv = kern_getenv("smbios.system.product");
239 	if (sysenv != NULL) {
240 		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
241 		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
242 		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
243 		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
244 		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
245 		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
246 		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
247 		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
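			/* SMI_EN bit 3 (0x8) is LEGACY_USB_EN. */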
248 			if (bootverbose)
249 				printf("Disabling LEGACY_USB_EN bit on "
250 				    "Intel ICH.\n");
251 			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
252 		}
253 		freeenv(sysenv);
254 	}
255 
256 	/*
257 	 * Good {morning,afternoon,evening,night}.
258 	 */
259 	startrtclock();
260 	printcpuinfo();
261 
262 	/*
263 	 * Display physical memory if SMBIOS reports reasonable amount.
264 	 */
265 	memsize = 0;
266 	sysenv = kern_getenv("smbios.memory.enabled");
267 	if (sysenv != NULL) {
268 		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
269 		freeenv(sysenv);
270 	}
271 	if (memsize < ptoa((uintmax_t)vm_free_count()))
272 		memsize = ptoa((uintmax_t)Maxmem);
273 	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
274 	realmem = atop(memsize);
275 
276 	/*
277 	 * Display any holes after the first chunk of extended memory.
278 	 */
279 	if (bootverbose) {
280 		int indx;
281 
282 		printf("Physical memory chunk(s):\n");
283 		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
284 			vm_paddr_t size;
285 
286 			size = phys_avail[indx + 1] - phys_avail[indx];
287 			printf(
288 			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
289 			    (uintmax_t)phys_avail[indx],
290 			    (uintmax_t)phys_avail[indx + 1] - 1,
291 			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
292 		}
293 	}
294 
295 	vm_ksubmap_init(&kmi);
296 
297 	printf("avail memory = %ju (%ju MB)\n",
298 	    ptoa((uintmax_t)vm_free_count()),
299 	    ptoa((uintmax_t)vm_free_count()) / 1048576);
300 #ifdef DEV_PCI
301 	if (bootverbose && intel_graphics_stolen_base != 0)
302 		printf("intel stolen mem: base %#jx size %ju MB\n",
303 		    (uintmax_t)intel_graphics_stolen_base,
304 		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
305 #endif
306 
307 	/*
308 	 * Set up buffers, so they can be used to read disk labels.
309 	 */
310 	bufinit();
311 	vm_pager_bufferinit();
312 
313 	cpu_setregs();
314 }
315 
316 static void
317 late_ifunc_resolve(void *dummy __unused)
318 {
319 	link_elf_late_ireloc();
320 }
321 SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);
322 
323 
324 void
325 cpu_setregs(void)
326 {
327 	register_t cr0;
328 
329 	TSENTER();
330 	cr0 = rcr0();
331 	/*
332 	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
333 	 * BSP.  See the comments there about why we set them.
334 	 */
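	/*
	 * CR0_WP makes supervisor-mode writes respect page-level write
	 * protection (needed for copy-on-write), and CR0_AM permits
	 * alignment checking in user mode when EFLAGS.AC is set.
	 */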
335 	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
336 	TSENTER2("load_cr0");
337 	load_cr0(cr0);
338 	TSEXIT2("load_cr0");
339 	TSEXIT();
340 }
341 
342 /*
343  * Initialize amd64 and configure to run kernel
344  */
345 
346 /*
347  * Initialize segments & interrupt table
348  */
349 static struct gate_descriptor idt0[NIDT];
350 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
351 
352 static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
353 static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
354 static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
355 static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
356 CTASSERT(sizeof(struct nmi_pcpu) == 16);
357 
358 /*
359  * Software prototypes -- in more palatable form.
360  *
361  * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
362  * slots as corresponding segments for i386 kernel.
363  */
364 struct soft_segment_descriptor gdt_segs[] = {
365 /* GNULL_SEL	0 Null Descriptor */
366 {	.ssd_base = 0x0,
367 	.ssd_limit = 0x0,
368 	.ssd_type = 0,
369 	.ssd_dpl = 0,
370 	.ssd_p = 0,
371 	.ssd_long = 0,
372 	.ssd_def32 = 0,
373 	.ssd_gran = 0		},
374 /* GNULL2_SEL	1 Null Descriptor */
375 {	.ssd_base = 0x0,
376 	.ssd_limit = 0x0,
377 	.ssd_type = 0,
378 	.ssd_dpl = 0,
379 	.ssd_p = 0,
380 	.ssd_long = 0,
381 	.ssd_def32 = 0,
382 	.ssd_gran = 0		},
383 /* GUFS32_SEL	2 32 bit %fs Descriptor for user */
384 {	.ssd_base = 0x0,
385 	.ssd_limit = 0xfffff,
386 	.ssd_type = SDT_MEMRWA,
387 	.ssd_dpl = SEL_UPL,
388 	.ssd_p = 1,
389 	.ssd_long = 0,
390 	.ssd_def32 = 1,
391 	.ssd_gran = 1		},
392 /* GUGS32_SEL	3 32 bit %gs Descriptor for user */
393 {	.ssd_base = 0x0,
394 	.ssd_limit = 0xfffff,
395 	.ssd_type = SDT_MEMRWA,
396 	.ssd_dpl = SEL_UPL,
397 	.ssd_p = 1,
398 	.ssd_long = 0,
399 	.ssd_def32 = 1,
400 	.ssd_gran = 1		},
401 /* GCODE_SEL	4 Code Descriptor for kernel */
402 {	.ssd_base = 0x0,
403 	.ssd_limit = 0xfffff,
404 	.ssd_type = SDT_MEMERA,
405 	.ssd_dpl = SEL_KPL,
406 	.ssd_p = 1,
407 	.ssd_long = 1,
408 	.ssd_def32 = 0,
409 	.ssd_gran = 1		},
410 /* GDATA_SEL	5 Data Descriptor for kernel */
411 {	.ssd_base = 0x0,
412 	.ssd_limit = 0xfffff,
413 	.ssd_type = SDT_MEMRWA,
414 	.ssd_dpl = SEL_KPL,
415 	.ssd_p = 1,
416 	.ssd_long = 1,
417 	.ssd_def32 = 0,
418 	.ssd_gran = 1		},
419 /* GUCODE32_SEL	6 32 bit Code Descriptor for user */
420 {	.ssd_base = 0x0,
421 	.ssd_limit = 0xfffff,
422 	.ssd_type = SDT_MEMERA,
423 	.ssd_dpl = SEL_UPL,
424 	.ssd_p = 1,
425 	.ssd_long = 0,
426 	.ssd_def32 = 1,
427 	.ssd_gran = 1		},
428 /* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
429 {	.ssd_base = 0x0,
430 	.ssd_limit = 0xfffff,
431 	.ssd_type = SDT_MEMRWA,
432 	.ssd_dpl = SEL_UPL,
433 	.ssd_p = 1,
434 	.ssd_long = 0,
435 	.ssd_def32 = 1,
436 	.ssd_gran = 1		},
437 /* GUCODE_SEL	8 64 bit Code Descriptor for user */
438 {	.ssd_base = 0x0,
439 	.ssd_limit = 0xfffff,
440 	.ssd_type = SDT_MEMERA,
441 	.ssd_dpl = SEL_UPL,
442 	.ssd_p = 1,
443 	.ssd_long = 1,
444 	.ssd_def32 = 0,
445 	.ssd_gran = 1		},
446 /* GPROC0_SEL	9 Proc 0 Tss Descriptor */
447 {	.ssd_base = 0x0,
448 	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
449 	.ssd_type = SDT_SYSTSS,
450 	.ssd_dpl = SEL_KPL,
451 	.ssd_p = 1,
452 	.ssd_long = 0,
453 	.ssd_def32 = 0,
454 	.ssd_gran = 0		},
455 /* GPROC0_SEL+1	10 upper half of the 16-byte TSS system descriptor */
456 {	.ssd_base = 0x0,
457 	.ssd_limit = 0x0,
458 	.ssd_type = 0,
459 	.ssd_dpl = 0,
460 	.ssd_p = 0,
461 	.ssd_long = 0,
462 	.ssd_def32 = 0,
463 	.ssd_gran = 0		},
464 /* GUSERLDT_SEL	11 LDT Descriptor */
465 {	.ssd_base = 0x0,
466 	.ssd_limit = 0x0,
467 	.ssd_type = 0,
468 	.ssd_dpl = 0,
469 	.ssd_p = 0,
470 	.ssd_long = 0,
471 	.ssd_def32 = 0,
472 	.ssd_gran = 0		},
473 /* GUSERLDT_SEL	12 LDT Descriptor, double size */
474 {	.ssd_base = 0x0,
475 	.ssd_limit = 0x0,
476 	.ssd_type = 0,
477 	.ssd_dpl = 0,
478 	.ssd_p = 0,
479 	.ssd_long = 0,
480 	.ssd_def32 = 0,
481 	.ssd_gran = 0		},
482 };
483 _Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");
484 
485 void
486 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
487 {
488 	struct gate_descriptor *ip;
489 
490 	ip = idt + idx;
491 	ip->gd_looffset = (uintptr_t)func;
492 	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
493 	ip->gd_ist = ist;
494 	ip->gd_xx = 0;
495 	ip->gd_type = typ;
496 	ip->gd_dpl = dpl;
497 	ip->gd_p = 1;
498 	ip->gd_hioffset = ((uintptr_t)func) >> 16;
499 }
500 
501 extern inthand_t
502 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
503 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
504 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
505 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
506 	IDTVEC(xmm), IDTVEC(dblfault),
507 	IDTVEC(div_pti), IDTVEC(bpt_pti),
508 	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
509 	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
510 	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
511 	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
512 	IDTVEC(xmm_pti),
513 #ifdef KDTRACE_HOOKS
514 	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
515 #endif
516 #ifdef XENHVM
517 	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
518 #endif
519 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
520 	IDTVEC(fast_syscall_pti);
521 
522 #ifdef DDB
523 /*
524  * Display the index and function name of any IDT entries that don't use
525  * the default 'rsvd' entry point.
526  */
527 DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE)
528 {
529 	struct gate_descriptor *ip;
530 	int idx;
531 	uintptr_t func;
532 
533 	ip = idt;
534 	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
535 		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
536 		if (func != (uintptr_t)&IDTVEC(rsvd)) {
537 			db_printf("%3d\t", idx);
538 			db_printsym(func, DB_STGY_PROC);
539 			db_printf("\n");
540 		}
541 		ip++;
542 	}
543 }
544 
545 /* Show privileged registers. */
546 DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE)
547 {
548 	struct {
549 		uint16_t limit;
550 		uint64_t base;
551 	} __packed idtr, gdtr;
552 	uint16_t ldt, tr;
553 
554 	__asm __volatile("sidt %0" : "=m" (idtr));
555 	db_printf("idtr\t0x%016lx/%04x\n",
556 	    (u_long)idtr.base, (u_int)idtr.limit);
557 	__asm __volatile("sgdt %0" : "=m" (gdtr));
558 	db_printf("gdtr\t0x%016lx/%04x\n",
559 	    (u_long)gdtr.base, (u_int)gdtr.limit);
560 	__asm __volatile("sldt %0" : "=r" (ldt));
561 	db_printf("ldtr\t0x%04x\n", ldt);
562 	__asm __volatile("str %0" : "=r" (tr));
563 	db_printf("tr\t0x%04x\n", tr);
564 	db_printf("cr0\t0x%016lx\n", rcr0());
565 	db_printf("cr2\t0x%016lx\n", rcr2());
566 	db_printf("cr3\t0x%016lx\n", rcr3());
567 	db_printf("cr4\t0x%016lx\n", rcr4());
568 	if (rcr4() & CR4_XSAVE)
569 		db_printf("xcr0\t0x%016lx\n", rxcr(0));
570 	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
571 	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
572 		db_printf("FEATURES_CTL\t%016lx\n",
573 		    rdmsr(MSR_IA32_FEATURE_CONTROL));
574 	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
575 	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
576 	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
577 }
578 
579 DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE)
580 {
581 
582 	db_printf("dr0\t0x%016lx\n", rdr0());
583 	db_printf("dr1\t0x%016lx\n", rdr1());
584 	db_printf("dr2\t0x%016lx\n", rdr2());
585 	db_printf("dr3\t0x%016lx\n", rdr3());
586 	db_printf("dr6\t0x%016lx\n", rdr6());
587 	db_printf("dr7\t0x%016lx\n", rdr7());
588 }
589 #endif
590 
591 void
592 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
593 {
594 
595 	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
596 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
597 	ssd->ssd_type  = sd->sd_type;
598 	ssd->ssd_dpl   = sd->sd_dpl;
599 	ssd->ssd_p     = sd->sd_p;
600 	ssd->ssd_long  = sd->sd_long;
601 	ssd->ssd_def32 = sd->sd_def32;
602 	ssd->ssd_gran  = sd->sd_gran;
603 }
604 
605 void
606 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
607 {
608 
609 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
610 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
611 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
612 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
613 	sd->sd_type  = ssd->ssd_type;
614 	sd->sd_dpl   = ssd->ssd_dpl;
615 	sd->sd_p     = ssd->ssd_p;
616 	sd->sd_long  = ssd->ssd_long;
617 	sd->sd_def32 = ssd->ssd_def32;
618 	sd->sd_gran  = ssd->ssd_gran;
619 }
620 
621 void
622 ssdtosyssd(struct soft_segment_descriptor *ssd, struct system_segment_descriptor *sd)
623 {
624 
625 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
626 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
627 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
628 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
629 	sd->sd_type  = ssd->ssd_type;
630 	sd->sd_dpl   = ssd->ssd_dpl;
631 	sd->sd_p     = ssd->ssd_p;
632 	sd->sd_gran  = ssd->ssd_gran;
633 }
634 
635 u_int basemem;
636 
637 static int
638 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
639     int *physmap_idxp)
640 {
641 	int i, insert_idx, physmap_idx;
642 
643 	physmap_idx = *physmap_idxp;
644 
645 	if (length == 0)
646 		return (1);
647 
648 	/*
649 	 * Find insertion point while checking for overlap.  Start off by
650 	 * assuming the new entry will be added to the end.
651 	 *
652 	 * NB: physmap_idx points to the next free slot.
653 	 */
654 	insert_idx = physmap_idx;
655 	for (i = 0; i <= physmap_idx; i += 2) {
656 		if (base < physmap[i + 1]) {
657 			if (base + length <= physmap[i]) {
658 				insert_idx = i;
659 				break;
660 			}
661 			if (boothowto & RB_VERBOSE)
662 				printf(
663 		    "Overlapping memory regions, ignoring second region\n");
664 			return (1);
665 		}
666 	}
667 
668 	/* See if we can prepend to the next entry. */
669 	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
670 		physmap[insert_idx] = base;
671 		return (1);
672 	}
673 
674 	/* See if we can append to the previous entry. */
675 	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
676 		physmap[insert_idx - 1] += length;
677 		return (1);
678 	}
679 
680 	physmap_idx += 2;
681 	*physmap_idxp = physmap_idx;
682 	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
683 		printf(
684 		"Too many segments in the physical address map, giving up\n");
685 		return (0);
686 	}
687 
688 	/*
689 	 * Move the last 'N' entries down to make room for the new
690 	 * entry if needed.
691 	 */
692 	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
693 		physmap[i] = physmap[i - 2];
694 		physmap[i + 1] = physmap[i - 1];
695 	}
696 
697 	/* Insert the new entry. */
698 	physmap[insert_idx] = base;
699 	physmap[insert_idx + 1] = base + length;
700 	return (1);
701 }
702 
703 void
704 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
705                       vm_paddr_t *physmap, int *physmap_idx)
706 {
707 	struct bios_smap *smap, *smapend;
708 
709 	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
710 
711 	for (smap = smapbase; smap < smapend; smap++) {
712 		if (boothowto & RB_VERBOSE)
713 			printf("SMAP type=%02x base=%016lx len=%016lx\n",
714 			    smap->type, smap->base, smap->length);
715 
716 		if (smap->type != SMAP_TYPE_MEMORY)
717 			continue;
718 
719 		if (!add_physmap_entry(smap->base, smap->length, physmap,
720 		    physmap_idx))
721 			break;
722 	}
723 }
724 
725 static void
726 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
727     int *physmap_idx)
728 {
729 	struct efi_md *map, *p;
730 	const char *type;
731 	size_t efisz;
732 	int ndesc, i;
733 
734 	static const char *types[] = {
735 		"Reserved",
736 		"LoaderCode",
737 		"LoaderData",
738 		"BootServicesCode",
739 		"BootServicesData",
740 		"RuntimeServicesCode",
741 		"RuntimeServicesData",
742 		"ConventionalMemory",
743 		"UnusableMemory",
744 		"ACPIReclaimMemory",
745 		"ACPIMemoryNVS",
746 		"MemoryMappedIO",
747 		"MemoryMappedIOPortSpace",
748 		"PalCode",
749 		"PersistentMemory"
750 	};
751 
752 	/*
753 	 * Memory map data provided by UEFI via the GetMemoryMap
754 	 * Boot Services API.
755 	 */
756 	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
757 	map = (struct efi_md *)((uint8_t *)efihdr + efisz);
758 
759 	if (efihdr->descriptor_size == 0)
760 		return;
761 	ndesc = efihdr->memory_size / efihdr->descriptor_size;
762 
763 	if (boothowto & RB_VERBOSE)
764 		printf("%23s %12s %12s %8s %4s\n",
765 		    "Type", "Physical", "Virtual", "#Pages", "Attr");
766 
767 	for (i = 0, p = map; i < ndesc; i++,
768 	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
769 		if (boothowto & RB_VERBOSE) {
770 			if (p->md_type < nitems(types))
771 				type = types[p->md_type];
772 			else
773 				type = "<INVALID>";
774 			printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
775 			    p->md_virt, p->md_pages);
776 			if (p->md_attr & EFI_MD_ATTR_UC)
777 				printf("UC ");
778 			if (p->md_attr & EFI_MD_ATTR_WC)
779 				printf("WC ");
780 			if (p->md_attr & EFI_MD_ATTR_WT)
781 				printf("WT ");
782 			if (p->md_attr & EFI_MD_ATTR_WB)
783 				printf("WB ");
784 			if (p->md_attr & EFI_MD_ATTR_UCE)
785 				printf("UCE ");
786 			if (p->md_attr & EFI_MD_ATTR_WP)
787 				printf("WP ");
788 			if (p->md_attr & EFI_MD_ATTR_RP)
789 				printf("RP ");
790 			if (p->md_attr & EFI_MD_ATTR_XP)
791 				printf("XP ");
792 			if (p->md_attr & EFI_MD_ATTR_NV)
793 				printf("NV ");
794 			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
795 				printf("MORE_RELIABLE ");
796 			if (p->md_attr & EFI_MD_ATTR_RO)
797 				printf("RO ");
798 			if (p->md_attr & EFI_MD_ATTR_RT)
799 				printf("RUNTIME");
800 			printf("\n");
801 		}
802 
803 		switch (p->md_type) {
804 		case EFI_MD_TYPE_CODE:
805 		case EFI_MD_TYPE_DATA:
806 		case EFI_MD_TYPE_BS_CODE:
807 		case EFI_MD_TYPE_BS_DATA:
808 		case EFI_MD_TYPE_FREE:
809 			/*
810 			 * We're allowed to use any entry with these types.
811 			 */
812 			break;
813 		default:
814 			continue;
815 		}
816 
817 		if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE,
818 		    physmap, physmap_idx))
819 			break;
820 	}
821 }
822 
823 static void
824 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
825 {
826 	struct bios_smap *smap;
827 	struct efi_map_header *efihdr;
828 	u_int32_t size;
829 
830 	/*
831 	 * Memory map from INT 15:E820.
832 	 *
833 	 * subr_module.c says:
834 	 * "Consumer may safely assume that size value precedes data."
835 	 * ie: an int32_t immediately precedes smap.
836 	 */
837 
838 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
839 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
840 	smap = (struct bios_smap *)preload_search_info(kmdp,
841 	    MODINFO_METADATA | MODINFOMD_SMAP);
842 	if (efihdr == NULL && smap == NULL)
843 		panic("No BIOS smap or EFI map info from loader!");
844 
845 	if (efihdr != NULL) {
846 		add_efi_map_entries(efihdr, physmap, physmap_idx);
847 		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
848 	} else {
849 		size = *((u_int32_t *)smap - 1);
850 		bios_add_smap_entries(smap, size, physmap, physmap_idx);
851 		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
852 	}
853 }
854 
855 #define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)
856 
857 /*
858  * Populate the (physmap) array with base/bound pairs describing the
859  * available physical memory in the system, then test this memory and
860  * build the phys_avail array describing the actually-available memory.
861  *
862  * Total memory size may be set by the kernel environment variable
863  * hw.physmem or the compile-time define MAXMEM.
864  *
865  * XXX first should be vm_paddr_t.
866  */
867 static void
868 getmemsize(caddr_t kmdp, u_int64_t first)
869 {
870 	int i, physmap_idx, pa_indx, da_indx;
871 	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
872 	u_long physmem_start, physmem_tunable, memtest;
873 	pt_entry_t *pte;
874 	quad_t dcons_addr, dcons_size;
875 	int page_counter;
876 
877 	TSENTER();
878 	/*
879 	 * Tell the physical memory allocator about pages used to store
880 	 * the kernel and preloaded data.  See kmem_bootstrap_free().
881 	 */
882 	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));
883 
884 	bzero(physmap, sizeof(physmap));
885 	physmap_idx = 0;
886 
887 	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
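	/*
	 * parse_memmap() leaves physmap_idx at the next free slot; step it
	 * back so it indexes the last populated (base, end) pair.
	 */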
888 	physmap_idx -= 2;
889 
890 	/*
891 	 * Find the 'base memory' segment for SMP
892 	 */
893 	basemem = 0;
894 	for (i = 0; i <= physmap_idx; i += 2) {
895 		if (physmap[i] <= 0xA0000) {
896 			basemem = physmap[i + 1] / 1024;
897 			break;
898 		}
899 	}
900 	if (basemem == 0 || basemem > 640) {
901 		if (bootverbose)
902 			printf(
903 		"Memory map doesn't contain a basemem segment, faking it\n");
904 		basemem = 640;
905 	}
906 
907 	/*
908 	 * Maxmem isn't the "maximum memory", it's one larger than the
909 	 * highest page of the physical address space.  It should be
910 	 * called something like "Maxphyspage".  We may adjust this
911 	 * based on ``hw.physmem'' and the results of the memory test.
912 	 */
913 	Maxmem = atop(physmap[physmap_idx + 1]);
914 
915 #ifdef MAXMEM
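	/* MAXMEM is specified in kilobytes; convert it to 4 KB pages. */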
916 	Maxmem = MAXMEM / 4;
917 #endif
918 
919 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
920 		Maxmem = atop(physmem_tunable);
921 
922 	/*
923 	 * The boot memory test is disabled by default, as it takes a
924 	 * significant amount of time on large-memory systems, and is
925 	 * unfriendly to virtual machines as it unnecessarily touches all
926 	 * pages.
927 	 *
928 	 * A general name is used as the code may be extended to support
929 	 * additional tests beyond the current "page present" test.
930 	 */
931 	memtest = 0;
932 	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
933 
934 	/*
935 	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
936 	 * in the system.
937 	 */
938 	if (Maxmem > atop(physmap[physmap_idx + 1]))
939 		Maxmem = atop(physmap[physmap_idx + 1]);
940 
941 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
942 	    (boothowto & RB_VERBOSE))
943 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
944 
945 	/* call pmap initialization to make new kernel address space */
946 	pmap_bootstrap(&first);
947 
948 	/*
949 	 * Size up each available chunk of physical memory.
950 	 *
951 	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
952 	 * By default, mask off the first 16 pages unless we appear to be
953 	 * running in a VM.
954 	 */
955 	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
956 	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
957 	if (physmap[0] < physmem_start) {
958 		if (physmem_start < PAGE_SIZE)
959 			physmap[0] = PAGE_SIZE;
960 		else if (physmem_start >= physmap[1])
961 			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
962 		else
963 			physmap[0] = round_page(physmem_start);
964 	}
965 	pa_indx = 0;
966 	da_indx = 1;
967 	phys_avail[pa_indx++] = physmap[0];
968 	phys_avail[pa_indx] = physmap[0];
969 	dump_avail[da_indx] = physmap[0];
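	/*
	 * CMAP1/CADDR1 are a spare kernel PTE and its VA set up by
	 * pmap_bootstrap(); the memory test below uses them to map each
	 * page non-cacheable while it is probed.
	 */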
970 	pte = CMAP1;
971 
972 	/*
973 	 * Get dcons buffer address
974 	 */
975 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
976 	    getenv_quad("dcons.size", &dcons_size) == 0)
977 		dcons_addr = 0;
978 
979 	/*
980 	 * physmap is in bytes, so when converting to page boundaries,
981 	 * round up the start address and round down the end address.
982 	 */
983 	page_counter = 0;
984 	if (memtest != 0)
985 		printf("Testing system memory");
986 	for (i = 0; i <= physmap_idx; i += 2) {
987 		vm_paddr_t end;
988 
989 		end = ptoa((vm_paddr_t)Maxmem);
990 		if (physmap[i + 1] < end)
991 			end = trunc_page(physmap[i + 1]);
992 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
993 			int tmp, page_bad, full;
994 			int *ptr = (int *)CADDR1;
995 
996 			full = FALSE;
997 			/*
998 			 * block out kernel memory as not available.
999 			 */
1000 			if (pa >= (vm_paddr_t)kernphys && pa < first)
1001 				goto do_dump_avail;
1002 
1003 			/*
1004 			 * block out dcons buffer
1005 			 */
1006 			if (dcons_addr > 0
1007 			    && pa >= trunc_page(dcons_addr)
1008 			    && pa < dcons_addr + dcons_size)
1009 				goto do_dump_avail;
1010 
1011 			page_bad = FALSE;
1012 			if (memtest == 0)
1013 				goto skip_memtest;
1014 
1015 			/*
1016 			 * Print a "." every GB to show we're making
1017 			 * progress.
1018 			 */
1019 			page_counter++;
1020 			if ((page_counter % PAGES_PER_GB) == 0)
1021 				printf(".");
1022 
1023 			/*
1024 			 * map page into kernel: valid, read/write,non-cacheable
1025 			 */
1026 			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
1027 			invltlb();
1028 
1029 			tmp = *(int *)ptr;
1030 			/*
1031 			 * Test for alternating 1's and 0's
1032 			 */
1033 			*(volatile int *)ptr = 0xaaaaaaaa;
1034 			if (*(volatile int *)ptr != 0xaaaaaaaa)
1035 				page_bad = TRUE;
1036 			/*
1037 			 * Test for alternating 0's and 1's
1038 			 */
1039 			*(volatile int *)ptr = 0x55555555;
1040 			if (*(volatile int *)ptr != 0x55555555)
1041 				page_bad = TRUE;
1042 			/*
1043 			 * Test for all 1's
1044 			 */
1045 			*(volatile int *)ptr = 0xffffffff;
1046 			if (*(volatile int *)ptr != 0xffffffff)
1047 				page_bad = TRUE;
1048 			/*
1049 			 * Test for all 0's
1050 			 */
1051 			*(volatile int *)ptr = 0x0;
1052 			if (*(volatile int *)ptr != 0x0)
1053 				page_bad = TRUE;
1054 			/*
1055 			 * Restore original value.
1056 			 */
1057 			*(int *)ptr = tmp;
1058 
1059 skip_memtest:
1060 			/*
1061 			 * Adjust array of valid/good pages.
1062 			 */
1063 			if (page_bad == TRUE)
1064 				continue;
1065 			/*
1066 			 * If this good page is a continuation of the
1067 			 * previous set of good pages, then just increase
1068 			 * the end pointer. Otherwise start a new chunk.
1069 			 * Note that the recorded end address points one past
1070 			 * the last byte, making the range >= start and < end.
1071 			 * If we're also doing a speculative memory
1072 			 * test and we are at or past the end, bump up Maxmem
1073 			 * so that we keep going. The first bad page
1074 			 * will terminate the loop.
1075 			 */
1076 			if (phys_avail[pa_indx] == pa) {
1077 				phys_avail[pa_indx] += PAGE_SIZE;
1078 			} else {
1079 				pa_indx++;
1080 				if (pa_indx == PHYS_AVAIL_ENTRIES) {
1081 					printf(
1082 		"Too many holes in the physical address space, giving up\n");
1083 					pa_indx--;
1084 					full = TRUE;
1085 					goto do_dump_avail;
1086 				}
1087 				phys_avail[pa_indx++] = pa;	/* start */
1088 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
1089 			}
1090 			physmem++;
1091 do_dump_avail:
1092 			if (dump_avail[da_indx] == pa) {
1093 				dump_avail[da_indx] += PAGE_SIZE;
1094 			} else {
1095 				da_indx++;
1096 				if (da_indx == PHYS_AVAIL_ENTRIES) {
1097 					da_indx--;
1098 					goto do_next;
1099 				}
1100 				dump_avail[da_indx++] = pa; /* start */
1101 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
1102 			}
1103 do_next:
1104 			if (full)
1105 				break;
1106 		}
1107 	}
1108 	*pte = 0;
1109 	invltlb();
1110 	if (memtest != 0)
1111 		printf("\n");
1112 
1113 	/*
1114 	 * XXX
1115 	 * The last chunk must contain at least one page plus the message
1116 	 * buffer to avoid complicating other code (message buffer address
1117 	 * calculation, etc.).
1118 	 */
1119 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
1120 	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
1121 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1122 		phys_avail[pa_indx--] = 0;
1123 		phys_avail[pa_indx--] = 0;
1124 	}
1125 
1126 	Maxmem = atop(phys_avail[pa_indx]);
1127 
1128 	/* Trim off space for the message buffer. */
1129 	phys_avail[pa_indx] -= round_page(msgbufsize);
1130 
1131 	/* Map the message buffer. */
1132 	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
1133 	TSEXIT();
1134 }
1135 
1136 static caddr_t
1137 native_parse_preload_data(u_int64_t modulep)
1138 {
1139 	caddr_t kmdp;
1140 	char *envp;
1141 #ifdef DDB
1142 	vm_offset_t ksym_start;
1143 	vm_offset_t ksym_end;
1144 #endif
1145 
1146 	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
1147 	preload_bootstrap_relocate(KERNBASE);
1148 	kmdp = preload_search_by_type("elf kernel");
1149 	if (kmdp == NULL)
1150 		kmdp = preload_search_by_type("elf64 kernel");
1151 	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
1152 	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
1153 	if (envp != NULL)
1154 		envp += KERNBASE;
1155 	init_static_kenv(envp, 0);
1156 #ifdef DDB
1157 	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
1158 	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
1159 	db_fetch_ksymtab(ksym_start, ksym_end, 0);
1160 #endif
1161 	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
1162 
1163 	return (kmdp);
1164 }
1165 
1166 static void
1167 native_clock_source_init(void)
1168 {
1169 	i8254_init();
1170 }
1171 
1172 static void
1173 amd64_kdb_init(void)
1174 {
1175 	kdb_init();
1176 #ifdef KDB
1177 	if (boothowto & RB_KDB)
1178 		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
1179 #endif
1180 }
1181 
1182 /* Set up the fast syscall stuff */
1183 void
1184 amd64_conf_fast_syscall(void)
1185 {
1186 	uint64_t msr;
1187 
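	/*
	 * EFER.SCE enables the SYSCALL/SYSRET instructions.  LSTAR holds
	 * the 64-bit syscall entry point and CSTAR the 32-bit compat entry
	 * point.  STAR[47:32] is the kernel CS selector loaded by SYSCALL
	 * (SS is that value + 8); STAR[63:48] is the selector base used by
	 * SYSRET (32-bit CS = base, SS = base + 8, 64-bit CS = base + 16).
	 * SF_MASK lists the RFLAGS bits cleared on syscall entry.
	 */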
1188 	msr = rdmsr(MSR_EFER) | EFER_SCE;
1189 	wrmsr(MSR_EFER, msr);
1190 	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1191 	    (u_int64_t)IDTVEC(fast_syscall));
1192 	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
1193 	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1194 	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1195 	wrmsr(MSR_STAR, msr);
1196 	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
1197 }
1198 
1199 void
1200 amd64_bsp_pcpu_init1(struct pcpu *pc)
1201 {
1202 	struct user_segment_descriptor *gdt;
1203 
1204 	PCPU_SET(prvspace, pc);
1205 	gdt = *PCPU_PTR(gdt);
1206 	PCPU_SET(curthread, &thread0);
1207 	PCPU_SET(tssp, PCPU_PTR(common_tss));
1208 	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1209 	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
1210 	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
1211 	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
1212 	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
1213 	PCPU_SET(smp_tlb_gen, 1);
1214 }
1215 
1216 void
1217 amd64_bsp_pcpu_init2(uint64_t rsp0)
1218 {
1219 
1220 	PCPU_SET(rsp0, rsp0);
1221 	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1222 	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1223 	PCPU_SET(curpcb, thread0.td_pcb);
1224 }
1225 
1226 void
1227 amd64_bsp_ist_init(struct pcpu *pc)
1228 {
1229 	struct nmi_pcpu *np;
1230 	struct amd64tss *tssp;
1231 
1232 	tssp = &pc->pc_common_tss;
1233 
1234 	/* doublefault stack space, runs on ist1 */
1235 	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
1236 	np->np_pcpu = (register_t)pc;
1237 	tssp->tss_ist1 = (long)np;
1238 
1239 	/*
1240 	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
1241 	 * above the start of the ist2 stack.
1242 	 */
1243 	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
1244 	np->np_pcpu = (register_t)pc;
1245 	tssp->tss_ist2 = (long)np;
1246 
1247 	/*
1248 	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
1249 	 * above the start of the ist3 stack.
1250 	 */
1251 	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
1252 	np->np_pcpu = (register_t)pc;
1253 	tssp->tss_ist3 = (long)np;
1254 
1255 	/*
1256 	 * DB# stack, runs on ist4.
1257 	 */
1258 	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
1259 	np->np_pcpu = (register_t)pc;
1260 	tssp->tss_ist4 = (long)np;
1261 }
1262 
1263 /*
1264  * Calculate the kernel load address by inspecting the page tables created by the loader.
1265  * The assumptions:
1266  * - kernel is mapped at KERNBASE, backed by contiguous phys memory
1267  *   aligned at 2M, below 4G (the latter is important for AP startup)
1268  * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M)
1269  * - kernel is mapped with 2M superpages
1270  * - all participating memory, i.e. kernel, modules, metadata,
1271  *   page table is accessible by pre-created 1:1 mapping
1272  *   (right now loader creates 1:1 mapping for lower 4G, and all
1273  *   memory is from there)
1274  * - there is a usable memory block right after the end of the
1275  *   mapped kernel and all modules/metadata, pointed to by
1276  *   physfree, for early allocations
1277  */
1278 vm_paddr_t __nosanitizeaddress __nosanitizememory
1279 amd64_loadaddr(void)
1280 {
1281 	pml4_entry_t *pml4e;
1282 	pdp_entry_t *pdpe;
1283 	pd_entry_t *pde;
1284 	uint64_t cr3;
1285 
1286 	cr3 = rcr3();
1287 	pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART);
1288 	pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART);
1289 	pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART);
1290 	return (*pde & PG_FRAME);
1291 }
1292 
1293 u_int64_t
1294 hammer_time(u_int64_t modulep, u_int64_t physfree)
1295 {
1296 	caddr_t kmdp;
1297 	int gsel_tss, x;
1298 	struct pcpu *pc;
1299 	uint64_t rsp0;
1300 	char *env;
1301 	struct user_segment_descriptor *gdt;
1302 	struct region_descriptor r_gdt;
1303 	size_t kstack0_sz;
1304 
1305 	TSRAW(&thread0, TS_ENTER, __func__, NULL);
1306 
1307 	kernphys = amd64_loadaddr();
1308 
1309 	physfree += kernphys;
1310 
1311 	kmdp = init_ops.parse_preload_data(modulep);
1312 
1313 	efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
1314 	    MODINFOMD_EFI_MAP) != NULL;
1315 
1316 	if (!efi_boot) {
1317 		/* Tell the bios to warmboot next time */
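		/*
		 * 0x472 is the BIOS data area reset-flag word; storing
		 * 0x1234 there requests a warm boot (no memory test) on
		 * the next reset.
		 */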
1318 		atomic_store_short((u_short *)0x472, 0x1234);
1319 	}
1320 
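	/*
	 * Apply any loader-provided microcode update to the BSP before the
	 * CPU is identified; the update image is kept resident (physfree is
	 * advanced past it) so the APs can load it later.
	 */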
1321 	physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
1322 	physfree = roundup2(physfree, PAGE_SIZE);
1323 
1324 	identify_cpu1();
1325 	identify_hypervisor();
1326 	identify_hypervisor_smbios();
1327 	identify_cpu_fixup_bsp();
1328 	identify_cpu2();
1329 	initializecpucache();
1330 
1331 	/*
1332 	 * Check for pti, pcid, and invpcid before ifuncs are
1333 	 * resolved, to correctly select the implementation for
1334 	 * pmap_activate_sw_mode().
1335 	 */
1336 	pti = pti_get_default();
1337 	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1338 	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
1339 	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
1340 		invpcid_works = (cpu_stdext_feature &
1341 		    CPUID_STDEXT_INVPCID) != 0;
1342 	} else {
1343 		pmap_pcid_enabled = 0;
1344 	}
1345 
1346 	/*
1347 	 * Now we can do small core initialization, after the PCID
1348 	 * CPU features and user knobs are evaluated.
1349 	 */
1350 	TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
1351 	    &pmap_pcid_invlpg_workaround_uena);
1352 	cpu_init_small_core();
1353 
1354 	if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
1355 		use_xsave = 1;
1356 		TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave);
1357 	}
1358 
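	/*
	 * Resolve kernel ifuncs now; the CPU feature bits and tunables
	 * consulted by their resolvers have been initialized above.
	 */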
1359 	link_elf_ireloc(kmdp);
1360 
1361 	/*
1362 	 * This may be done better later if it gets more high level
1363 	 * components in it. If so just link td->td_proc here.
1364 	 */
1365 	proc_linkup0(&proc0, &thread0);
1366 
1367 	/* Init basic tunables, hz etc */
1368 	init_param1();
1369 
1370 	thread0.td_kstack = physfree - kernphys + KERNSTART;
1371 	thread0.td_kstack_pages = kstack_pages;
1372 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1373 	bzero((void *)thread0.td_kstack, kstack0_sz);
1374 	physfree += kstack0_sz;
1375 
1376 	/*
1377 	 * Initialize enough of thread0 for delayed invalidation to
1378 	 * work very early.  Rely on thread0.td_base_pri
1379 	 * zero-initialization, it is reset to PVM at proc0_init().
1380 	 */
1381 	pmap_thread_init_invl_gen(&thread0);
1382 
1383 	pc = &temp_bsp_pcpu;
1384 	pcpu_init(pc, 0, sizeof(struct pcpu));
1385 	gdt = &temp_bsp_pcpu.pc_gdt[0];
1386 
1387 	/*
1388 	 * make gdt memory segments
1389 	 */
1390 	for (x = 0; x < NGDT; x++) {
1391 		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1392 		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
1393 			ssdtosd(&gdt_segs[x], &gdt[x]);
1394 	}
1395 	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
1396 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
1397 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1398 
1399 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1400 	r_gdt.rd_base = (long)gdt;
1401 	lgdt(&r_gdt);
1402 
1403 	wrmsr(MSR_FSBASE, 0);		/* User value */
1404 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
1405 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
1406 
1407 	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
1408 	physfree += DPCPU_SIZE;
1409 	amd64_bsp_pcpu_init1(pc);
1410 	/* Non-late cninit() and printf() can be moved up to here. */
1411 
1412 	/*
1413 	 * Initialize mutexes.
1414 	 *
1415 	 * icu_lock: in order to allow an interrupt to occur in a critical
1416 	 * 	     section, to set pcpu->ipending (etc...) properly, we
1417 	 *	     must be able to get the icu lock, so it can't be
1418 	 *	     under witness.
1419 	 */
1420 	mutex_init();
1421 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1422 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1423 
1424 	/* exceptions */
1425 	for (x = 0; x < NIDT; x++)
1426 		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1427 		    SEL_KPL, 0);
1428 	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1429 	    SEL_KPL, 0);
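	/*
	 * Non-zero IST arguments below select the dedicated stacks set up
	 * in amd64_bsp_ist_init(): 1 = double fault, 2 = NMI, 3 = MC#,
	 * 4 = DB#.
	 */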
1430 	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1431 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
1432 	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1433 	    SEL_UPL, 0);
1434 	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1435 	    SEL_UPL, 0);
1436 	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1437 	    SEL_KPL, 0);
1438 	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1439 	    SEL_KPL, 0);
1440 	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1441 	    SEL_KPL, 0);
1442 	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1443 	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1444 	    SDT_SYSIGT, SEL_KPL, 0);
1445 	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1446 	    SEL_KPL, 0);
1447 	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1448 	    SDT_SYSIGT, SEL_KPL, 0);
1449 	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1450 	    SEL_KPL, 0);
1451 	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1452 	    SEL_KPL, 0);
1453 	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1454 	    SEL_KPL, 0);
1455 	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1456 	    SEL_KPL, 0);
1457 	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1458 	    SEL_KPL, 0);
1459 	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1460 	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1461 	    SEL_KPL, 0);
1462 #ifdef KDTRACE_HOOKS
1463 	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1464 	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1465 #endif
1466 #ifdef XENHVM
1467 	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1468 	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1469 #endif
1470 	r_idt.rd_limit = sizeof(idt0) - 1;
1471 	r_idt.rd_base = (long) idt;
1472 	lidt(&r_idt);
1473 
1474 	/*
1475 	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
1476 	 * transition).
1477 	 * Once bootblocks have updated, we can test directly for
1478 	 * efi_systbl != NULL here...
1479 	 */
1480 	if (efi_boot)
1481 		vty_set_preferred(VTY_VT);
1482 
1483 	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1484 	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);
1485 
1486 	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1487 	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);
1488 
1489 	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
1490 	    &syscall_ret_l1d_flush_mode);
1491 
1492 	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1493 	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);
1494 
1495 	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
1496 
1497 	TUNABLE_INT_FETCH("machdep.mitigations.rngds.enable",
1498 	    &x86_rngds_mitg_enable);
1499 
1500 	finishidentcpu();	/* Final stage of CPU initialization */
1501 
1502 	/*
1503 	 * Initialize the clock before the console so that console
1504 	 * initialization can use DELAY().
1505 	 */
1506 	clock_init();
1507 
1508 	initializecpu();	/* Initialize CPU registers */
1509 
1510 	amd64_bsp_ist_init(pc);
1511 
1512 	/* Set the IO permission bitmap (empty due to tss seg limit) */
1513 	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
1514 	    IOPERM_BITMAP_SIZE;
1515 
1516 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1517 	ltr(gsel_tss);
1518 
1519 	amd64_conf_fast_syscall();
1520 
1521 	/*
1522 	 * We initialize the PCB pointer early so that exception
1523 	 * handlers will work.  Also set up td_critnest to short-cut
1524 	 * the page fault handler.
1525 	 */
1526 	cpu_max_ext_state_size = sizeof(struct savefpu);
1527 	set_top_of_stack_td(&thread0);
1528 	thread0.td_pcb = get_pcb_td(&thread0);
1529 	thread0.td_critnest = 1;
1530 
1531 	/*
1532 	 * The console and kdb should be initialized even earlier than here,
1533 	 * but some console drivers don't work until after getmemsize().
1534 	 * Default to late console initialization to support these drivers.
1535 	 * This loses mainly printf()s in getmemsize() and early debugging.
1536 	 */
1537 	TUNABLE_INT_FETCH("debug.late_console", &late_console);
1538 	if (!late_console) {
1539 		cninit();
1540 		amd64_kdb_init();
1541 	}
1542 
1543 	getmemsize(kmdp, physfree);
1544 	init_param2(physmem);
1545 
1546 	/* now running on new page tables, configured, and u/iom is accessible */
1547 
1548 #ifdef DEV_PCI
1549 	/* This call might adjust phys_avail[]. */
1550 	pci_early_quirks();
1551 #endif
1552 
1553 	if (late_console)
1554 		cninit();
1555 
1556 	/*
1557 	 * Dump the boot metadata. We have to wait for cninit() since console
1558 	 * output is required. If it's grossly incorrect the kernel will never
1559 	 * make it this far.
1560 	 */
1561 	if (getenv_is_true("debug.dump_modinfo_at_boot"))
1562 		preload_dump();
1563 
1564 #ifdef DEV_ISA
1565 #ifdef DEV_ATPIC
1566 	elcr_probe();
1567 	atpic_startup();
1568 #else
1569 	/* Reset and mask the atpics and leave them shut down. */
1570 	atpic_reset();
1571 
1572 	/*
1573 	 * Point the ICU spurious interrupt vectors at the APIC spurious
1574 	 * interrupt handler.
1575 	 */
1576 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1577 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1578 #endif
1579 #else
1580 #error "have you forgotten the isa device?"
1581 #endif
1582 
1583 	if (late_console)
1584 		amd64_kdb_init();
1585 
1586 	msgbufinit(msgbufp, msgbufsize);
1587 	fpuinit();
1588 
1589 	/* make an initial tss so cpu can get interrupt stack on syscall! */
1590 	rsp0 = thread0.td_md.md_stack_base;
1591 	/* Ensure the stack is aligned to 16 bytes */
1592 	rsp0 &= ~0xFul;
1593 	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
1594 	amd64_bsp_pcpu_init2(rsp0);
1595 
1596 	/* transfer to user mode */
1597 
1598 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1599 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1600 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1601 	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1602 	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1603 
1604 	load_ds(_udatasel);
1605 	load_es(_udatasel);
1606 	load_fs(_ufssel);
1607 
1608 	/* setup proc 0's pcb */
1609 	thread0.td_pcb->pcb_flags = 0;
1610 
1611 	env = kern_getenv("kernelname");
1612 	if (env != NULL)
1613 		strlcpy(kernelname, env, sizeof(kernelname));
1614 
1615 	kcsan_cpu_init(0);
1616 
1617 #ifdef FDT
1618 	x86_init_fdt();
1619 #endif
1620 	thread0.td_critnest = 0;
1621 
1622 	kasan_init();
1623 	kmsan_init();
1624 
1625 	TSEXIT();
1626 
1627 	/* Location of kernel stack for locore */
1628 	return (thread0.td_md.md_stack_base);
1629 }
1630 
1631 void
1632 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1633 {
1634 
1635 	pcpu->pc_acpi_id = 0xffffffff;
1636 }
1637 
1638 static int
1639 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1640 {
1641 	struct bios_smap *smapbase;
1642 	struct bios_smap_xattr smap;
1643 	caddr_t kmdp;
1644 	uint32_t *smapattr;
1645 	int count, error, i;
1646 
1647 	/* Retrieve the system memory map from the loader. */
1648 	kmdp = preload_search_by_type("elf kernel");
1649 	if (kmdp == NULL)
1650 		kmdp = preload_search_by_type("elf64 kernel");
1651 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
1652 	    MODINFO_METADATA | MODINFOMD_SMAP);
1653 	if (smapbase == NULL)
1654 		return (0);
1655 	smapattr = (uint32_t *)preload_search_info(kmdp,
1656 	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1657 	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1658 	error = 0;
1659 	for (i = 0; i < count; i++) {
1660 		smap.base = smapbase[i].base;
1661 		smap.length = smapbase[i].length;
1662 		smap.type = smapbase[i].type;
1663 		if (smapattr != NULL)
1664 			smap.xattr = smapattr[i];
1665 		else
1666 			smap.xattr = 0;
1667 		error = SYSCTL_OUT(req, &smap, sizeof(smap));
1668 	}
1669 	return (error);
1670 }
1671 SYSCTL_PROC(_machdep, OID_AUTO, smap,
1672     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1673     smap_sysctl_handler, "S,bios_smap_xattr",
1674     "Raw BIOS SMAP data");
1675 
1676 static int
1677 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1678 {
1679 	struct efi_map_header *efihdr;
1680 	caddr_t kmdp;
1681 	uint32_t efisize;
1682 
1683 	kmdp = preload_search_by_type("elf kernel");
1684 	if (kmdp == NULL)
1685 		kmdp = preload_search_by_type("elf64 kernel");
1686 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1687 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1688 	if (efihdr == NULL)
1689 		return (0);
1690 	efisize = *((uint32_t *)efihdr - 1);
1691 	return (SYSCTL_OUT(req, efihdr, efisize));
1692 }
1693 SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
1694     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1695     efi_map_sysctl_handler, "S,efi_map_header",
1696     "Raw EFI Memory Map");
1697 
1698 void
1699 spinlock_enter(void)
1700 {
1701 	struct thread *td;
1702 	register_t flags;
1703 
1704 	td = curthread;
1705 	if (td->td_md.md_spinlock_count == 0) {
1706 		flags = intr_disable();
1707 		td->td_md.md_spinlock_count = 1;
1708 		td->td_md.md_saved_flags = flags;
1709 		critical_enter();
1710 	} else
1711 		td->td_md.md_spinlock_count++;
1712 }
1713 
1714 void
1715 spinlock_exit(void)
1716 {
1717 	struct thread *td;
1718 	register_t flags;
1719 
1720 	td = curthread;
1721 	flags = td->td_md.md_saved_flags;
1722 	td->td_md.md_spinlock_count--;
1723 	if (td->td_md.md_spinlock_count == 0) {
1724 		critical_exit();
1725 		intr_restore(flags);
1726 	}
1727 }
1728 
1729 /*
1730  * Construct a PCB from a trapframe. This is called from kdb_trap() where
1731  * we want to start a backtrace from the function that caused us to enter
1732  * the debugger. We have the context in the trapframe, but base the trace
1733  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1734  * enough for a backtrace.
1735  */
1736 void
1737 makectx(struct trapframe *tf, struct pcb *pcb)
1738 {
1739 
1740 	pcb->pcb_r12 = tf->tf_r12;
1741 	pcb->pcb_r13 = tf->tf_r13;
1742 	pcb->pcb_r14 = tf->tf_r14;
1743 	pcb->pcb_r15 = tf->tf_r15;
1744 	pcb->pcb_rbp = tf->tf_rbp;
1745 	pcb->pcb_rbx = tf->tf_rbx;
1746 	pcb->pcb_rip = tf->tf_rip;
1747 	pcb->pcb_rsp = tf->tf_rsp;
1748 }
1749 
1750 /*
1751  * The pcb_flags member is only modified by the current thread, or by other
1752  * threads when the current thread is stopped.  However, the current thread
1753  * may change it from interrupt context in cpu_switch() or the trap handler.
1754  * When we read-modify-write pcb_flags from C sources, compiler may generate
1755  * code that is not atomic regarding the interrupt handler.  If a trap or
1756  * interrupt happens and any flag is modified from the handler, it can be
1757  * clobbered with the cached value later.  Therefore, we implement setting
1758  * and clearing flags with single-instruction functions, which do not race
1759  * with possible modification of the flags from the trap or interrupt context,
1760  * because traps and interrupts are executed only on instruction boundary.
1761  */
1762 void
1763 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
1764 {
1765 
1766 	__asm __volatile("orl %1,%0"
1767 	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
1768 	    : "cc", "memory");
1769 
1770 }
1771 
1772 /*
1773  * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
1774  * base requires that the kernel save MSR_FSBASE and MSR_{K,}GSBASE into
1775  * the pcb if user space modified the bases.  We must save them on a
1776  * context switch or when the return to usermode happens through doreti.
1777  *
1778  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
1779  * which has the consequence that the base MSRs must be saved each time
1780  * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
1781  * context switches.
1782  */
1783 static void
1784 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
1785 {
1786 	register_t r;
1787 
1788 	if (curpcb == pcb &&
1789 	    (flags & PCB_FULL_IRET) != 0 &&
1790 	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1791 		r = intr_disable();
1792 		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1793 			if (rfs() == _ufssel)
1794 				pcb->pcb_fsbase = rdfsbase();
1795 			if (rgs() == _ugssel)
1796 				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
1797 		}
1798 		set_pcb_flags_raw(pcb, flags);
1799 		intr_restore(r);
1800 	} else {
1801 		set_pcb_flags_raw(pcb, flags);
1802 	}
1803 }
1804 
1805 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
1806 {
1807 
1808 	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
1809 	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
1810 }
1811 
1812 void
1813 clear_pcb_flags(struct pcb *pcb, const u_int flags)
1814 {
1815 
1816 	__asm __volatile("andl %1,%0"
1817 	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
1818 	    : "cc", "memory");
1819 }
1820 
1821 #ifdef KDB
1822 
1823 /*
1824  * Provide inb() and outb() as functions.  They are normally only available as
1825  * inline functions, thus cannot be called from the debugger.
1826  */
1827 
1828 /* silence compiler warnings */
1829 u_char inb_(u_short);
1830 void outb_(u_short, u_char);
1831 
1832 u_char
1833 inb_(u_short port)
1834 {
1835 	return inb(port);
1836 }
1837 
1838 void
1839 outb_(u_short port, u_char data)
1840 {
1841 	outb(port, data);
1842 }
1843 
1844 #endif /* KDB */
1845 
1846 #undef memset
1847 #undef memmove
1848 #undef memcpy
1849 
1850 void	*memset_std(void *buf, int c, size_t len);
1851 void	*memset_erms(void *buf, int c, size_t len);
1852 void    *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
1853 	    size_t len);
1854 void    *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
1855 	    size_t len);
1856 void    *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
1857 	    size_t len);
1858 void    *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
1859 	    size_t len);
1860 
1861 #ifdef KCSAN
1862 /*
1863  * These fail to build as ifuncs when used with KCSAN.
1864  */
1865 void *
1866 memset(void *buf, int c, size_t len)
1867 {
1868 
1869 	return (memset_std(buf, c, len));
1870 }
1871 
1872 void *
1873 memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1874 {
1875 
1876 	return (memmove_std(dst, src, len));
1877 }
1878 
1879 void *
1880 memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1881 {
1882 
1883 	return (memcpy_std(dst, src, len));
1884 }
1885 #else
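/*
 * Pick the ERMS ("Enhanced REP MOVSB/STOSB") variants when the CPU
 * advertises CPUID_STDEXT_ERMS; on such CPUs a plain rep movsb/stosb
 * is at least as fast as the unrolled copy loops.
 */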
1886 DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
1887 {
1888 
1889 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1890 	    memset_erms : memset_std);
1891 }
1892 
1893 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
1894     size_t))
1895 {
1896 
1897 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1898 	    memmove_erms : memmove_std);
1899 }
1900 
1901 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
1902 {
1903 
1904 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1905 	    memcpy_erms : memcpy_std);
1906 }
1907 #endif
1908 
1909 void	pagezero_std(void *addr);
1910 void	pagezero_erms(void *addr);
1911 DEFINE_IFUNC(, void , pagezero, (void *))
1912 {
1913 
1914 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1915 	    pagezero_erms : pagezero_std);
1916 }
1917