/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_atpic.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_pci.h"
#include "opt_platform.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/asan.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/csan.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msan.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/reg.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_dumpset.h>

#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif

#include <net/netisr.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/frame.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#include <machine/trap.h>
#include <machine/tss.h>
#include <x86/ucode.h>
#include <x86/ifunc.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef FDT
#include <x86/fdt.h>
#endif

#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <x86/apicvar.h>
#endif

#include <isa/isareg.h>
#include <isa/rtc.h>
#include <x86/init.h>

/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);

/*
 * The PTI trampoline stack needs enough space for a hardware trapframe and a
 * couple of scratch registers, as well as the trapframe left behind after an
 * iret fault.
 */
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
    offsetof(struct pti_frame, pti_rip));
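
/*
 * The bound above works out to one full pti_frame plus the tail of a second
 * frame starting at pti_rip: on an iret fault the CPU pushes a fresh
 * hardware frame onto the trampoline stack while the scratch area of the
 * original frame is still live.
 */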

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

static void cpu_startup(void *);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/* Default init_ops implementation. */
struct init_ops init_ops = {
	.parse_preload_data =	native_parse_preload_data,
	.early_clock_source_init =	i8254_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
};

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;

/* Intel ICH registers */
#define ICH_PMBASE	0x400
#define ICH_SMI_EN	(ICH_PMBASE + 0x30)
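
/*
 * In the ICH SMI_EN register, bit 3 (0x8) is the LEGACY_USB_EN enable (per
 * the Intel ICH datasheets); cpu_startup() below clears it to keep the
 * legacy USB circuit from raising SMI#.
 */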

int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

int cold = 1;

long Maxmem = 0;
long realmem = 0;

struct kva_md_info kmi;

struct region_descriptor r_idt;

struct pcpu *__pcpu;
struct pcpu temp_bsp_pcpu;

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

void (*vmm_resume_p)(void);

bool efi_boot;

static void
cpu_startup(void *dummy)
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to prevent the legacy USB circuit from
	 * generating an SMI# because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();

	/*
	 * Display physical memory if SMBIOS reports a reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	if (memsize < ptoa((uintmax_t)vm_free_count()))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_free_count()),
	    ptoa((uintmax_t)vm_free_count()) / 1048576);
#ifdef DEV_PCI
	if (bootverbose && intel_graphics_stolen_base != 0)
		printf("intel stolen mem: base %#jx size %ju MB\n",
		    (uintmax_t)intel_graphics_stolen_base,
		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
#endif

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

static void
late_ifunc_resolve(void *dummy __unused)
{
	link_elf_late_ireloc();
}
SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP.  See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* 10 Upper half of the TSS descriptor: system descriptors are double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* 12 Upper half of the double-size GUSERLDT_SEL LDT descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");

void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func) >> 16;
}

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t0x%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif

void
sdtossd(struct user_segment_descriptor *sd,
    struct soft_segment_descriptor *ssd)
{

	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_long  = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

void
ssdtosd(struct soft_segment_descriptor *ssd,
    struct user_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;
}

void
ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
}

u_int basemem;

static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}
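
/*
 * physmap[] holds [base, end) pairs in ascending order, with physmap_idx
 * pointing at the next free slot.  For instance, a new region that begins
 * exactly where pair (physmap[i], physmap[i + 1]) ends extends that pair in
 * place instead of consuming a slot, so adjacent e820/EFI entries coalesce.
 */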

void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
    vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}

static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * i.e., a u_int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
901 		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;	/* MAXMEM is in KB; convert to 4 KB pages. */
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write, non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that "end" points one page past the last
			 * valid page, making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we're at or past the end, bump up Maxmem
			 * so that we keep going. The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ENTRIES) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == PHYS_AVAIL_ENTRIES) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}

static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end, 0);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/* Set up the fast syscall stuff */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}
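
/*
 * MSR_STAR packs the selector bases: SYSCALL loads %cs from STAR[47:32]
 * (the kernel code selector above) and SYSRET derives the user selectors
 * from STAR[63:48].  MSR_SF_MASK lists the %rflags bits cleared on entry;
 * PSL_I in the mask means the handler starts with interrupts disabled.
 */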

void
amd64_bsp_pcpu_init1(struct pcpu *pc)
{
	struct user_segment_descriptor *gdt;

	PCPU_SET(prvspace, pc);
	gdt = *PCPU_PTR(gdt);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(tssp, PCPU_PTR(common_tss));
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
	PCPU_SET(smp_tlb_gen, 1);
}

void
amd64_bsp_pcpu_init2(uint64_t rsp0)
{

	PCPU_SET(rsp0, rsp0);
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);
}

void
amd64_bsp_ist_init(struct pcpu *pc)
{
	struct nmi_pcpu *np;
	struct amd64tss *tssp;

	tssp = &pc->pc_common_tss;

	/* doublefault stack space, runs on ist1 */
	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist1 = (long)np;

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist2 = (long)np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist3 = (long)np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist4 = (long)np;
}
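
/*
 * The pcpu pointer is stashed at the top of each IST stack because NMI and
 * MC# can arrive while the %gs base still holds the user value, so their
 * handlers cannot trust PCPU_GET(); they recover the pcpu address from
 * their own stack instead.
 */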

u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	uint64_t cr3, rsp0;
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	char *env;
	struct user_segment_descriptor *gdt;
	struct region_descriptor r_gdt;
	size_t kstack0_sz;
	int late_console;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	/*
	 * Calculate kernphys by inspecting page table created by loader.
	 * The assumptions:
	 * - kernel is mapped at KERNBASE, backed by contiguous phys memory
	 *   aligned at 2M, below 4G (the latter is important for AP startup)
	 * - there is a 2M hole at KERNBASE
	 * - kernel is mapped with 2M superpages
	 * - all participating memory, i.e. kernel, modules, metadata,
	 *   page table is accessible by pre-created 1:1 mapping
	 *   (right now loader creates 1:1 mapping for lower 4G, and all
	 *   memory is from there)
	 * - there is a usable memory block right after the end of the
	 *   mapped kernel and all modules/metadata, pointed to by
	 *   physfree, for early allocations
	 */
	cr3 = rcr3();
	pml4e = (pml4_entry_t *)(cr3 & ~PAGE_MASK) + pmap_pml4e_index(
	    (vm_offset_t)hammer_time);
	pdpe = (pdp_entry_t *)(*pml4e & ~PAGE_MASK) + pmap_pdpe_index(
	    (vm_offset_t)hammer_time);
	pde = (pd_entry_t *)(*pdpe & ~PAGE_MASK) + pmap_pde_index(
	    (vm_offset_t)hammer_time);
	kernphys = (vm_paddr_t)(*pde & ~PDRMASK) -
	    (vm_paddr_t)(((vm_offset_t)hammer_time - KERNBASE) & ~PDRMASK);

	/* Fix-up for 2M hole */
	physfree += kernphys;
	kernphys += NBPDR;

	kmdp = init_ops.parse_preload_data(modulep);

	efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
	    MODINFOMD_EFI_MAP) != NULL;

	if (!efi_boot) {
		/* Tell the bios to warmboot next time */
		atomic_store_short((u_short *)0x472, 0x1234);
	}

	physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	identify_cpu_fixup_bsp();
	identify_cpu2();
	initializecpucache();

	/*
	 * Check for pti, pcid, and invpcid before ifuncs are
	 * resolved, to correctly select the implementation for
	 * pmap_activate_sw_mode().
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		invpcid_works = (cpu_stdext_feature &
		    CPUID_STDEXT_INVPCID) != 0;
	} else {
		pmap_pcid_enabled = 0;
	}

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	thread0.td_kstack = physfree - kernphys + KERNSTART;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * Initialize enough of thread0 for delayed invalidation to
	 * work very early.  Rely on thread0.td_base_pri
	 * zero-initialization, it is reset to PVM at proc0_init().
	 */
	pmap_thread_init_invl_gen(&thread0);

	pc = &temp_bsp_pcpu;
	pcpu_init(pc, 0, sizeof(struct pcpu));
	gdt = &temp_bsp_pcpu.pc_gdt[0];

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long)gdt;
	lgdt(&r_gdt);

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
	physfree += DPCPU_SIZE;
	amd64_bsp_pcpu_init1(pc);
	/* Non-late cninit() and printf() can be moved up to here. */

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long)idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have been updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (efi_boot)
		vty_set_preferred(VTY_VT);

	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);

	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);

	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
	    &syscall_ret_l1d_flush_mode);

	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);

	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);

	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
	    &x86_rngds_mitg_enable);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */

	amd64_bsp_ist_init(pc);

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
	    IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * We initialize the PCB pointer early so that exception
	 * handlers will work.  Also set up td_critnest to short-cut
	 * the page fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	set_top_of_stack_td(&thread0);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

#ifdef DEV_PCI
	/* This call might adjust phys_avail[]. */
	pci_early_quirks();
#endif

	if (late_console)
		cninit();

	/*
	 * Dump the boot metadata. We have to wait for cninit() since console
	 * output is required. If it's grossly incorrect the kernel will never
	 * make it this far.
	 */
	if (getenv_is_true("debug.dump_modinfo_at_boot"))
		preload_dump();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?"
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = thread0.td_md.md_stack_base;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
	amd64_bsp_pcpu_init2(rsp0);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	kcsan_cpu_init(0);

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	kasan_init();
	kmsan_init();

	TSEXIT();

	/* Location of kernel stack for locore */
	return (thread0.td_md.md_stack_base);
}

void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader. */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr",
    "Raw BIOS SMAP data");

static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header",
    "Raw EFI Memory Map");

void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
		critical_enter();
	} else
		td->td_md.md_spinlock_count++;
}

void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0) {
		critical_exit();
		intr_restore(flags);
	}
}

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
}

/*
 * pcb_flags is only modified by the current thread, or by other threads
 * when the current thread is stopped.  However, the current thread may
 * change it from the interrupt context in cpu_switch(), or in the trap
 * handler.  When we read-modify-write pcb_flags from C sources, the
 * compiler may generate code that is not atomic with respect to the
 * interrupt handler.  If a trap or interrupt happens and any flag is
 * modified from the handler, it can be clobbered with the cached value
 * later.  Therefore, we implement setting and clearing flags with
 * single-instruction functions, which do not race with possible
 * modification of the flags from the trap or interrupt context, because
 * traps and interrupts are executed only on instruction boundary.
 */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("orl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}
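
/*
 * A sketch of the race the single "orl" avoids: a plain C read-modify-write
 * such as
 *
 *	pcb->pcb_flags |= flags;
 *
 * may compile to separate load, or, and store instructions.  A trap taken
 * between the load and the store that itself sets a pcb flag would have
 * that update overwritten by the stale store; a single "orl" cannot be
 * split by an interrupt.
 */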

/*
 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
 * base requires that the kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
 * the pcb if user space modified the bases.  We must save them on the
 * context switch or if the return to usermode happens through doreti.
 *
 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 * which has the consequence that the base MSRs must be saved each time
 * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
 * context switches.
 */
static void
set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
{
	register_t r;

	if (curpcb == pcb &&
	    (flags & PCB_FULL_IRET) != 0 &&
	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
		r = intr_disable();
		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
			if (rfs() == _ufssel)
				pcb->pcb_fsbase = rdfsbase();
			if (rgs() == _ugssel)
				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
		}
		set_pcb_flags_raw(pcb, flags);
		intr_restore(r);
	} else {
		set_pcb_flags_raw(pcb, flags);
	}
}

DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
}

void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

#ifdef KDB

/*
 * Provide inb() and outb() as functions.  They are normally only available as
 * inline functions, thus cannot be called from the debugger.
 */

/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);

u_char
inb_(u_short port)
{
	return inb(port);
}

void
outb_(u_short port, u_char data)
{
	outb(port, data);
}

#endif /* KDB */

#undef memset
#undef memmove
#undef memcpy

void	*memset_std(void *buf, int c, size_t len);
void	*memset_erms(void *buf, int c, size_t len);
void	*memmove_std(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);

#ifdef KCSAN
/*
 * These fail to build as ifuncs when used with KCSAN.
 */
void *
memset(void *buf, int c, size_t len)
{

	return (memset_std(buf, c, len));
}

void *
memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memmove_std(dst, src, len));
}

void *
memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memcpy_std(dst, src, len));
}
#else
DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memset_erms : memset_std);
}

DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
    size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memmove_erms : memmove_std);
}

DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,
    size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memcpy_erms : memcpy_std);
}
#endif
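
/*
 * CPUID_STDEXT_ERMS advertises Enhanced REP MOVSB/STOSB, i.e. that the
 * microcoded "rep movsb"/"rep stosb" paths are at least as fast as unrolled
 * copy loops, so the _erms variants are selected once when the ifuncs are
 * resolved rather than branching on every call.
 */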

void	pagezero_std(void *addr);
void	pagezero_erms(void *addr);
DEFINE_IFUNC(, void, pagezero, (void *))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    pagezero_erms : pagezero_std);
}
1893