/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_atpic.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_pci.h"
#include "opt_platform.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/asan.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/csan.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msan.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/reg.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_dumpset.h>

#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif

#include <net/netisr.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/frame.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#include <machine/trap.h>
#include <machine/tss.h>
#include <x86/ucode.h>
#include <x86/ifunc.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef FDT
#include <x86/fdt.h>
#endif

#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <x86/apicvar.h>
#endif

#include <isa/isareg.h>
#include <isa/rtc.h>
#include <x86/init.h>

/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
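
/*
 * __curthread() fetches curthread with a single %gs-relative load that
 * assumes pc_curthread sits at the very start of struct pcpu; the
 * assertion above keeps that assumption honest.
 */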

/*
 * The PTI trampoline stack needs enough space for a hardware trapframe and a
 * couple of scratch registers, as well as the trapframe left behind after an
 * iret fault.
 */
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
    offsetof(struct pti_frame, pti_rip));

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

static void cpu_startup(void *);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Probe 8254 PIT and TSC. */
static void native_clock_source_init(void);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/* Default init_ops implementation. */
struct init_ops init_ops = {
	.parse_preload_data =		native_parse_preload_data,
	.early_clock_source_init =	native_clock_source_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
};
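
/*
 * Alternative early environments can install their own hooks here before
 * hammer_time() consumes them; the Xen PVH entry code, for instance,
 * replaces init_ops so that the memory map and early clock setup come from
 * hypervisor-supplied start info rather than BIOS/loader metadata.
 */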

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;
/* Intel ICH registers */
#define ICH_PMBASE	0x400
#define ICH_SMI_EN	(ICH_PMBASE + 0x30)	/* SMI Control and Enable */

int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

int cold = 1;

long Maxmem = 0;
long realmem = 0;
int late_console = 1;

struct kva_md_info kmi;

struct region_descriptor r_idt;

struct pcpu *__pcpu;
struct pcpu temp_bsp_pcpu;

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

void (*vmm_resume_p)(void);

bool efi_boot;

static void
cpu_startup(void *dummy)
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to disallow the legacy USB circuit to
	 * generate an SMI# because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();

	/*
	 * Display physical memory if SMBIOS reports a reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	if (memsize < ptoa((uintmax_t)vm_free_count()))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_free_count()),
	    ptoa((uintmax_t)vm_free_count()) / 1048576);
#ifdef DEV_PCI
	if (bootverbose && intel_graphics_stolen_base != 0)
		printf("intel stolen mem: base %#jx size %ju MB\n",
		    (uintmax_t)intel_graphics_stolen_base,
		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
#endif

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

static void
late_ifunc_resolve(void *dummy __unused)
{
	link_elf_late_ireloc();
}
SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP.  See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Actually, the TSS is a system descriptor which is double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");

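/*
 * Install a gate in IDT slot 'idx': handler 'func', gate type 'typ'
 * (e.g. SDT_SYSIGT), descriptor privilege level 'dpl' (SEL_KPL or
 * SEL_UPL), and IST index 'ist' (0 selects the legacy stack-switch
 * mechanism).  The 64-bit handler offset is split between gd_looffset
 * and gd_hioffset.
 */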
void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func) >> 16;
}

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t0x%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif

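/*
 * Converters between the "software prototype" segment descriptor form
 * (struct soft_segment_descriptor, with base and limit held as plain
 * integers) and the packed hardware layouts, in which base and limit are
 * scattered across lo/hi bitfields.  ssdtosyssd() is the variant for
 * 16-byte system descriptors (TSS, LDT), whose base extends to 64 bits.
 */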
void
sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
{

	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_long  = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

void
ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;
}

void
ssdtosyssd(struct soft_segment_descriptor *ssd, struct system_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
}

u_int basemem;

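/*
 * physmap[] holds the raw memory map as [base, end) byte-address pairs,
 * with *physmap_idxp pointing at the next free slot.  As a purely
 * illustrative (hypothetical) example, a machine with 639K of base memory
 * and 2G of extended memory starting at 1M would end up with:
 *
 *	physmap[] = { 0x1000, 0x9fc00, 0x100000, 0x80000000, 0, ... }
 *
 * add_physmap_entry() keeps the pairs sorted and coalesces adjacent
 * ranges; it returns 0 only when the array would overflow, telling the
 * memmap parsers to stop feeding it entries.
 */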
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
    vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE,
		    physmap, physmap_idx))
			break;
	}
}

static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * i.e., an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it\n");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,
			 * non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one page beyond the last
			 * valid page, making the range >= start and < end.
			 * If we're also doing a speculative memory test and
			 * we're at or past the end, bump up Maxmem so that
			 * we keep going.  The first bad page will terminate
			 * the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ENTRIES) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == PHYS_AVAIL_ENTRIES) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}

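/*
 * Locate the preloaded module metadata left by the loader, relocate the
 * pointers by KERNBASE, and pull out the pieces hammer_time() needs:
 * boothowto, the static kenv, the debugger symbol table and the EFI
 * system table address.  Returns the "elf kernel" module pointer for
 * further metadata lookups.
 */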
static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end, 0);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

static void
native_clock_source_init(void)
{
	i8254_init();
}

static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/* Set up the fast syscall stuff */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
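	/*
	 * MSR_STAR: bits 47:32 supply the SYSCALL kernel selectors (%cs =
	 * value, %ss = value + 8); bits 63:48 seed SYSRET's user selectors,
	 * which is why the 32-bit user code selector goes there (32-bit
	 * %cs = value, %ss = value + 8, 64-bit %cs = value + 16).
	 */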
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}

void
amd64_bsp_pcpu_init1(struct pcpu *pc)
{
	struct user_segment_descriptor *gdt;

	PCPU_SET(prvspace, pc);
	gdt = *PCPU_PTR(gdt);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(tssp, PCPU_PTR(common_tss));
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
	PCPU_SET(smp_tlb_gen, 1);
}

void
amd64_bsp_pcpu_init2(uint64_t rsp0)
{

	PCPU_SET(rsp0, rsp0);
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);
}

void
amd64_bsp_ist_init(struct pcpu *pc)
{
	struct nmi_pcpu *np;
	struct amd64tss *tssp;

	tssp = &pc->pc_common_tss;

	/* doublefault stack space, runs on ist1 */
	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist1 = (long)np;

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist2 = (long)np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist3 = (long)np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist4 = (long)np;
}

/*
 * Calculate the kernel load address by inspecting the page table created
 * by the loader.
 * The assumptions:
 * - kernel is mapped at KERNBASE, backed by contiguous phys memory
 *   aligned at 2M, below 4G (the latter is important for AP startup)
 * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M)
 * - kernel is mapped with 2M superpages
 * - all participating memory, i.e. kernel, modules, metadata,
 *   page table is accessible by pre-created 1:1 mapping
 *   (right now loader creates 1:1 mapping for lower 4G, and all
 *   memory is from there)
 * - there is a usable memory block right after the end of the
 *   mapped kernel and all modules/metadata, pointed to by
 *   physfree, for early allocations
 */
vm_paddr_t __nosanitizeaddress __nosanitizememory
amd64_loadaddr(void)
{
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	uint64_t cr3;

	cr3 = rcr3();
	pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART);
	pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART);
	pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART);
	return (*pde & PG_FRAME);
}

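/*
 * hammer_time() is the first C code run on the BSP, called from the
 * locore entry point with the loader's module pointer and the first free
 * physical address.  It bootstraps descriptors, per-CPU data, the IDT,
 * pmap and the console, and returns the top of thread0's kernel stack
 * for locore to switch onto.
 */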
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	uint64_t rsp0;
	char *env;
	struct user_segment_descriptor *gdt;
	struct region_descriptor r_gdt;
	size_t kstack0_sz;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	kernphys = amd64_loadaddr();

	physfree += kernphys;

	kmdp = init_ops.parse_preload_data(modulep);

	efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
	    MODINFOMD_EFI_MAP) != NULL;

	if (!efi_boot) {
		/* Tell the bios to warmboot next time */
		atomic_store_short((u_short *)0x472, 0x1234);
	}

	physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	identify_cpu_fixup_bsp();
	identify_cpu2();
	initializecpucache();

	/*
	 * Check for pti, pcid, and invpcid before ifuncs are
	 * resolved, to correctly select the implementation for
	 * pmap_activate_sw_mode().
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		invpcid_works = (cpu_stdext_feature &
		    CPUID_STDEXT_INVPCID) != 0;
	} else {
		pmap_pcid_enabled = 0;
	}

	/*
	 * Now we can do small core initialization, after the PCID
	 * CPU features and user knobs are evaluated.
	 */
	TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
	    &pmap_pcid_invlpg_workaround_uena);
	cpu_init_small_core();

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	thread0.td_kstack = physfree - kernphys + KERNSTART;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * Initialize enough of thread0 for delayed invalidation to
	 * work very early.  Rely on thread0.td_base_pri
	 * zero-initialization, it is reset to PVM at proc0_init().
	 */
	pmap_thread_init_invl_gen(&thread0);

	pc = &temp_bsp_pcpu;
	pcpu_init(pc, 0, sizeof(struct pcpu));
	gdt = &temp_bsp_pcpu.pc_gdt[0];

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long)gdt;
	lgdt(&r_gdt);

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
	physfree += DPCPU_SIZE;
	amd64_bsp_pcpu_init1(pc);
	/* Non-late cninit() and printf() can be moved up to here. */

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long)idt;
	lidt(&r_idt);

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have been updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (efi_boot)
		vty_set_preferred(VTY_VT);

	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);

	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);

	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
	    &syscall_ret_l1d_flush_mode);

	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);

	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);

	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
	    &x86_rngds_mitg_enable);

	finishidentcpu();	/* Final stage of CPU initialization */

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	initializecpu();	/* Initialize CPU registers */

	amd64_bsp_ist_init(pc);

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
	    IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * We initialize the PCB pointer early so that exception
	 * handlers will work.  Also set up td_critnest to short-cut
	 * the page fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	set_top_of_stack_td(&thread0);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

#ifdef DEV_PCI
	/* This call might adjust phys_avail[]. */
	pci_early_quirks();
#endif

	if (late_console)
		cninit();

	/*
	 * Dump the boot metadata. We have to wait for cninit() since console
	 * output is required. If it's grossly incorrect the kernel will never
	 * make it this far.
	 */
	if (getenv_is_true("debug.dump_modinfo_at_boot"))
		preload_dump();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?"
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = thread0.td_md.md_stack_base;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
	amd64_bsp_pcpu_init2(rsp0);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	kcsan_cpu_init(0);

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	kasan_init();
	kmsan_init();

	TSEXIT();

	/* Location of kernel stack for locore */
	return (thread0.td_md.md_stack_base);
}

void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader. */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr",
    "Raw BIOS SMAP data");

static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header",
    "Raw EFI Memory Map");

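/*
 * spinlock_enter() disables interrupts on the first acquisition, saving
 * the prior flags in the thread's MD area, and counts recursive
 * acquisitions; spinlock_exit() restores the saved flags only when the
 * outermost spinlock section is released.
 */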
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
		critical_enter();
	} else
		td->td_md.md_spinlock_count++;
}

void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0) {
		critical_exit();
		intr_restore(flags);
	}
}

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
}

/*
 * pcb_flags is only modified by the current thread, or by other threads
 * when the current thread is stopped.  However, the current thread may
 * change it from the interrupt context in cpu_switch(), or in the trap
 * handler.  When we read-modify-write pcb_flags from C sources, the
 * compiler may generate code that is not atomic with respect to the
 * interrupt handler.  If a trap or interrupt happens and any flag is
 * modified from the handler, it can be clobbered with the cached value
 * later.  Therefore, we implement setting and clearing flags with
 * single-instruction functions, which do not race with possible
 * modification of the flags from the trap or interrupt context, because
 * traps and interrupts are executed only on instruction boundaries.
 */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("orl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

/*
 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
 * base requires that the kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
 * the pcb if user space modified the bases.  We must save them on the
 * context switch or if the return to usermode happens through doreti.
 *
 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 * which has the consequence that the base MSRs must be saved each time
 * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
 * context switches.
 */
static void
set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
{
	register_t r;

	if (curpcb == pcb &&
	    (flags & PCB_FULL_IRET) != 0 &&
	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
		r = intr_disable();
		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
			if (rfs() == _ufssel)
				pcb->pcb_fsbase = rdfsbase();
			if (rgs() == _ugssel)
				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
		}
		set_pcb_flags_raw(pcb, flags);
		intr_restore(r);
	} else {
		set_pcb_flags_raw(pcb, flags);
	}
}

DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
}

void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

#ifdef KDB

/*
 * Provide inb() and outb() as functions.  They are normally only available as
 * inline functions, thus cannot be called from the debugger.
 */

/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);

u_char
inb_(u_short port)
{
	return inb(port);
}

void
outb_(u_short port, u_char data)
{
	outb(port, data);
}

#endif /* KDB */

#undef memset
#undef memmove
#undef memcpy

void	*memset_std(void *buf, int c, size_t len);
void	*memset_erms(void *buf, int c, size_t len);
void	*memmove_std(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);

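/*
 * memset/memmove/memcpy are normally resolved via ifuncs (below) that
 * select the REP MOVSB/STOSB ("_erms") implementations when the CPU
 * advertises Enhanced REP MOVSB/STOSB (CPUID_STDEXT_ERMS in
 * CPUID.(EAX=7,ECX=0):EBX), falling back to the "_std" variants
 * otherwise.
 */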
#ifdef KCSAN
/*
 * These fail to build as ifuncs when used with KCSAN.
 */
void *
memset(void *buf, int c, size_t len)
{

	return (memset_std(buf, c, len));
}

void *
memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memmove_std(dst, src, len));
}

void *
memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memcpy_std(dst, src, len));
}
#else
DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memset_erms : memset_std);
}

DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
    size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memmove_erms : memmove_std);
}

DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,
    size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memcpy_erms : memcpy_std);
}
#endif

void	pagezero_std(void *addr);
void	pagezero_erms(void *addr);
DEFINE_IFUNC(, void, pagezero, (void *))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    pagezero_erms : pagezero_std);
}