xref: /freebsd/sys/amd64/amd64/machdep.c (revision 315ee00f)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 2003 Peter Wemm.
5  * Copyright (c) 1992 Terrence R. Lambert.
6  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
7  * All rights reserved.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * William Jolitz.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. All advertising materials mentioning features or use of this software
21  *    must display the following acknowledgement:
22  *	This product includes software developed by the University of
23  *	California, Berkeley and its contributors.
24  * 4. Neither the name of the University nor the names of its contributors
25  *    may be used to endorse or promote products derived from this software
26  *    without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  *
40  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
41  */
42 
43 #include <sys/cdefs.h>
44 #include "opt_atpic.h"
45 #include "opt_cpu.h"
46 #include "opt_ddb.h"
47 #include "opt_inet.h"
48 #include "opt_isa.h"
49 #include "opt_kstack_pages.h"
50 #include "opt_maxmem.h"
51 #include "opt_pci.h"
52 #include "opt_platform.h"
53 #include "opt_sched.h"
54 
55 #include <sys/param.h>
56 #include <sys/proc.h>
57 #include <sys/systm.h>
58 #include <sys/asan.h>
59 #include <sys/bio.h>
60 #include <sys/buf.h>
61 #include <sys/bus.h>
62 #include <sys/callout.h>
63 #include <sys/cons.h>
64 #include <sys/cpu.h>
65 #include <sys/csan.h>
66 #include <sys/efi.h>
67 #include <sys/eventhandler.h>
68 #include <sys/exec.h>
69 #include <sys/imgact.h>
70 #include <sys/kdb.h>
71 #include <sys/kernel.h>
72 #include <sys/ktr.h>
73 #include <sys/linker.h>
74 #include <sys/lock.h>
75 #include <sys/malloc.h>
76 #include <sys/memrange.h>
77 #include <sys/msan.h>
78 #include <sys/msgbuf.h>
79 #include <sys/mutex.h>
80 #include <sys/pcpu.h>
81 #include <sys/ptrace.h>
82 #include <sys/reboot.h>
83 #include <sys/reg.h>
84 #include <sys/rwlock.h>
85 #include <sys/sched.h>
86 #include <sys/signalvar.h>
87 #ifdef SMP
88 #include <sys/smp.h>
89 #endif
90 #include <sys/syscallsubr.h>
91 #include <sys/sysctl.h>
92 #include <sys/sysent.h>
93 #include <sys/sysproto.h>
94 #include <sys/ucontext.h>
95 #include <sys/vmmeter.h>
96 
97 #include <vm/vm.h>
98 #include <vm/vm_param.h>
99 #include <vm/vm_extern.h>
100 #include <vm/vm_kern.h>
101 #include <vm/vm_page.h>
102 #include <vm/vm_map.h>
103 #include <vm/vm_object.h>
104 #include <vm/vm_pager.h>
105 #include <vm/vm_phys.h>
106 #include <vm/vm_dumpset.h>
107 
108 #ifdef DDB
109 #ifndef KDB
110 #error KDB must be enabled in order for DDB to work!
111 #endif
112 #include <ddb/ddb.h>
113 #include <ddb/db_sym.h>
114 #endif
115 
116 #include <net/netisr.h>
117 
118 #include <dev/smbios/smbios.h>
119 
120 #include <machine/clock.h>
121 #include <machine/cpu.h>
122 #include <machine/cputypes.h>
123 #include <machine/frame.h>
124 #include <machine/intr_machdep.h>
125 #include <x86/mca.h>
126 #include <machine/md_var.h>
127 #include <machine/metadata.h>
128 #include <machine/pc/bios.h>
129 #include <machine/pcb.h>
130 #include <machine/proc.h>
131 #include <machine/sigframe.h>
132 #include <machine/specialreg.h>
133 #include <machine/trap.h>
134 #include <machine/tss.h>
135 #include <x86/ucode.h>
136 #include <x86/ifunc.h>
137 #ifdef SMP
138 #include <machine/smp.h>
139 #endif
140 #ifdef FDT
141 #include <x86/fdt.h>
142 #endif
143 
144 #ifdef DEV_ATPIC
145 #include <x86/isa/icu.h>
146 #else
147 #include <x86/apicvar.h>
148 #endif
149 
150 #include <isa/isareg.h>
151 #include <isa/rtc.h>
152 #include <x86/init.h>
153 
154 /* Sanity check for __curthread() */
155 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
156 
157 /*
158  * The PTI trampoline stack needs enough space for a hardware trapframe and a
159  * couple of scratch registers, as well as the trapframe left behind after an
160  * iret fault.
161  */
162 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
163     offsetof(struct pti_frame, pti_rip));
164 
165 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
166 
167 static void cpu_startup(void *);
168 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
169 
170 /* Probe 8254 PIT and TSC. */
171 static void native_clock_source_init(void);
172 
173 /* Preload data parse function */
174 static caddr_t native_parse_preload_data(u_int64_t);
175 
176 /* Native function to fetch and parse the e820 map */
177 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
178 
179 /* Default init_ops implementation. */
180 struct init_ops init_ops = {
181 	.parse_preload_data =		native_parse_preload_data,
182 	.early_clock_source_init =	native_clock_source_init,
183 	.early_delay =			i8254_delay,
184 	.parse_memmap =			native_parse_memmap,
185 };
186 
187 /*
188  * Physical address of the EFI System Table. Stashed from the metadata hints
189  * passed into the kernel and used by the EFI code to call runtime services.
190  */
191 vm_paddr_t efi_systbl_phys;
192 
193 /* Intel ICH registers */
194 #define ICH_PMBASE	0x400
195 #define ICH_SMI_EN	(ICH_PMBASE + 0x30)
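/*
 * Bit 3 (0x8) of SMI_EN is assumed to be the LEGACY_USB_EN bit that
 * cpu_startup() clears on the MacBook models listed below.
 */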
196 
197 int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
198 
199 int cold = 1;
200 
201 long Maxmem = 0;
202 long realmem = 0;
203 int late_console = 1;
204 
205 struct kva_md_info kmi;
206 
207 struct region_descriptor r_idt;
208 
209 struct pcpu *__pcpu;
210 struct pcpu temp_bsp_pcpu;
211 
212 struct mtx icu_lock;
213 
214 struct mem_range_softc mem_range_softc;
215 
216 struct mtx dt_lock;	/* lock for GDT and LDT */
217 
218 void (*vmm_resume_p)(void);
219 
220 bool efi_boot;
221 
222 static void
223 cpu_startup(void *dummy)
224 {
225 	uintmax_t memsize;
226 	char *sysenv;
227 
228 	/*
229 	 * On MacBooks, we need to prevent the legacy USB circuit from
230 	 * generating an SMI# because this can cause several problems,
231 	 * namely: incorrect CPU frequency detection and failure to
232 	 * start the APs.
233 	 * We do this by disabling a bit in the SMI_EN (SMI Control and
234 	 * Enable register) of the Intel ICH LPC Interface Bridge.
235 	 */
236 	sysenv = kern_getenv("smbios.system.product");
237 	if (sysenv != NULL) {
238 		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
239 		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
240 		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
241 		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
242 		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
243 		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
244 		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
245 		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
246 			if (bootverbose)
247 				printf("Disabling LEGACY_USB_EN bit on "
248 				    "Intel ICH.\n");
249 			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
250 		}
251 		freeenv(sysenv);
252 	}
253 
254 	/*
255 	 * Good {morning,afternoon,evening,night}.
256 	 */
257 	startrtclock();
258 	printcpuinfo();
259 
260 	/*
261 	 * Display physical memory if SMBIOS reports reasonable amount.
262 	 */
263 	memsize = 0;
264 	sysenv = kern_getenv("smbios.memory.enabled");
265 	if (sysenv != NULL) {
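		/* The value is presumably reported in kilobytes; shift by 10 to get bytes. */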
266 		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
267 		freeenv(sysenv);
268 	}
269 	if (memsize < ptoa((uintmax_t)vm_free_count()))
270 		memsize = ptoa((uintmax_t)Maxmem);
271 	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
272 	realmem = atop(memsize);
273 
274 	/*
275 	 * Display any holes after the first chunk of extended memory.
276 	 */
277 	if (bootverbose) {
278 		int indx;
279 
280 		printf("Physical memory chunk(s):\n");
281 		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
282 			vm_paddr_t size;
283 
284 			size = phys_avail[indx + 1] - phys_avail[indx];
285 			printf(
286 			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
287 			    (uintmax_t)phys_avail[indx],
288 			    (uintmax_t)phys_avail[indx + 1] - 1,
289 			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
290 		}
291 	}
292 
293 	vm_ksubmap_init(&kmi);
294 
295 	printf("avail memory = %ju (%ju MB)\n",
296 	    ptoa((uintmax_t)vm_free_count()),
297 	    ptoa((uintmax_t)vm_free_count()) / 1048576);
298 #ifdef DEV_PCI
299 	if (bootverbose && intel_graphics_stolen_base != 0)
300 		printf("intel stolen mem: base %#jx size %ju MB\n",
301 		    (uintmax_t)intel_graphics_stolen_base,
302 		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
303 #endif
304 
305 	/*
306 	 * Set up buffers, so they can be used to read disk labels.
307 	 */
308 	bufinit();
309 	vm_pager_bufferinit();
310 
311 	cpu_setregs();
312 }
313 
314 static void
315 late_ifunc_resolve(void *dummy __unused)
316 {
317 	link_elf_late_ireloc();
318 }
319 SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);
320 
321 
322 void
323 cpu_setregs(void)
324 {
325 	register_t cr0;
326 
327 	TSENTER();
328 	cr0 = rcr0();
329 	/*
330 	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
331 	 * BSP.  See the comments there about why we set them.
332 	 */
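	/*
	 * CR0_WP makes supervisor-mode writes honor read-only page
	 * protections; CR0_AM allows EFLAGS.AC-controlled alignment
	 * checking in user mode.
	 */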
333 	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
334 	TSENTER2("load_cr0");
335 	load_cr0(cr0);
336 	TSEXIT2("load_cr0");
337 	TSEXIT();
338 }
339 
340 /*
341  * Initialize amd64 and configure to run kernel
342  */
343 
344 /*
345  * Initialize segments & interrupt table
346  */
347 static struct gate_descriptor idt0[NIDT];
348 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
349 
350 static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
351 static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
352 static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
353 static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
354 CTASSERT(sizeof(struct nmi_pcpu) == 16);
355 
356 /*
357  * Software prototypes -- in more palatable form.
358  *
359  * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
360  * slots as the corresponding segments for the i386 kernel.
361  */
362 struct soft_segment_descriptor gdt_segs[] = {
363 /* GNULL_SEL	0 Null Descriptor */
364 {	.ssd_base = 0x0,
365 	.ssd_limit = 0x0,
366 	.ssd_type = 0,
367 	.ssd_dpl = 0,
368 	.ssd_p = 0,
369 	.ssd_long = 0,
370 	.ssd_def32 = 0,
371 	.ssd_gran = 0		},
372 /* GNULL2_SEL	1 Null Descriptor */
373 {	.ssd_base = 0x0,
374 	.ssd_limit = 0x0,
375 	.ssd_type = 0,
376 	.ssd_dpl = 0,
377 	.ssd_p = 0,
378 	.ssd_long = 0,
379 	.ssd_def32 = 0,
380 	.ssd_gran = 0		},
381 /* GUFS32_SEL	2 32 bit %fs Descriptor for user */
382 {	.ssd_base = 0x0,
383 	.ssd_limit = 0xfffff,
384 	.ssd_type = SDT_MEMRWA,
385 	.ssd_dpl = SEL_UPL,
386 	.ssd_p = 1,
387 	.ssd_long = 0,
388 	.ssd_def32 = 1,
389 	.ssd_gran = 1		},
390 /* GUGS32_SEL	3 32 bit %gs Descriptor for user */
391 {	.ssd_base = 0x0,
392 	.ssd_limit = 0xfffff,
393 	.ssd_type = SDT_MEMRWA,
394 	.ssd_dpl = SEL_UPL,
395 	.ssd_p = 1,
396 	.ssd_long = 0,
397 	.ssd_def32 = 1,
398 	.ssd_gran = 1		},
399 /* GCODE_SEL	4 Code Descriptor for kernel */
400 {	.ssd_base = 0x0,
401 	.ssd_limit = 0xfffff,
402 	.ssd_type = SDT_MEMERA,
403 	.ssd_dpl = SEL_KPL,
404 	.ssd_p = 1,
405 	.ssd_long = 1,
406 	.ssd_def32 = 0,
407 	.ssd_gran = 1		},
408 /* GDATA_SEL	5 Data Descriptor for kernel */
409 {	.ssd_base = 0x0,
410 	.ssd_limit = 0xfffff,
411 	.ssd_type = SDT_MEMRWA,
412 	.ssd_dpl = SEL_KPL,
413 	.ssd_p = 1,
414 	.ssd_long = 1,
415 	.ssd_def32 = 0,
416 	.ssd_gran = 1		},
417 /* GUCODE32_SEL	6 32 bit Code Descriptor for user */
418 {	.ssd_base = 0x0,
419 	.ssd_limit = 0xfffff,
420 	.ssd_type = SDT_MEMERA,
421 	.ssd_dpl = SEL_UPL,
422 	.ssd_p = 1,
423 	.ssd_long = 0,
424 	.ssd_def32 = 1,
425 	.ssd_gran = 1		},
426 /* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
427 {	.ssd_base = 0x0,
428 	.ssd_limit = 0xfffff,
429 	.ssd_type = SDT_MEMRWA,
430 	.ssd_dpl = SEL_UPL,
431 	.ssd_p = 1,
432 	.ssd_long = 0,
433 	.ssd_def32 = 1,
434 	.ssd_gran = 1		},
435 /* GUCODE_SEL	8 64 bit Code Descriptor for user */
436 {	.ssd_base = 0x0,
437 	.ssd_limit = 0xfffff,
438 	.ssd_type = SDT_MEMERA,
439 	.ssd_dpl = SEL_UPL,
440 	.ssd_p = 1,
441 	.ssd_long = 1,
442 	.ssd_def32 = 0,
443 	.ssd_gran = 1		},
444 /* GPROC0_SEL	9 Proc 0 Tss Descriptor */
445 {	.ssd_base = 0x0,
446 	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
447 	.ssd_type = SDT_SYSTSS,
448 	.ssd_dpl = SEL_KPL,
449 	.ssd_p = 1,
450 	.ssd_long = 0,
451 	.ssd_def32 = 0,
452 	.ssd_gran = 0		},
453 /* GPROC0_SEL+1	10 upper half: the TSS is a system descriptor of double size */
454 {	.ssd_base = 0x0,
455 	.ssd_limit = 0x0,
456 	.ssd_type = 0,
457 	.ssd_dpl = 0,
458 	.ssd_p = 0,
459 	.ssd_long = 0,
460 	.ssd_def32 = 0,
461 	.ssd_gran = 0		},
462 /* GUSERLDT_SEL	11 LDT Descriptor */
463 {	.ssd_base = 0x0,
464 	.ssd_limit = 0x0,
465 	.ssd_type = 0,
466 	.ssd_dpl = 0,
467 	.ssd_p = 0,
468 	.ssd_long = 0,
469 	.ssd_def32 = 0,
470 	.ssd_gran = 0		},
471 /* GUSERLDT_SEL	12 LDT Descriptor, double size */
472 {	.ssd_base = 0x0,
473 	.ssd_limit = 0x0,
474 	.ssd_type = 0,
475 	.ssd_dpl = 0,
476 	.ssd_p = 0,
477 	.ssd_long = 0,
478 	.ssd_def32 = 0,
479 	.ssd_gran = 0		},
480 };
481 _Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");
482 
483 void
484 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
485 {
486 	struct gate_descriptor *ip;
487 
488 	ip = idt + idx;
489 	ip->gd_looffset = (uintptr_t)func;
490 	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
491 	ip->gd_ist = ist;
492 	ip->gd_xx = 0;
493 	ip->gd_type = typ;
494 	ip->gd_dpl = dpl;
495 	ip->gd_p = 1;
496 	ip->gd_hioffset = ((uintptr_t)func) >> 16;
497 }
498 
499 extern inthand_t
500 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
501 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
502 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
503 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
504 	IDTVEC(xmm), IDTVEC(dblfault),
505 	IDTVEC(div_pti), IDTVEC(bpt_pti),
506 	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
507 	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
508 	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
509 	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
510 	IDTVEC(xmm_pti),
511 #ifdef KDTRACE_HOOKS
512 	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
513 #endif
514 #ifdef XENHVM
515 	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
516 #endif
517 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
518 	IDTVEC(fast_syscall_pti);
519 
520 #ifdef DDB
521 /*
522  * Display the index and function name of any IDT entries that don't use
523  * the default 'rsvd' entry point.
524  */
525 DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE)
526 {
527 	struct gate_descriptor *ip;
528 	int idx;
529 	uintptr_t func;
530 
531 	ip = idt;
532 	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
533 		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
534 		if (func != (uintptr_t)&IDTVEC(rsvd)) {
535 			db_printf("%3d\t", idx);
536 			db_printsym(func, DB_STGY_PROC);
537 			db_printf("\n");
538 		}
539 		ip++;
540 	}
541 }
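/*
 * Example usage, assuming the standard ddb "show" syntax:
 *
 *	db> show idt
 *
 * prints one line per vector whose handler is not the default 'rsvd'
 * entry point installed by hammer_time() below.
 */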
542 
543 /* Show privileged registers. */
544 DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE)
545 {
546 	struct {
547 		uint16_t limit;
548 		uint64_t base;
549 	} __packed idtr, gdtr;
550 	uint16_t ldt, tr;
551 
552 	__asm __volatile("sidt %0" : "=m" (idtr));
553 	db_printf("idtr\t0x%016lx/%04x\n",
554 	    (u_long)idtr.base, (u_int)idtr.limit);
555 	__asm __volatile("sgdt %0" : "=m" (gdtr));
556 	db_printf("gdtr\t0x%016lx/%04x\n",
557 	    (u_long)gdtr.base, (u_int)gdtr.limit);
558 	__asm __volatile("sldt %0" : "=r" (ldt));
559 	db_printf("ldtr\t0x%04x\n", ldt);
560 	__asm __volatile("str %0" : "=r" (tr));
561 	db_printf("tr\t0x%04x\n", tr);
562 	db_printf("cr0\t0x%016lx\n", rcr0());
563 	db_printf("cr2\t0x%016lx\n", rcr2());
564 	db_printf("cr3\t0x%016lx\n", rcr3());
565 	db_printf("cr4\t0x%016lx\n", rcr4());
566 	if (rcr4() & CR4_XSAVE)
567 		db_printf("xcr0\t0x%016lx\n", rxcr(0));
568 	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
569 	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
570 		db_printf("FEATURES_CTL\t%016lx\n",
571 		    rdmsr(MSR_IA32_FEATURE_CONTROL));
572 	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
573 	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
574 	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
575 }
576 
577 DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE)
578 {
579 
580 	db_printf("dr0\t0x%016lx\n", rdr0());
581 	db_printf("dr1\t0x%016lx\n", rdr1());
582 	db_printf("dr2\t0x%016lx\n", rdr2());
583 	db_printf("dr3\t0x%016lx\n", rdr3());
584 	db_printf("dr6\t0x%016lx\n", rdr6());
585 	db_printf("dr7\t0x%016lx\n", rdr7());
586 }
587 #endif
588 
589 void
590 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
591 {
592 
593 	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
594 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
595 	ssd->ssd_type  = sd->sd_type;
596 	ssd->ssd_dpl   = sd->sd_dpl;
597 	ssd->ssd_p     = sd->sd_p;
598 	ssd->ssd_long  = sd->sd_long;
599 	ssd->ssd_def32 = sd->sd_def32;
600 	ssd->ssd_gran  = sd->sd_gran;
601 }
602 
603 void
604 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
605 {
606 
607 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
608 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
609 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
610 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
611 	sd->sd_type  = ssd->ssd_type;
612 	sd->sd_dpl   = ssd->ssd_dpl;
613 	sd->sd_p     = ssd->ssd_p;
614 	sd->sd_long  = ssd->ssd_long;
615 	sd->sd_def32 = ssd->ssd_def32;
616 	sd->sd_gran  = ssd->ssd_gran;
617 }
618 
619 void
620 ssdtosyssd(struct soft_segment_descriptor *ssd, struct system_segment_descriptor *sd)
621 {
622 
623 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
624 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
625 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
626 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
627 	sd->sd_type  = ssd->ssd_type;
628 	sd->sd_dpl   = ssd->ssd_dpl;
629 	sd->sd_p     = ssd->ssd_p;
630 	sd->sd_gran  = ssd->ssd_gran;
631 }
632 
633 u_int basemem;
634 
635 static int
636 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
637     int *physmap_idxp)
638 {
639 	int i, insert_idx, physmap_idx;
640 
641 	physmap_idx = *physmap_idxp;
642 
643 	if (length == 0)
644 		return (1);
645 
646 	/*
647 	 * Find insertion point while checking for overlap.  Start off by
648 	 * assuming the new entry will be added to the end.
649 	 *
650 	 * NB: physmap_idx points to the next free slot.
651 	 */
652 	insert_idx = physmap_idx;
653 	for (i = 0; i <= physmap_idx; i += 2) {
654 		if (base < physmap[i + 1]) {
655 			if (base + length <= physmap[i]) {
656 				insert_idx = i;
657 				break;
658 			}
659 			if (boothowto & RB_VERBOSE)
660 				printf(
661 		    "Overlapping memory regions, ignoring second region\n");
662 			return (1);
663 		}
664 	}
665 
666 	/* See if we can prepend to the next entry. */
667 	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
668 		physmap[insert_idx] = base;
669 		return (1);
670 	}
671 
672 	/* See if we can append to the previous entry. */
673 	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
674 		physmap[insert_idx - 1] += length;
675 		return (1);
676 	}
677 
678 	physmap_idx += 2;
679 	*physmap_idxp = physmap_idx;
680 	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
681 		printf(
682 		"Too many segments in the physical address map, giving up\n");
683 		return (0);
684 	}
685 
686 	/*
687 	 * Move the last 'N' entries down to make room for the new
688 	 * entry if needed.
689 	 */
690 	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
691 		physmap[i] = physmap[i - 2];
692 		physmap[i + 1] = physmap[i - 1];
693 	}
694 
695 	/* Insert the new entry. */
696 	physmap[insert_idx] = base;
697 	physmap[insert_idx + 1] = base + length;
698 	return (1);
699 }
700 
701 void
702 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
703                       vm_paddr_t *physmap, int *physmap_idx)
704 {
705 	struct bios_smap *smap, *smapend;
706 
707 	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
708 
709 	for (smap = smapbase; smap < smapend; smap++) {
710 		if (boothowto & RB_VERBOSE)
711 			printf("SMAP type=%02x base=%016lx len=%016lx\n",
712 			    smap->type, smap->base, smap->length);
713 
714 		if (smap->type != SMAP_TYPE_MEMORY)
715 			continue;
716 
717 		if (!add_physmap_entry(smap->base, smap->length, physmap,
718 		    physmap_idx))
719 			break;
720 	}
721 }
722 
723 static void
724 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
725     int *physmap_idx)
726 {
727 	struct efi_md *map, *p;
728 	const char *type;
729 	size_t efisz;
730 	int ndesc, i;
731 
732 	static const char *types[] = {
733 		"Reserved",
734 		"LoaderCode",
735 		"LoaderData",
736 		"BootServicesCode",
737 		"BootServicesData",
738 		"RuntimeServicesCode",
739 		"RuntimeServicesData",
740 		"ConventionalMemory",
741 		"UnusableMemory",
742 		"ACPIReclaimMemory",
743 		"ACPIMemoryNVS",
744 		"MemoryMappedIO",
745 		"MemoryMappedIOPortSpace",
746 		"PalCode",
747 		"PersistentMemory"
748 	};
749 
750 	/*
751 	 * Memory map data provided by UEFI via the GetMemoryMap
752 	 * Boot Services API.
753 	 */
754 	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
755 	map = (struct efi_md *)((uint8_t *)efihdr + efisz);
756 
757 	if (efihdr->descriptor_size == 0)
758 		return;
759 	ndesc = efihdr->memory_size / efihdr->descriptor_size;
760 
761 	if (boothowto & RB_VERBOSE)
762 		printf("%23s %12s %12s %8s %4s\n",
763 		    "Type", "Physical", "Virtual", "#Pages", "Attr");
764 
765 	for (i = 0, p = map; i < ndesc; i++,
766 	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
767 		if (boothowto & RB_VERBOSE) {
768 			if (p->md_type < nitems(types))
769 				type = types[p->md_type];
770 			else
771 				type = "<INVALID>";
772 			printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
773 			    p->md_virt, p->md_pages);
774 			if (p->md_attr & EFI_MD_ATTR_UC)
775 				printf("UC ");
776 			if (p->md_attr & EFI_MD_ATTR_WC)
777 				printf("WC ");
778 			if (p->md_attr & EFI_MD_ATTR_WT)
779 				printf("WT ");
780 			if (p->md_attr & EFI_MD_ATTR_WB)
781 				printf("WB ");
782 			if (p->md_attr & EFI_MD_ATTR_UCE)
783 				printf("UCE ");
784 			if (p->md_attr & EFI_MD_ATTR_WP)
785 				printf("WP ");
786 			if (p->md_attr & EFI_MD_ATTR_RP)
787 				printf("RP ");
788 			if (p->md_attr & EFI_MD_ATTR_XP)
789 				printf("XP ");
790 			if (p->md_attr & EFI_MD_ATTR_NV)
791 				printf("NV ");
792 			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
793 				printf("MORE_RELIABLE ");
794 			if (p->md_attr & EFI_MD_ATTR_RO)
795 				printf("RO ");
796 			if (p->md_attr & EFI_MD_ATTR_RT)
797 				printf("RUNTIME");
798 			printf("\n");
799 		}
800 
801 		switch (p->md_type) {
802 		case EFI_MD_TYPE_CODE:
803 		case EFI_MD_TYPE_DATA:
804 		case EFI_MD_TYPE_BS_CODE:
805 		case EFI_MD_TYPE_BS_DATA:
806 		case EFI_MD_TYPE_FREE:
807 			/*
808 			 * We're allowed to use any entry with these types.
809 			 */
810 			break;
811 		default:
812 			continue;
813 		}
814 
815 		if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE,
816 		    physmap, physmap_idx))
817 			break;
818 	}
819 }
820 
821 static void
822 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
823 {
824 	struct bios_smap *smap;
825 	struct efi_map_header *efihdr;
826 	u_int32_t size;
827 
828 	/*
829 	 * Memory map from INT 15:E820.
830 	 *
831 	 * subr_module.c says:
832 	 * "Consumer may safely assume that size value precedes data."
833 	 * i.e., a u_int32_t size immediately precedes smap.
834 	 */
835 
836 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
837 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
838 	smap = (struct bios_smap *)preload_search_info(kmdp,
839 	    MODINFO_METADATA | MODINFOMD_SMAP);
840 	if (efihdr == NULL && smap == NULL)
841 		panic("No BIOS smap or EFI map info from loader!");
842 
843 	if (efihdr != NULL) {
844 		add_efi_map_entries(efihdr, physmap, physmap_idx);
845 		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
846 	} else {
847 		size = *((u_int32_t *)smap - 1);
848 		bios_add_smap_entries(smap, size, physmap, physmap_idx);
849 		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
850 	}
851 }
852 
853 #define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)
854 
855 /*
856  * Populate the (physmap) array with base/bound pairs describing the
857  * available physical memory in the system, then test this memory and
858  * build the phys_avail array describing the actually-available memory.
859  *
860  * Total memory size may be set by the kernel environment variable
861  * hw.physmem or the compile-time define MAXMEM.
862  *
863  * XXX first should be vm_paddr_t.
864  */
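/*
 * For illustration only (addresses are made up), after parsing a typical
 * BIOS e820 map the physmap[] array holds [base, end) byte pairs such as:
 *
 *	physmap[0] = 0x0000000000001000   physmap[1] = 0x000000000009fc00
 *	physmap[2] = 0x0000000000100000   physmap[3] = 0x000000007fe00000
 *
 * i.e. two usable ranges; getmemsize() below subtracts 2 from the returned
 * physmap_idx so that it indexes the base of the last pair.
 */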
865 static void
866 getmemsize(caddr_t kmdp, u_int64_t first)
867 {
868 	int i, physmap_idx, pa_indx, da_indx;
869 	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
870 	u_long physmem_start, physmem_tunable, memtest;
871 	pt_entry_t *pte;
872 	quad_t dcons_addr, dcons_size;
873 	int page_counter;
874 
875 	TSENTER();
876 	/*
877 	 * Tell the physical memory allocator about pages used to store
878 	 * the kernel and preloaded data.  See kmem_bootstrap_free().
879 	 */
880 	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));
881 
882 	bzero(physmap, sizeof(physmap));
883 	physmap_idx = 0;
884 
885 	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
886 	physmap_idx -= 2;
887 
888 	/*
889 	 * Find the 'base memory' segment for SMP
890 	 */
891 	basemem = 0;
892 	for (i = 0; i <= physmap_idx; i += 2) {
893 		if (physmap[i] <= 0xA0000) {
894 			basemem = physmap[i + 1] / 1024;
895 			break;
896 		}
897 	}
898 	if (basemem == 0 || basemem > 640) {
899 		if (bootverbose)
900 			printf(
901 		"Memory map doesn't contain a basemem segment, faking it\n");
902 		basemem = 640;
903 	}
904 
905 	/*
906 	 * Maxmem isn't the "maximum memory", it's one larger than the
907 	 * highest page of the physical address space.  It should be
908 	 * called something like "Maxphyspage".  We may adjust this
909 	 * based on ``hw.physmem'' and the results of the memory test.
910 	 */
911 	Maxmem = atop(physmap[physmap_idx + 1]);
912 
913 #ifdef MAXMEM
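	/*
	 * MAXMEM is specified in kilobytes; dividing by 4 converts it to
	 * 4 kB pages (compare the "Maxmem * 4" printf below).
	 */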
914 	Maxmem = MAXMEM / 4;
915 #endif
916 
917 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
918 		Maxmem = atop(physmem_tunable);
919 
920 	/*
921 	 * The boot memory test is disabled by default, as it takes a
922 	 * significant amount of time on large-memory systems, and is
923 	 * unfriendly to virtual machines as it unnecessarily touches all
924 	 * pages.
925 	 *
926 	 * A general name is used as the code may be extended to support
927 	 * additional tests beyond the current "page present" test.
928 	 */
929 	memtest = 0;
930 	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
931 
932 	/*
933 	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
934 	 * in the system.
935 	 */
936 	if (Maxmem > atop(physmap[physmap_idx + 1]))
937 		Maxmem = atop(physmap[physmap_idx + 1]);
938 
939 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
940 	    (boothowto & RB_VERBOSE))
941 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
942 
943 	/* call pmap initialization to make new kernel address space */
944 	pmap_bootstrap(&first);
945 
946 	/*
947 	 * Size up each available chunk of physical memory.
948 	 *
949 	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
950 	 * By default, mask off the first 16 pages unless we appear to be
951 	 * running in a VM.
952 	 */
953 	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
954 	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
955 	if (physmap[0] < physmem_start) {
956 		if (physmem_start < PAGE_SIZE)
957 			physmap[0] = PAGE_SIZE;
958 		else if (physmem_start >= physmap[1])
959 			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
960 		else
961 			physmap[0] = round_page(physmem_start);
962 	}
963 	pa_indx = 0;
964 	da_indx = 1;
965 	phys_avail[pa_indx++] = physmap[0];
966 	phys_avail[pa_indx] = physmap[0];
967 	dump_avail[da_indx] = physmap[0];
968 	pte = CMAP1;
969 
970 	/*
971 	 * Get dcons buffer address
972 	 */
973 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
974 	    getenv_quad("dcons.size", &dcons_size) == 0)
975 		dcons_addr = 0;
976 
977 	/*
978 	 * physmap is in bytes, so when converting to page boundaries,
979 	 * round up the start address and round down the end address.
980 	 */
981 	page_counter = 0;
982 	if (memtest != 0)
983 		printf("Testing system memory");
984 	for (i = 0; i <= physmap_idx; i += 2) {
985 		vm_paddr_t end;
986 
987 		end = ptoa((vm_paddr_t)Maxmem);
988 		if (physmap[i + 1] < end)
989 			end = trunc_page(physmap[i + 1]);
990 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
991 			int tmp, page_bad, full;
992 			int *ptr = (int *)CADDR1;
993 
994 			full = FALSE;
995 			/*
996 			 * block out kernel memory as not available.
997 			 */
998 			if (pa >= (vm_paddr_t)kernphys && pa < first)
999 				goto do_dump_avail;
1000 
1001 			/*
1002 			 * block out dcons buffer
1003 			 */
1004 			if (dcons_addr > 0
1005 			    && pa >= trunc_page(dcons_addr)
1006 			    && pa < dcons_addr + dcons_size)
1007 				goto do_dump_avail;
1008 
1009 			page_bad = FALSE;
1010 			if (memtest == 0)
1011 				goto skip_memtest;
1012 
1013 			/*
1014 			 * Print a "." every GB to show we're making
1015 			 * progress.
1016 			 */
1017 			page_counter++;
1018 			if ((page_counter % PAGES_PER_GB) == 0)
1019 				printf(".");
1020 
1021 			/*
1022 			 * map page into kernel: valid, read/write, non-cacheable
1023 			 */
1024 			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
1025 			invltlb();
1026 
1027 			tmp = *(int *)ptr;
1028 			/*
1029 			 * Test for alternating 1's and 0's
1030 			 */
1031 			*(volatile int *)ptr = 0xaaaaaaaa;
1032 			if (*(volatile int *)ptr != 0xaaaaaaaa)
1033 				page_bad = TRUE;
1034 			/*
1035 			 * Test for alternating 0's and 1's
1036 			 */
1037 			*(volatile int *)ptr = 0x55555555;
1038 			if (*(volatile int *)ptr != 0x55555555)
1039 				page_bad = TRUE;
1040 			/*
1041 			 * Test for all 1's
1042 			 */
1043 			*(volatile int *)ptr = 0xffffffff;
1044 			if (*(volatile int *)ptr != 0xffffffff)
1045 				page_bad = TRUE;
1046 			/*
1047 			 * Test for all 0's
1048 			 */
1049 			*(volatile int *)ptr = 0x0;
1050 			if (*(volatile int *)ptr != 0x0)
1051 				page_bad = TRUE;
1052 			/*
1053 			 * Restore original value.
1054 			 */
1055 			*(int *)ptr = tmp;
1056 
1057 skip_memtest:
1058 			/*
1059 			 * Adjust array of valid/good pages.
1060 			 */
1061 			if (page_bad == TRUE)
1062 				continue;
1063 			/*
1064 			 * If this good page is a continuation of the
1065 			 * previous set of good pages, then just increase
1066 			 * the end pointer. Otherwise start a new chunk.
1067 			 * Note that "end" points one page past the last
1068 			 * valid page, making the range >= start and < end.
1069 			 * If we're also doing a speculative memory
1070 			 * test and we're at or past the end, bump up Maxmem
1071 			 * so that we keep going. The first bad page
1072 			 * will terminate the loop.
1073 			 */
1074 			if (phys_avail[pa_indx] == pa) {
1075 				phys_avail[pa_indx] += PAGE_SIZE;
1076 			} else {
1077 				pa_indx++;
1078 				if (pa_indx == PHYS_AVAIL_ENTRIES) {
1079 					printf(
1080 		"Too many holes in the physical address space, giving up\n");
1081 					pa_indx--;
1082 					full = TRUE;
1083 					goto do_dump_avail;
1084 				}
1085 				phys_avail[pa_indx++] = pa;	/* start */
1086 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
1087 			}
1088 			physmem++;
1089 do_dump_avail:
1090 			if (dump_avail[da_indx] == pa) {
1091 				dump_avail[da_indx] += PAGE_SIZE;
1092 			} else {
1093 				da_indx++;
1094 				if (da_indx == PHYS_AVAIL_ENTRIES) {
1095 					da_indx--;
1096 					goto do_next;
1097 				}
1098 				dump_avail[da_indx++] = pa; /* start */
1099 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
1100 			}
1101 do_next:
1102 			if (full)
1103 				break;
1104 		}
1105 	}
1106 	*pte = 0;
1107 	invltlb();
1108 	if (memtest != 0)
1109 		printf("\n");
1110 
1111 	/*
1112 	 * XXX
1113 	 * The last chunk must contain at least one page plus the message
1114 	 * buffer to avoid complicating other code (message buffer address
1115 	 * calculation, etc.).
1116 	 */
1117 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
1118 	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
1119 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1120 		phys_avail[pa_indx--] = 0;
1121 		phys_avail[pa_indx--] = 0;
1122 	}
1123 
1124 	Maxmem = atop(phys_avail[pa_indx]);
1125 
1126 	/* Trim off space for the message buffer. */
1127 	phys_avail[pa_indx] -= round_page(msgbufsize);
1128 
1129 	/* Map the message buffer. */
1130 	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
1131 	TSEXIT();
1132 }
1133 
1134 static caddr_t
1135 native_parse_preload_data(u_int64_t modulep)
1136 {
1137 	caddr_t kmdp;
1138 	char *envp;
1139 #ifdef DDB
1140 	vm_offset_t ksym_start;
1141 	vm_offset_t ksym_end;
1142 #endif
1143 
1144 	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
1145 	preload_bootstrap_relocate(KERNBASE);
1146 	kmdp = preload_search_by_type("elf kernel");
1147 	if (kmdp == NULL)
1148 		kmdp = preload_search_by_type("elf64 kernel");
1149 	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
1150 	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
1151 	if (envp != NULL)
1152 		envp += KERNBASE;
1153 	init_static_kenv(envp, 0);
1154 #ifdef DDB
1155 	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
1156 	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
1157 	db_fetch_ksymtab(ksym_start, ksym_end, 0);
1158 #endif
1159 	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
1160 
1161 	return (kmdp);
1162 }
1163 
1164 static void
1165 native_clock_source_init(void)
1166 {
1167 	i8254_init();
1168 }
1169 
1170 static void
1171 amd64_kdb_init(void)
1172 {
1173 	kdb_init();
1174 #ifdef KDB
1175 	if (boothowto & RB_KDB)
1176 		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
1177 #endif
1178 }
1179 
1180 /* Set up the fast syscall stuff */
1181 void
1182 amd64_conf_fast_syscall(void)
1183 {
1184 	uint64_t msr;
1185 
1186 	msr = rdmsr(MSR_EFER) | EFER_SCE;
1187 	wrmsr(MSR_EFER, msr);
1188 	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1189 	    (u_int64_t)IDTVEC(fast_syscall));
1190 	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
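	/*
	 * STAR layout (per the AMD64 architecture): SYSCALL loads %cs/%ss
	 * from bits 47:32 (the kernel selectors), while SYSRET derives the
	 * user selectors from bits 63:48, which is why GUCODE32, GUDATA and
	 * GUCODE must occupy consecutive GDT slots.
	 */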
1191 	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1192 	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1193 	wrmsr(MSR_STAR, msr);
1194 	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
1195 }
1196 
1197 void
1198 amd64_bsp_pcpu_init1(struct pcpu *pc)
1199 {
1200 	struct user_segment_descriptor *gdt;
1201 
1202 	PCPU_SET(prvspace, pc);
1203 	gdt = *PCPU_PTR(gdt);
1204 	PCPU_SET(curthread, &thread0);
1205 	PCPU_SET(tssp, PCPU_PTR(common_tss));
1206 	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1207 	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
1208 	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
1209 	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
1210 	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
1211 	PCPU_SET(smp_tlb_gen, 1);
1212 }
1213 
1214 void
1215 amd64_bsp_pcpu_init2(uint64_t rsp0)
1216 {
1217 
1218 	PCPU_SET(rsp0, rsp0);
1219 	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1220 	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1221 	PCPU_SET(curpcb, thread0.td_pcb);
1222 }
1223 
1224 void
1225 amd64_bsp_ist_init(struct pcpu *pc)
1226 {
1227 	struct nmi_pcpu *np;
1228 	struct amd64tss *tssp;
1229 
1230 	tssp = &pc->pc_common_tss;
1231 
1232 	/* doublefault stack space, runs on ist1 */
1233 	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
1234 	np->np_pcpu = (register_t)pc;
1235 	tssp->tss_ist1 = (long)np;
1236 
1237 	/*
1238 	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
1239 	 * above the start of the ist2 stack.
1240 	 */
1241 	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
1242 	np->np_pcpu = (register_t)pc;
1243 	tssp->tss_ist2 = (long)np;
1244 
1245 	/*
1246 	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
1247 	 * above the start of the ist3 stack.
1248 	 */
1249 	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
1250 	np->np_pcpu = (register_t)pc;
1251 	tssp->tss_ist3 = (long)np;
1252 
1253 	/*
1254 	 * DB# stack, runs on ist4.
1255 	 */
1256 	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
1257 	np->np_pcpu = (register_t)pc;
1258 	tssp->tss_ist4 = (long)np;
1259 }
1260 
1261 /*
1262  * Calculate the kernel load address by inspecting the page table built by the loader.
1263  * The assumptions:
1264  * - kernel is mapped at KERNBASE, backed by contiguous phys memory
1265  *   aligned at 2M, below 4G (the latter is important for AP startup)
1266  * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M)
1267  * - kernel is mapped with 2M superpages
1268  * - all participating memory, i.e. kernel, modules, metadata,
1269  *   page table is accessible by pre-created 1:1 mapping
1270  *   (right now loader creates 1:1 mapping for lower 4G, and all
1271  *   memory is from there)
1272  * - there is a usable memory block right after the end of the
1273  *   mapped kernel and all modules/metadata, pointed to by
1274  *   physfree, for early allocations
1275  */
1276 vm_paddr_t __nosanitizeaddress __nosanitizememory
1277 amd64_loadaddr(void)
1278 {
1279 	pml4_entry_t *pml4e;
1280 	pdp_entry_t *pdpe;
1281 	pd_entry_t *pde;
1282 	uint64_t cr3;
1283 
1284 	cr3 = rcr3();
1285 	pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART);
1286 	pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART);
1287 	pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART);
1288 	return (*pde & PG_FRAME);
1289 }
1290 
1291 u_int64_t
1292 hammer_time(u_int64_t modulep, u_int64_t physfree)
1293 {
1294 	caddr_t kmdp;
1295 	int gsel_tss, x;
1296 	struct pcpu *pc;
1297 	uint64_t rsp0;
1298 	char *env;
1299 	struct user_segment_descriptor *gdt;
1300 	struct region_descriptor r_gdt;
1301 	size_t kstack0_sz;
1302 
1303 	TSRAW(&thread0, TS_ENTER, __func__, NULL);
1304 
1305 	kernphys = amd64_loadaddr();
1306 
1307 	physfree += kernphys;
1308 
1309 	kmdp = init_ops.parse_preload_data(modulep);
1310 
1311 	efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
1312 	    MODINFOMD_EFI_MAP) != NULL;
1313 
1314 	if (!efi_boot) {
1315 		/* Tell the bios to warmboot next time */
1316 		atomic_store_short((u_short *)0x472, 0x1234);
1317 	}
1318 
1319 	physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
1320 	physfree = roundup2(physfree, PAGE_SIZE);
1321 
1322 	identify_cpu1();
1323 	identify_hypervisor();
1324 	identify_hypervisor_smbios();
1325 	identify_cpu_fixup_bsp();
1326 	identify_cpu2();
1327 	initializecpucache();
1328 
1329 	/*
1330 	 * Check for pti, pcid, and invpcid before ifuncs are
1331 	 * resolved, to correctly select the implementation for
1332 	 * pmap_activate_sw_mode().
1333 	 */
1334 	pti = pti_get_default();
1335 	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1336 	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
1337 	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
1338 		invpcid_works = (cpu_stdext_feature &
1339 		    CPUID_STDEXT_INVPCID) != 0;
1340 	} else {
1341 		pmap_pcid_enabled = 0;
1342 	}
1343 
1344 	/*
1345 	 * Now we can do small core initialization, after the PCID
1346 	 * CPU features and user knobs are evaluated.
1347 	 */
1348 	TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
1349 	    &pmap_pcid_invlpg_workaround_uena);
1350 	cpu_init_small_core();
1351 
1352 	if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
1353 		use_xsave = 1;
1354 		TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave);
1355 	}
1356 
1357 	link_elf_ireloc(kmdp);
1358 
1359 	/*
1360 	 * This may be done better later if it gets more high level
1361 	 * components in it. If so just link td->td_proc here.
1362 	 */
1363 	proc_linkup0(&proc0, &thread0);
1364 
1365 	/* Init basic tunables, hz etc */
1366 	init_param1();
1367 
1368 	thread0.td_kstack = physfree - kernphys + KERNSTART;
1369 	thread0.td_kstack_pages = kstack_pages;
1370 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1371 	bzero((void *)thread0.td_kstack, kstack0_sz);
1372 	physfree += kstack0_sz;
1373 
1374 	/*
1375 	 * Initialize enough of thread0 for delayed invalidation to
1376 	 * work very early.  Rely on thread0.td_base_pri
1377 	 * zero-initialization, it is reset to PVM at proc0_init().
1378 	 */
1379 	pmap_thread_init_invl_gen(&thread0);
1380 
1381 	pc = &temp_bsp_pcpu;
1382 	pcpu_init(pc, 0, sizeof(struct pcpu));
1383 	gdt = &temp_bsp_pcpu.pc_gdt[0];
1384 
1385 	/*
1386 	 * make gdt memory segments
1387 	 */
1388 	for (x = 0; x < NGDT; x++) {
1389 		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1390 		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
1391 			ssdtosd(&gdt_segs[x], &gdt[x]);
1392 	}
1393 	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
1394 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
1395 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1396 
1397 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1398 	r_gdt.rd_base = (long)gdt;
1399 	lgdt(&r_gdt);
1400 
1401 	wrmsr(MSR_FSBASE, 0);		/* User value */
1402 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
1403 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
1404 
1405 	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
1406 	physfree += DPCPU_SIZE;
1407 	amd64_bsp_pcpu_init1(pc);
1408 	/* Non-late cninit() and printf() can be moved up to here. */
1409 
1410 	/*
1411 	 * Initialize mutexes.
1412 	 *
1413 	 * icu_lock: in order to allow an interrupt to occur in a critical
1414 	 * 	     section, to set pcpu->ipending (etc...) properly, we
1415 	 *	     must be able to get the icu lock, so it can't be
1416 	 *	     under witness.
1417 	 */
1418 	mutex_init();
1419 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1420 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1421 
1422 	/* exceptions */
1423 	for (x = 0; x < NIDT; x++)
1424 		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1425 		    SEL_KPL, 0);
1426 	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1427 	    SEL_KPL, 0);
1428 	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1429 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
1430 	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1431 	    SEL_UPL, 0);
1432 	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1433 	    SEL_UPL, 0);
1434 	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1435 	    SEL_KPL, 0);
1436 	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1437 	    SEL_KPL, 0);
1438 	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1439 	    SEL_KPL, 0);
1440 	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1441 	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1442 	    SDT_SYSIGT, SEL_KPL, 0);
1443 	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1444 	    SEL_KPL, 0);
1445 	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1446 	    SDT_SYSIGT, SEL_KPL, 0);
1447 	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1448 	    SEL_KPL, 0);
1449 	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1450 	    SEL_KPL, 0);
1451 	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1452 	    SEL_KPL, 0);
1453 	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1454 	    SEL_KPL, 0);
1455 	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1456 	    SEL_KPL, 0);
1457 	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1458 	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1459 	    SEL_KPL, 0);
1460 #ifdef KDTRACE_HOOKS
1461 	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1462 	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1463 #endif
1464 #ifdef XENHVM
1465 	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1466 	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1467 #endif
1468 	r_idt.rd_limit = sizeof(idt0) - 1;
1469 	r_idt.rd_base = (long) idt;
1470 	lidt(&r_idt);
1471 
1472 	/*
1473 	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
1474 	 * transition).
1475 	 * Once the bootblocks have been updated, we can test directly for
1476 	 * efi_systbl != NULL here...
1477 	 */
1478 	if (efi_boot)
1479 		vty_set_preferred(VTY_VT);
1480 
1481 	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1482 	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);
1483 
1484 	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1485 	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);
1486 
1487 	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
1488 	    &syscall_ret_l1d_flush_mode);
1489 
1490 	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1491 	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);
1492 
1493 	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
1494 
1495 	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
1496 	    &x86_rngds_mitg_enable);
1497 
1498 	finishidentcpu();	/* Final stage of CPU initialization */
1499 
1500 	/*
1501 	 * Initialize the clock before the console so that console
1502 	 * initialization can use DELAY().
1503 	 */
1504 	clock_init();
1505 
1506 	initializecpu();	/* Initialize CPU registers */
1507 
1508 	amd64_bsp_ist_init(pc);
1509 
1510 	/* Set the IO permission bitmap (empty due to tss seg limit) */
1511 	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
1512 	    IOPERM_BITMAP_SIZE;
1513 
1514 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1515 	ltr(gsel_tss);
1516 
1517 	amd64_conf_fast_syscall();
1518 
1519 	/*
1520 	 * We initialize the PCB pointer early so that exception
1521 	 * handlers will work.  Also set up td_critnest to short-cut
1522 	 * the page fault handler.
1523 	 */
1524 	cpu_max_ext_state_size = sizeof(struct savefpu);
1525 	set_top_of_stack_td(&thread0);
1526 	thread0.td_pcb = get_pcb_td(&thread0);
1527 	thread0.td_critnest = 1;
1528 
1529 	/*
1530 	 * The console and kdb should be initialized even earlier than here,
1531 	 * but some console drivers don't work until after getmemsize().
1532 	 * Default to late console initialization to support these drivers.
1533 	 * This loses mainly printf()s in getmemsize() and early debugging.
1534 	 */
1535 	TUNABLE_INT_FETCH("debug.late_console", &late_console);
1536 	if (!late_console) {
1537 		cninit();
1538 		amd64_kdb_init();
1539 	}
1540 
1541 	getmemsize(kmdp, physfree);
1542 	init_param2(physmem);
1543 
1544 	/* now running on new page tables, configured, and u/iom is accessible */
1545 
1546 #ifdef DEV_PCI
1547 	/* This call might adjust phys_avail[]. */
1548 	pci_early_quirks();
1549 #endif
1550 
1551 	if (late_console)
1552 		cninit();
1553 
1554 	/*
1555 	 * Dump the boot metadata. We have to wait for cninit() since console
1556 	 * output is required. If it's grossly incorrect the kernel will never
1557 	 * make it this far.
1558 	 */
1559 	if (getenv_is_true("debug.dump_modinfo_at_boot"))
1560 		preload_dump();
1561 
1562 #ifdef DEV_ISA
1563 #ifdef DEV_ATPIC
1564 	elcr_probe();
1565 	atpic_startup();
1566 #else
1567 	/* Reset and mask the atpics and leave them shut down. */
1568 	atpic_reset();
1569 
1570 	/*
1571 	 * Point the ICU spurious interrupt vectors at the APIC spurious
1572 	 * interrupt handler.
1573 	 */
1574 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1575 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1576 #endif
1577 #else
1578 #error "have you forgotten the isa device?"
1579 #endif
1580 
1581 	if (late_console)
1582 		amd64_kdb_init();
1583 
1584 	msgbufinit(msgbufp, msgbufsize);
1585 	fpuinit();
1586 
1587 	/* make an initial TSS so the CPU can get an interrupt stack on syscall! */
1588 	rsp0 = thread0.td_md.md_stack_base;
1589 	/* Ensure the stack is aligned to 16 bytes */
1590 	rsp0 &= ~0xFul;
1591 	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
1592 	amd64_bsp_pcpu_init2(rsp0);
1593 
1594 	/* transfer to user mode */
1595 
1596 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1597 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1598 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1599 	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1600 	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1601 
1602 	load_ds(_udatasel);
1603 	load_es(_udatasel);
1604 	load_fs(_ufssel);
1605 
1606 	/* setup proc 0's pcb */
1607 	thread0.td_pcb->pcb_flags = 0;
1608 
1609 	env = kern_getenv("kernelname");
1610 	if (env != NULL)
1611 		strlcpy(kernelname, env, sizeof(kernelname));
1612 
1613 	kcsan_cpu_init(0);
1614 
1615 #ifdef FDT
1616 	x86_init_fdt();
1617 #endif
1618 	thread0.td_critnest = 0;
1619 
1620 	kasan_init();
1621 	kmsan_init();
1622 
1623 	TSEXIT();
1624 
1625 	/* Location of kernel stack for locore */
1626 	return (thread0.td_md.md_stack_base);
1627 }
1628 
1629 void
1630 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1631 {
1632 
1633 	pcpu->pc_acpi_id = 0xffffffff;
1634 }
1635 
1636 static int
1637 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1638 {
1639 	struct bios_smap *smapbase;
1640 	struct bios_smap_xattr smap;
1641 	caddr_t kmdp;
1642 	uint32_t *smapattr;
1643 	int count, error, i;
1644 
1645 	/* Retrieve the system memory map from the loader. */
1646 	kmdp = preload_search_by_type("elf kernel");
1647 	if (kmdp == NULL)
1648 		kmdp = preload_search_by_type("elf64 kernel");
1649 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
1650 	    MODINFO_METADATA | MODINFOMD_SMAP);
1651 	if (smapbase == NULL)
1652 		return (0);
1653 	smapattr = (uint32_t *)preload_search_info(kmdp,
1654 	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1655 	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1656 	error = 0;
1657 	for (i = 0; i < count; i++) {
1658 		smap.base = smapbase[i].base;
1659 		smap.length = smapbase[i].length;
1660 		smap.type = smapbase[i].type;
1661 		if (smapattr != NULL)
1662 			smap.xattr = smapattr[i];
1663 		else
1664 			smap.xattr = 0;
1665 		error = SYSCTL_OUT(req, &smap, sizeof(smap));
1666 	}
1667 	return (error);
1668 }
1669 SYSCTL_PROC(_machdep, OID_AUTO, smap,
1670     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1671     smap_sysctl_handler, "S,bios_smap_xattr",
1672     "Raw BIOS SMAP data");
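/*
 * An illustrative userland consumer (not compiled here) of the sysctl
 * exported above; the same pattern works for machdep.efi_map below:
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <machine/pc/bios.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct bios_smap_xattr *smap;
 *		size_t len, i;
 *
 *		if (sysctlbyname("machdep.smap", NULL, &len, NULL, 0) != 0)
 *			return (1);
 *		smap = malloc(len);
 *		if (smap == NULL ||
 *		    sysctlbyname("machdep.smap", smap, &len, NULL, 0) != 0)
 *			return (1);
 *		for (i = 0; i < len / sizeof(*smap); i++)
 *			printf("base %#jx len %#jx type %u\n",
 *			    (uintmax_t)smap[i].base,
 *			    (uintmax_t)smap[i].length, smap[i].type);
 *		return (0);
 *	}
 */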
1673 
1674 static int
1675 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1676 {
1677 	struct efi_map_header *efihdr;
1678 	caddr_t kmdp;
1679 	uint32_t efisize;
1680 
1681 	kmdp = preload_search_by_type("elf kernel");
1682 	if (kmdp == NULL)
1683 		kmdp = preload_search_by_type("elf64 kernel");
1684 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1685 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1686 	if (efihdr == NULL)
1687 		return (0);
1688 	efisize = *((uint32_t *)efihdr - 1);
1689 	return (SYSCTL_OUT(req, efihdr, efisize));
1690 }
1691 SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
1692     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1693     efi_map_sysctl_handler, "S,efi_map_header",
1694     "Raw EFI Memory Map");
1695 
1696 void
1697 spinlock_enter(void)
1698 {
1699 	struct thread *td;
1700 	register_t flags;
1701 
1702 	td = curthread;
1703 	if (td->td_md.md_spinlock_count == 0) {
1704 		flags = intr_disable();
1705 		td->td_md.md_spinlock_count = 1;
1706 		td->td_md.md_saved_flags = flags;
1707 		critical_enter();
1708 	} else
1709 		td->td_md.md_spinlock_count++;
1710 }
1711 
1712 void
1713 spinlock_exit(void)
1714 {
1715 	struct thread *td;
1716 	register_t flags;
1717 
1718 	td = curthread;
1719 	flags = td->td_md.md_saved_flags;
1720 	td->td_md.md_spinlock_count--;
1721 	if (td->td_md.md_spinlock_count == 0) {
1722 		critical_exit();
1723 		intr_restore(flags);
1724 	}
1725 }
1726 
1727 /*
1728  * Construct a PCB from a trapframe. This is called from kdb_trap() where
1729  * we want to start a backtrace from the function that caused us to enter
1730  * the debugger. We have the context in the trapframe, but base the trace
1731  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1732  * enough for a backtrace.
1733  */
1734 void
1735 makectx(struct trapframe *tf, struct pcb *pcb)
1736 {
1737 
1738 	pcb->pcb_r12 = tf->tf_r12;
1739 	pcb->pcb_r13 = tf->tf_r13;
1740 	pcb->pcb_r14 = tf->tf_r14;
1741 	pcb->pcb_r15 = tf->tf_r15;
1742 	pcb->pcb_rbp = tf->tf_rbp;
1743 	pcb->pcb_rbx = tf->tf_rbx;
1744 	pcb->pcb_rip = tf->tf_rip;
1745 	pcb->pcb_rsp = tf->tf_rsp;
1746 }
1747 
1748 /*
1749  * The pcb_flags field is only modified by the current thread, or by other
1750  * threads when the current thread is stopped.  However, the current thread
1751  * may change it from the interrupt context in cpu_switch(), or in the trap
1752  * handler.  When we read-modify-write pcb_flags from C sources, the compiler
1753  * may generate code that is not atomic with respect to interrupts.  If a trap or
1754  * interrupt happens and any flag is modified from the handler, it can be
1755  * clobbered with the cached value later.  Therefore, we implement setting
1756  * and clearing flags with single-instruction functions, which do not race
1757  * with possible modification of the flags from the trap or interrupt context,
1758  * because traps and interrupts are executed only on instruction boundary.
1759  */
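/*
 * For illustration, a plain C read-modify-write such as
 *
 *	pcb->pcb_flags |= flags;
 *
 * may compile to a separate load, OR and store; a trap taken between the
 * load and the store would lose any flag the handler set in between.  The
 * single "orl" instruction below cannot be split that way.
 */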
1760 void
1761 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
1762 {
1763 
1764 	__asm __volatile("orl %1,%0"
1765 	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
1766 	    : "cc", "memory");
1767 
1768 }
1769 
1770 /*
1771  * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
1772  * base requires that the kernel save MSR_FSBASE and MSR_{K,}GSBASE into
1773  * the pcb if user space modified the bases.  We must save them on the
1774  * context switch or if the return to usermode happens through doreti.
1775  *
1776  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
1777  * which has the consequence that the base MSRs must be saved each time
1778  * the PCB_FULL_IRET flag is set.  We disable interrupts to synchronize with
1779  * context switches.
1780  */
1781 static void
1782 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
1783 {
1784 	register_t r;
1785 
1786 	if (curpcb == pcb &&
1787 	    (flags & PCB_FULL_IRET) != 0 &&
1788 	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1789 		r = intr_disable();
1790 		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1791 			if (rfs() == _ufssel)
1792 				pcb->pcb_fsbase = rdfsbase();
1793 			if (rgs() == _ugssel)
1794 				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
1795 		}
1796 		set_pcb_flags_raw(pcb, flags);
1797 		intr_restore(r);
1798 	} else {
1799 		set_pcb_flags_raw(pcb, flags);
1800 	}
1801 }
1802 
1803 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
1804 {
1805 
1806 	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
1807 	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
1808 }
1809 
1810 void
1811 clear_pcb_flags(struct pcb *pcb, const u_int flags)
1812 {
1813 
1814 	__asm __volatile("andl %1,%0"
1815 	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
1816 	    : "cc", "memory");
1817 }
1818 
1819 #ifdef KDB
1820 
1821 /*
1822  * Provide inb() and outb() as functions.  They are normally only available as
1823  * inline functions and thus cannot be called from the debugger.
1824  */
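/*
 * For example (assuming ddb's "call" command), a port can be poked from
 * the debugger with:
 *
 *	db> call outb_(0x80, 0x1)
 *	db> call inb_(0x80)
 */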
1825 
1826 /* silence compiler warnings */
1827 u_char inb_(u_short);
1828 void outb_(u_short, u_char);
1829 
1830 u_char
1831 inb_(u_short port)
1832 {
1833 	return inb(port);
1834 }
1835 
1836 void
1837 outb_(u_short port, u_char data)
1838 {
1839 	outb(port, data);
1840 }
1841 
1842 #endif /* KDB */
1843 
1844 #undef memset
1845 #undef memmove
1846 #undef memcpy
1847 
1848 void	*memset_std(void *buf, int c, size_t len);
1849 void	*memset_erms(void *buf, int c, size_t len);
1850 void    *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
1851 	    size_t len);
1852 void    *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
1853 	    size_t len);
1854 void    *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
1855 	    size_t len);
1856 void    *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
1857 	    size_t len);
1858 
1859 #ifdef KCSAN
1860 /*
1861  * These fail to build as ifuncs when used with KCSAN.
1862  */
1863 void *
1864 memset(void *buf, int c, size_t len)
1865 {
1866 
1867 	return (memset_std(buf, c, len));
1868 }
1869 
1870 void *
1871 memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1872 {
1873 
1874 	return (memmove_std(dst, src, len));
1875 }
1876 
1877 void *
1878 memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1879 {
1880 
1881 	return (memcpy_std(dst, src, len));
1882 }
1883 #else
1884 DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
1885 {
1886 
1887 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1888 	    memset_erms : memset_std);
1889 }
1890 
1891 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
1892     size_t))
1893 {
1894 
1895 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1896 	    memmove_erms : memmove_std);
1897 }
1898 
1899 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
1900 {
1901 
1902 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1903 	    memcpy_erms : memcpy_std);
1904 }
1905 #endif
1906 
1907 void	pagezero_std(void *addr);
1908 void	pagezero_erms(void *addr);
1909 DEFINE_IFUNC(, void , pagezero, (void *))
1910 {
1911 
1912 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1913 	    pagezero_erms : pagezero_std);
1914 }
1915