/* xref: /freebsd/sys/amd64/amd64/machdep.c (revision 16038816) */
/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_atpic.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_pci.h"
#include "opt_platform.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/asan.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/csan.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_dumpset.h>

#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif

#include <net/netisr.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/frame.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#include <machine/trap.h>
#include <machine/tss.h>
#include <x86/ucode.h>
#include <x86/ifunc.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef FDT
#include <x86/fdt.h>
#endif

#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <x86/apicvar.h>
#endif

#include <isa/isareg.h>
#include <isa/rtc.h>
#include <x86/init.h>

/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);

/*
 * The PTI trampoline stack needs enough space for a hardware trapframe and a
 * couple of scratch registers, as well as the trapframe left behind after an
 * iret fault.
 */
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
    offsetof(struct pti_frame, pti_rip));

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

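/*
 * A signal context is only acceptable if its %cs is a user-privilege
 * selector and only user-changeable rflags bits differ from the old ones.
 */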
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)

static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpusave, size_t xfpusave_len);
static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpustate, size_t xfpustate_len);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/* Default init_ops implementation. */
struct init_ops init_ops = {
	.parse_preload_data =	native_parse_preload_data,
	.early_clock_source_init =	i8254_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
};

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;

/* Intel ICH registers */
#define ICH_PMBASE	0x400
#define ICH_SMI_EN	(ICH_PMBASE + 0x30)

int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

int cold = 1;

long Maxmem = 0;
long realmem = 0;

struct kva_md_info kmi;

static struct trapframe proc0_tf;
struct region_descriptor r_idt;

struct pcpu *__pcpu;
struct pcpu temp_bsp_pcpu;

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

void (*vmm_resume_p)(void);

static void
cpu_startup(void *dummy)
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we must prevent the legacy USB circuit from
	 * generating an SMI#, because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by clearing a bit in the SMI_EN (SMI Control and
	 * Enable) register of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
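			/* Clear bit 3 (LEGACY_USB_EN) in the SMI_EN register. */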
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();

	/*
	 * Display physical memory if SMBIOS reports reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	if (memsize < ptoa((uintmax_t)vm_free_count()))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_free_count()),
	    ptoa((uintmax_t)vm_free_count()) / 1048576);
#ifdef DEV_PCI
	if (bootverbose && intel_graphics_stolen_base != 0)
		printf("intel stolen mem: base %#jx size %ju MB\n",
		    (uintmax_t)intel_graphics_stolen_base,
		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
#endif

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

static void
late_ifunc_resolve(void *dummy __unused)
{
	link_elf_late_ireloc();
}
SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);

/*
 * Send an interrupt to a process.
 *
 * The stack is set up so that the sigcode stored at the top calls the
 * handler, followed by a call to the sigreturn routine below.  After
 * sigreturn resets the signal mask, the stack, and the frame pointer,
 * it returns to the user-specified pc and psl.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	update_pcb_bases(pcb);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
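		/* Leave the 128-byte amd64 ABI red zone below %rsp intact. */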
		sp = (char *)regs->tf_rsp - 128;
	if (xfpusave != NULL) {
		sp -= xfpusave_len;
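		/* The XSAVE extended state area must be 64-byte aligned. */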
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}

/*
 * System call to clean up state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
int
sys_sigreturn(struct thread *td,
    struct sigreturn_args /* {
	const struct __ucontext *sigcntxp;
    } */ *uap)
{
	ucontext_t uc;
	struct pcb *pcb;
	struct proc *p;
	struct trapframe *regs;
	ucontext_t *ucp;
	char *xfpustate;
	size_t xfpustate_len;
	long rflags;
	int cs, error, ret;
	ksiginfo_t ksi;

	pcb = td->td_pcb;
	p = td->td_proc;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0) {
		uprintf("pid %d (%s): sigreturn copyin failed\n",
		    p->p_pid, td->td_name);
		return (error);
	}
	ucp = &uc;
	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
		    td->td_name, ucp->uc_mcontext.mc_flags);
		return (EINVAL);
	}
	regs = td->td_frame;
	rflags = ucp->uc_mcontext.mc_rflags;
	/*
	 * Don't allow users to change privileged or reserved flags.
	 */
	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
		    td->td_name, rflags);
		return (EINVAL);
	}

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
	cs = ucp->uc_mcontext.mc_cs;
	if (!CS_SECURE(cs)) {
		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
		    td->td_name, cs);
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_rip;
		trapsignal(td, &ksi);
		return (EINVAL);
	}

	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
		if (xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu)) {
			uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
			    p->p_pid, td->td_name, xfpustate_len);
			return (EINVAL);
		}
		xfpustate = __builtin_alloca(xfpustate_len);
		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
		    xfpustate, xfpustate_len);
		if (error != 0) {
			uprintf(
	"pid %d (%s): sigreturn copying xfpustate failed\n",
			    p->p_pid, td->td_name);
			return (error);
		}
	} else {
		xfpustate = NULL;
		xfpustate_len = 0;
	}
	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
	if (ret != 0) {
		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
		    p->p_pid, td->td_name, ret);
		return (ret);
	}
	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
	update_pcb_bases(pcb);
	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;

#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
	return (EJUSTRETURN);
}

#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{

	return sys_sigreturn(td, (struct sigreturn_args *)uap);
}
#endif

/*
 * Reset the hardware debug registers if they were in use.
 * They won't have any meaning for the newly exec'd process.
 */
void
x86_clear_dbregs(struct pcb *pcb)
{
	if ((pcb->pcb_flags & PCB_DBREGS) == 0)
		return;

	pcb->pcb_dr0 = 0;
	pcb->pcb_dr1 = 0;
	pcb->pcb_dr2 = 0;
	pcb->pcb_dr3 = 0;
	pcb->pcb_dr6 = 0;
	pcb->pcb_dr7 = 0;

	if (pcb == curpcb) {
		/*
		 * Clear the debug registers on the running CPU,
		 * otherwise they will end up affecting the next
		 * process we switch to.
		 */
		reset_dbregs();
	}
	clear_pcb_flags(pcb, PCB_DBREGS);
}

/*
 * Reset registers to default values on exec.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack)
{
	struct trapframe *regs;
	struct pcb *pcb;
	register_t saved_rflags;

	regs = td->td_frame;
	pcb = td->td_pcb;

	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);

	update_pcb_bases(pcb);
	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	clear_pcb_flags(pcb, PCB_32BIT);
	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;

	saved_rflags = regs->tf_rflags & PSL_T;
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = imgp->entry_addr;
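	/*
	 * The arithmetic below leaves %rsp congruent to 8 mod 16, the
	 * alignment a 16-byte-aligned stack has just after a return
	 * address is pushed by a call instruction.
	 */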
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | saved_rflags;
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;

	x86_clear_dbregs(pcb);

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP.  See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 TSS Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Slot 10: the TSS is a double-size system descriptor; this is its upper half */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");

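/*
 * Install an IDT gate: point slot 'idx' at handler 'func' with gate type
 * 'typ' and privilege level 'dpl'; a non-zero 'ist' selects an Interrupt
 * Stack Table entry to switch to on delivery.
 */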
void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func) >> 16;
}

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif

void
sdtossd(struct user_segment_descriptor *sd,
    struct soft_segment_descriptor *ssd)
{

	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_long  = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

void
ssdtosd(struct soft_segment_descriptor *ssd,
    struct user_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;
}

void
ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
}

u_int basemem;

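/*
 * Insert a [base, base + length) region into the sorted physmap array,
 * which stores regions as consecutive start/end address pairs, e.g.
 * physmap[] = { 0x1000, 0x9f000, 0x100000, 0xcfe00000 }.  Adjacent
 * regions are merged when possible.  Returns 1 on success (including
 * ignored zero-length and overlapping regions) and 0 when the array
 * is full.
 */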
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
    vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
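	/* The descriptors start at the next 16-byte boundary after the header. */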
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}

static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * i.e., an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
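	/*
	 * parse_memmap leaves physmap_idx at the next free slot; step back
	 * so that it indexes the last base/end pair.
	 */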
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it\n");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
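	/* MAXMEM is specified in kilobytes; convert it to 4 KiB pages. */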
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * Make a hole for the "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and the APs for the system start
	 * in real mode (e.g. SMP bare metal).
	 */
#ifdef SMP
	mp_bootaddress(physmap, &physmap_idx);
#endif

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write, non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that the recorded end points one page past
			 * the last good page, making the range >= start
			 * and < end.
			 * If we're also doing a speculative memory
			 * test and we're at or past the end, bump up Maxmem
			 * so that we keep going. The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ENTRIES) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == PHYS_AVAIL_ENTRIES) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}

static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

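	/* modulep is physical; the kernel's early mapping starts at KERNBASE. */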
	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end, 0);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/* Set up the fast syscall stuff */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
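	/*
	 * STAR[47:32] is the kernel %cs selector loaded by SYSCALL (%ss is
	 * derived as +8); STAR[63:48] is the base selector from which SYSRET
	 * derives the user %cs and %ss.
	 */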
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}

void
amd64_bsp_pcpu_init1(struct pcpu *pc)
{
	struct user_segment_descriptor *gdt;

	PCPU_SET(prvspace, pc);
	gdt = *PCPU_PTR(gdt);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(tssp, PCPU_PTR(common_tss));
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
	PCPU_SET(smp_tlb_gen, 1);
}

void
amd64_bsp_pcpu_init2(uint64_t rsp0)
{

	PCPU_SET(rsp0, rsp0);
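	/* Keep the PTI trampoline stack top 16-byte aligned. */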
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);
}

void
amd64_bsp_ist_init(struct pcpu *pc)
{
	struct nmi_pcpu *np;
	struct amd64tss *tssp;

	tssp = &pc->pc_common_tss;

	/*
	 * Double-fault stack, runs on ist1.  The pcpu pointer is stored
	 * just above the start of the ist1 stack.
	 */
	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist1 = (long)np;

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist2 = (long)np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist3 = (long)np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist4 = (long)np;
}

u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	struct xstate_hdr *xhdr;
	u_int64_t rsp0;
	char *env;
	struct user_segment_descriptor *gdt;
	struct region_descriptor r_gdt;
	size_t kstack0_sz;
	int late_console;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	kmdp = init_ops.parse_preload_data(modulep);

	physfree += ucode_load_bsp(physfree + KERNBASE);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	identify_cpu_fixup_bsp();
	identify_cpu2();
	initializecpucache();

	/*
	 * Check for pti, pcid, and invpcid before ifuncs are
	 * resolved, to correctly select the implementation for
	 * pmap_activate_sw_mode().
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		invpcid_works = (cpu_stdext_feature &
		    CPUID_STDEXT_INVPCID) != 0;
	} else {
		pmap_pcid_enabled = 0;
	}

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	thread0.td_kstack = physfree + KERNBASE;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * Initialize enough of thread0 for delayed invalidation to
	 * work very early.  Rely on thread0.td_base_pri
	 * zero-initialization, it is reset to PVM at proc0_init().
	 */
	pmap_thread_init_invl_gen(&thread0);

	pc = &temp_bsp_pcpu;
	pcpu_init(pc, 0, sizeof(struct pcpu));
	gdt = &temp_bsp_pcpu.pc_gdt[0];

	/*
	 * make gdt memory segments
	 */
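	/*
	 * Skip the system descriptors: on amd64 the TSS and LDT descriptors
	 * are 16 bytes and occupy two GDT slots each.  The TSS is installed
	 * separately below.
	 */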
1667 	for (x = 0; x < NGDT; x++) {
1668 		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1669 		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
1670 			ssdtosd(&gdt_segs[x], &gdt[x]);
1671 	}
1672 	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
1673 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
1674 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1675 
1676 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1677 	r_gdt.rd_base = (long)gdt;
1678 	lgdt(&r_gdt);
1679 
1680 	wrmsr(MSR_FSBASE, 0);		/* User value */
1681 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
1682 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
1683 
1684 	dpcpu_init((void *)(physfree + KERNBASE), 0);
1685 	physfree += DPCPU_SIZE;
1686 	amd64_bsp_pcpu_init1(pc);
1687 	/* Non-late cninit() and printf() can be moved up to here. */
1688 
1689 	/*
1690 	 * Initialize mutexes.
1691 	 *
1692 	 * icu_lock: in order to allow an interrupt to occur in a critical
1693 	 * 	     section, to set pcpu->ipending (etc...) properly, we
1694 	 *	     must be able to get the icu lock, so it can't be
1695 	 *	     under witness.
1696 	 */
1697 	mutex_init();
1698 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1699 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1700 
1701 	/* exceptions */
1702 	for (x = 0; x < NIDT; x++)
1703 		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1704 		    SEL_KPL, 0);
1705 	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1706 	    SEL_KPL, 0);
1707 	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1708 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
1709 	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1710 	    SEL_UPL, 0);
1711 	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1712 	    SEL_UPL, 0);
1713 	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1714 	    SEL_KPL, 0);
1715 	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1716 	    SEL_KPL, 0);
1717 	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1718 	    SEL_KPL, 0);
1719 	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1720 	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1721 	    SDT_SYSIGT, SEL_KPL, 0);
1722 	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1723 	    SEL_KPL, 0);
1724 	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1725 	    SDT_SYSIGT, SEL_KPL, 0);
1726 	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1727 	    SEL_KPL, 0);
1728 	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1729 	    SEL_KPL, 0);
1730 	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1731 	    SEL_KPL, 0);
1732 	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1733 	    SEL_KPL, 0);
1734 	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1735 	    SEL_KPL, 0);
1736 	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1737 	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1738 	    SEL_KPL, 0);
1739 #ifdef KDTRACE_HOOKS
1740 	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1741 	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1742 #endif
1743 #ifdef XENHVM
1744 	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1745 	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1746 #endif
1747 	r_idt.rd_limit = sizeof(idt0) - 1;
1748 	r_idt.rd_base = (long) idt;
1749 	lidt(&r_idt);
1750 
1751 	/*
1752 	 * Initialize the clock before the console so that console
1753 	 * initialization can use DELAY().
1754 	 */
1755 	clock_init();
1756 
1757 	/*
1758 	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
1759 	 * transition).
1760 	 * Once bootblocks have updated, we can test directly for
1761 	 * efi_systbl != NULL here...
1762 	 */
1763 	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
1764 	    != NULL)
1765 		vty_set_preferred(VTY_VT);
1766 
1767 	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1768 	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);
1769 
1770 	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1771 	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);
1772 
1773 	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
1774 	    &syscall_ret_l1d_flush_mode);
1775 
1776 	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1777 	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);
1778 
1779 	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
1780 
1781 	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
1782 	    &x86_rngds_mitg_enable);
1783 
1784 	finishidentcpu();	/* Final stage of CPU initialization */
1785 	initializecpu();	/* Initialize CPU registers */
1786 
1787 	amd64_bsp_ist_init(pc);
1788 
1789 	/* Set the IO permission bitmap (empty due to tss seg limit) */
1790 	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
1791 	    IOPERM_BITMAP_SIZE;
1792 
1793 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1794 	ltr(gsel_tss);
1795 
1796 	amd64_conf_fast_syscall();
1797 
1798 	/*
1799 	 * We initialize the PCB pointer early so that exception
1800 	 * handlers will work.  Also set up td_critnest to short-cut
1801 	 * the page fault handler.
1802 	 */
1803 	cpu_max_ext_state_size = sizeof(struct savefpu);
1804 	set_top_of_stack_td(&thread0);
1805 	thread0.td_pcb = get_pcb_td(&thread0);
1806 	thread0.td_critnest = 1;
1807 
1808 	/*
1809 	 * The console and kdb should be initialized even earlier than here,
1810 	 * but some console drivers don't work until after getmemsize().
1811 	 * Default to late console initialization to support these drivers.
1812 	 * This loses mainly printf()s in getmemsize() and early debugging.
1813 	 */
1814 	late_console = 1;
1815 	TUNABLE_INT_FETCH("debug.late_console", &late_console);
1816 	if (!late_console) {
1817 		cninit();
1818 		amd64_kdb_init();
1819 	}
1820 
1821 	getmemsize(kmdp, physfree);
1822 	init_param2(physmem);
1823 
1824 	/* now running on new page tables, configured, and u/iom is accessible */
1825 
1826 #ifdef DEV_PCI
1827 	/* This call might adjust phys_avail[]. */
1828 	pci_early_quirks();
1829 #endif
1830 
1831 	if (late_console)
1832 		cninit();
1833 
1834 	/*
1835 	 * Dump the boot metadata. We have to wait for cninit() since console
1836 	 * output is required. If the metadata is grossly incorrect, the
1837 	 * kernel will never make it this far.
1838 	 */
1839 	if (getenv_is_true("debug.dump_modinfo_at_boot"))
1840 		preload_dump();
1841 
1842 #ifdef DEV_ISA
1843 #ifdef DEV_ATPIC
1844 	elcr_probe();
1845 	atpic_startup();
1846 #else
1847 	/* Reset and mask the atpics and leave them shut down. */
1848 	atpic_reset();
1849 
1850 	/*
1851 	 * Point the ICU spurious interrupt vectors at the APIC spurious
1852 	 * interrupt handler.
1853 	 */
1854 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1855 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1856 #endif
1857 #else
1858 #error "have you forgotten the isa device?"
1859 #endif
1860 
1861 	if (late_console)
1862 		amd64_kdb_init();
1863 
1864 	msgbufinit(msgbufp, msgbufsize);
1865 	fpuinit();
1866 
1867 	/*
1868 	 * Reinitialize thread0's stack base now that the xsave area size is
1869 	 * known.  Set up thread0's pcb save area after fpuinit calculated fpu
1870 	 * save area size.  Zero out the extended state header in fpu save area.
1871 	 */
1872 	set_top_of_stack_td(&thread0);
1873 	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
1874 	bzero(thread0.td_pcb->pcb_save, cpu_max_ext_state_size);
1875 	if (use_xsave) {
1876 		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
1877 		    1);
1878 		xhdr->xstate_bv = xsave_mask;
1879 	}
1880 	/* Make an initial TSS so the CPU can get an interrupt stack on syscall. */
1881 	rsp0 = thread0.td_md.md_stack_base;
1882 	/* Ensure the stack is aligned to 16 bytes */
1883 	rsp0 &= ~0xFul;
1884 	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
1885 	amd64_bsp_pcpu_init2(rsp0);
1886 
1887 	/* transfer to user mode */
1888 
1889 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1890 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1891 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1892 	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1893 	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1894 
1895 	load_ds(_udatasel);
1896 	load_es(_udatasel);
1897 	load_fs(_ufssel);
1898 
1899 	/* setup proc 0's pcb */
1900 	thread0.td_pcb->pcb_flags = 0;
1901 	thread0.td_frame = &proc0_tf;
1902 
1903 	env = kern_getenv("kernelname");
1904 	if (env != NULL)
1905 		strlcpy(kernelname, env, sizeof(kernelname));
1906 
1907 	kcsan_cpu_init(0);
1908 
1909 #ifdef FDT
1910 	x86_init_fdt();
1911 #endif
1912 	thread0.td_critnest = 0;
1913 
1914 	kasan_init();
1915 
1916 	TSEXIT();
1917 
1918 	/* Location of kernel stack for locore */
1919 	return (thread0.td_md.md_stack_base);
1920 }
1921 
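/*
 * Initialize machine-dependent per-CPU data.  The ACPI id starts out as
 * an "unknown" sentinel; the platform code that enumerates CPUs (e.g.
 * via ACPI) is expected to overwrite it with the real id.
 */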
1922 void
1923 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1924 {
1925 
1926 	pcpu->pc_acpi_id = 0xffffffff;
1927 }
1928 
1929 static int
1930 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1931 {
1932 	struct bios_smap *smapbase;
1933 	struct bios_smap_xattr smap;
1934 	caddr_t kmdp;
1935 	uint32_t *smapattr;
1936 	int count, error, i;
1937 
1938 	/* Retrieve the system memory map from the loader. */
1939 	kmdp = preload_search_by_type("elf kernel");
1940 	if (kmdp == NULL)
1941 		kmdp = preload_search_by_type("elf64 kernel");
1942 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
1943 	    MODINFO_METADATA | MODINFOMD_SMAP);
1944 	if (smapbase == NULL)
1945 		return (0);
1946 	smapattr = (uint32_t *)preload_search_info(kmdp,
1947 	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
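	/* The loader stores the blob's size in the uint32_t preceding it. */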
1948 	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1949 	error = 0;
1950 	for (i = 0; i < count; i++) {
1951 		smap.base = smapbase[i].base;
1952 		smap.length = smapbase[i].length;
1953 		smap.type = smapbase[i].type;
1954 		if (smapattr != NULL)
1955 			smap.xattr = smapattr[i];
1956 		else
1957 			smap.xattr = 0;
1958 		error = SYSCTL_OUT(req, &smap, sizeof(smap));
1959 	}
1960 	return (error);
1961 }
1962 SYSCTL_PROC(_machdep, OID_AUTO, smap,
1963     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1964     smap_sysctl_handler, "S,bios_smap_xattr",
1965     "Raw BIOS SMAP data");
1966 
1967 static int
1968 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1969 {
1970 	struct efi_map_header *efihdr;
1971 	caddr_t kmdp;
1972 	uint32_t efisize;
1973 
1974 	kmdp = preload_search_by_type("elf kernel");
1975 	if (kmdp == NULL)
1976 		kmdp = preload_search_by_type("elf64 kernel");
1977 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1978 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1979 	if (efihdr == NULL)
1980 		return (0);
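	/* As with the SMAP, the map's size precedes the data. */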
1981 	efisize = *((uint32_t *)efihdr - 1);
1982 	return (SYSCTL_OUT(req, efihdr, efisize));
1983 }
1984 SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
1985     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1986     efi_map_sysctl_handler, "S,efi_map_header",
1987     "Raw EFI Memory Map");
1988 
1989 void
1990 spinlock_enter(void)
1991 {
1992 	struct thread *td;
1993 	register_t flags;
1994 
1995 	td = curthread;
1996 	if (td->td_md.md_spinlock_count == 0) {
1997 		flags = intr_disable();
1998 		td->td_md.md_spinlock_count = 1;
1999 		td->td_md.md_saved_flags = flags;
2000 		critical_enter();
2001 	} else
2002 		td->td_md.md_spinlock_count++;
2003 }
2004 
2005 void
2006 spinlock_exit(void)
2007 {
2008 	struct thread *td;
2009 	register_t flags;
2010 
2011 	td = curthread;
2012 	flags = td->td_md.md_saved_flags;
2013 	td->td_md.md_spinlock_count--;
2014 	if (td->td_md.md_spinlock_count == 0) {
2015 		critical_exit();
2016 		intr_restore(flags);
2017 	}
2018 }
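/*
 * A sketch of the nesting behaviour implemented above: sections may nest
 * per thread, and only the outermost pair touches the interrupt state.
 *
 *	spinlock_enter();	interrupts disabled, flags saved
 *	spinlock_enter();	count bumped to 2, no flag change
 *	spinlock_exit();	count drops to 1, still disabled
 *	spinlock_exit();	saved flags restored
 */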
2019 
2020 /*
2021  * Construct a PCB from a trapframe. This is called from kdb_trap() where
2022  * we want to start a backtrace from the function that caused us to enter
2023  * the debugger. We have the context in the trapframe, but base the trace
2024  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
2025  * enough for a backtrace.
2026  */
2027 void
2028 makectx(struct trapframe *tf, struct pcb *pcb)
2029 {
2030 
2031 	pcb->pcb_r12 = tf->tf_r12;
2032 	pcb->pcb_r13 = tf->tf_r13;
2033 	pcb->pcb_r14 = tf->tf_r14;
2034 	pcb->pcb_r15 = tf->tf_r15;
2035 	pcb->pcb_rbp = tf->tf_rbp;
2036 	pcb->pcb_rbx = tf->tf_rbx;
2037 	pcb->pcb_rip = tf->tf_rip;
2038 	pcb->pcb_rsp = tf->tf_rsp;
2039 }
2040 
2041 int
2042 ptrace_set_pc(struct thread *td, unsigned long addr)
2043 {
2044 
2045 	td->td_frame->tf_rip = addr;
2046 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
2047 	return (0);
2048 }
2049 
2050 int
2051 ptrace_single_step(struct thread *td)
2052 {
2053 
2054 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2055 	if ((td->td_frame->tf_rflags & PSL_T) == 0) {
2056 		td->td_frame->tf_rflags |= PSL_T;
2057 		td->td_dbgflags |= TDB_STEP;
2058 	}
2059 	return (0);
2060 }
2061 
2062 int
2063 ptrace_clear_single_step(struct thread *td)
2064 {
2065 
2066 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2067 	td->td_frame->tf_rflags &= ~PSL_T;
2068 	td->td_dbgflags &= ~TDB_STEP;
2069 	return (0);
2070 }
2071 
2072 int
2073 fill_regs(struct thread *td, struct reg *regs)
2074 {
2075 	struct trapframe *tp;
2076 
2077 	tp = td->td_frame;
2078 	return (fill_frame_regs(tp, regs));
2079 }
2080 
2081 int
2082 fill_frame_regs(struct trapframe *tp, struct reg *regs)
2083 {
2084 
2085 	regs->r_r15 = tp->tf_r15;
2086 	regs->r_r14 = tp->tf_r14;
2087 	regs->r_r13 = tp->tf_r13;
2088 	regs->r_r12 = tp->tf_r12;
2089 	regs->r_r11 = tp->tf_r11;
2090 	regs->r_r10 = tp->tf_r10;
2091 	regs->r_r9  = tp->tf_r9;
2092 	regs->r_r8  = tp->tf_r8;
2093 	regs->r_rdi = tp->tf_rdi;
2094 	regs->r_rsi = tp->tf_rsi;
2095 	regs->r_rbp = tp->tf_rbp;
2096 	regs->r_rbx = tp->tf_rbx;
2097 	regs->r_rdx = tp->tf_rdx;
2098 	regs->r_rcx = tp->tf_rcx;
2099 	regs->r_rax = tp->tf_rax;
2100 	regs->r_rip = tp->tf_rip;
2101 	regs->r_cs = tp->tf_cs;
2102 	regs->r_rflags = tp->tf_rflags;
2103 	regs->r_rsp = tp->tf_rsp;
2104 	regs->r_ss = tp->tf_ss;
2105 	if (tp->tf_flags & TF_HASSEGS) {
2106 		regs->r_ds = tp->tf_ds;
2107 		regs->r_es = tp->tf_es;
2108 		regs->r_fs = tp->tf_fs;
2109 		regs->r_gs = tp->tf_gs;
2110 	} else {
2111 		regs->r_ds = 0;
2112 		regs->r_es = 0;
2113 		regs->r_fs = 0;
2114 		regs->r_gs = 0;
2115 	}
2116 	regs->r_err = 0;
2117 	regs->r_trapno = 0;
2118 	return (0);
2119 }
2120 
2121 int
2122 set_regs(struct thread *td, struct reg *regs)
2123 {
2124 	struct trapframe *tp;
2125 	register_t rflags;
2126 
2127 	tp = td->td_frame;
2128 	rflags = regs->r_rflags & 0xffffffff;
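	/*
	 * Refuse rflags and %cs values that would elevate privilege,
	 * e.g. an IOPL change or a non-user code selector.
	 */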
2129 	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
2130 		return (EINVAL);
2131 	tp->tf_r15 = regs->r_r15;
2132 	tp->tf_r14 = regs->r_r14;
2133 	tp->tf_r13 = regs->r_r13;
2134 	tp->tf_r12 = regs->r_r12;
2135 	tp->tf_r11 = regs->r_r11;
2136 	tp->tf_r10 = regs->r_r10;
2137 	tp->tf_r9  = regs->r_r9;
2138 	tp->tf_r8  = regs->r_r8;
2139 	tp->tf_rdi = regs->r_rdi;
2140 	tp->tf_rsi = regs->r_rsi;
2141 	tp->tf_rbp = regs->r_rbp;
2142 	tp->tf_rbx = regs->r_rbx;
2143 	tp->tf_rdx = regs->r_rdx;
2144 	tp->tf_rcx = regs->r_rcx;
2145 	tp->tf_rax = regs->r_rax;
2146 	tp->tf_rip = regs->r_rip;
2147 	tp->tf_cs = regs->r_cs;
2148 	tp->tf_rflags = rflags;
2149 	tp->tf_rsp = regs->r_rsp;
2150 	tp->tf_ss = regs->r_ss;
2151 	if (0) {	/* XXXKIB */
2152 		tp->tf_ds = regs->r_ds;
2153 		tp->tf_es = regs->r_es;
2154 		tp->tf_fs = regs->r_fs;
2155 		tp->tf_gs = regs->r_gs;
2156 		tp->tf_flags = TF_HASSEGS;
2157 	}
2158 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
2159 	return (0);
2160 }
2161 
2162 /* XXX check all this stuff! */
2163 /* externalize from sv_xmm */
2164 static void
2165 fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
2166 {
2167 	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2168 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
2169 	int i;
2170 
2171 	/* pcb -> fpregs */
2172 	bzero(fpregs, sizeof(*fpregs));
2173 
2174 	/* FPU control/status */
2175 	penv_fpreg->en_cw = penv_xmm->en_cw;
2176 	penv_fpreg->en_sw = penv_xmm->en_sw;
2177 	penv_fpreg->en_tw = penv_xmm->en_tw;
2178 	penv_fpreg->en_opcode = penv_xmm->en_opcode;
2179 	penv_fpreg->en_rip = penv_xmm->en_rip;
2180 	penv_fpreg->en_rdp = penv_xmm->en_rdp;
2181 	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
2182 	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
2183 
2184 	/* FPU registers */
2185 	for (i = 0; i < 8; ++i)
2186 		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
2187 
2188 	/* SSE registers */
2189 	for (i = 0; i < 16; ++i)
2190 		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
2191 }
2192 
2193 /* internalize from fpregs into sv_xmm */
2194 static void
2195 set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
2196 {
2197 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
2198 	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2199 	int i;
2200 
2201 	/* fpregs -> pcb */
2202 	/* FPU control/status */
2203 	penv_xmm->en_cw = penv_fpreg->en_cw;
2204 	penv_xmm->en_sw = penv_fpreg->en_sw;
2205 	penv_xmm->en_tw = penv_fpreg->en_tw;
2206 	penv_xmm->en_opcode = penv_fpreg->en_opcode;
2207 	penv_xmm->en_rip = penv_fpreg->en_rip;
2208 	penv_xmm->en_rdp = penv_fpreg->en_rdp;
2209 	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
2210 	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
2211 
2212 	/* FPU registers */
2213 	for (i = 0; i < 8; ++i)
2214 		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
2215 
2216 	/* SSE registers */
2217 	for (i = 0; i < 16; ++i)
2218 		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
2219 }
2220 
2221 /* externalize from td->pcb */
2222 int
2223 fill_fpregs(struct thread *td, struct fpreg *fpregs)
2224 {
2225 
2226 	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
2227 	    P_SHOULDSTOP(td->td_proc),
2228 	    ("not suspended thread %p", td));
2229 	fpugetregs(td);
2230 	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
2231 	return (0);
2232 }
2233 
2234 /* internalize to td->pcb */
2235 int
2236 set_fpregs(struct thread *td, struct fpreg *fpregs)
2237 {
2238 
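	/* Block preemption while the user FPU save area is rewritten. */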
2239 	critical_enter();
2240 	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
2241 	fpuuserinited(td);
2242 	critical_exit();
2243 	return (0);
2244 }
2245 
2246 /*
2247  * Get machine context.
2248  */
2249 int
2250 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
2251 {
2252 	struct pcb *pcb;
2253 	struct trapframe *tp;
2254 
2255 	pcb = td->td_pcb;
2256 	tp = td->td_frame;
2257 	PROC_LOCK(curthread->td_proc);
2258 	mcp->mc_onstack = sigonstack(tp->tf_rsp);
2259 	PROC_UNLOCK(curthread->td_proc);
2260 	mcp->mc_r15 = tp->tf_r15;
2261 	mcp->mc_r14 = tp->tf_r14;
2262 	mcp->mc_r13 = tp->tf_r13;
2263 	mcp->mc_r12 = tp->tf_r12;
2264 	mcp->mc_r11 = tp->tf_r11;
2265 	mcp->mc_r10 = tp->tf_r10;
2266 	mcp->mc_r9  = tp->tf_r9;
2267 	mcp->mc_r8  = tp->tf_r8;
2268 	mcp->mc_rdi = tp->tf_rdi;
2269 	mcp->mc_rsi = tp->tf_rsi;
2270 	mcp->mc_rbp = tp->tf_rbp;
2271 	mcp->mc_rbx = tp->tf_rbx;
2272 	mcp->mc_rcx = tp->tf_rcx;
2273 	mcp->mc_rflags = tp->tf_rflags;
2274 	if (flags & GET_MC_CLEAR_RET) {
2275 		mcp->mc_rax = 0;
2276 		mcp->mc_rdx = 0;
2277 		mcp->mc_rflags &= ~PSL_C;
2278 	} else {
2279 		mcp->mc_rax = tp->tf_rax;
2280 		mcp->mc_rdx = tp->tf_rdx;
2281 	}
2282 	mcp->mc_rip = tp->tf_rip;
2283 	mcp->mc_cs = tp->tf_cs;
2284 	mcp->mc_rsp = tp->tf_rsp;
2285 	mcp->mc_ss = tp->tf_ss;
2286 	mcp->mc_ds = tp->tf_ds;
2287 	mcp->mc_es = tp->tf_es;
2288 	mcp->mc_fs = tp->tf_fs;
2289 	mcp->mc_gs = tp->tf_gs;
2290 	mcp->mc_flags = tp->tf_flags;
2291 	mcp->mc_len = sizeof(*mcp);
2292 	get_fpcontext(td, mcp, NULL, 0);
2293 	update_pcb_bases(pcb);
2294 	mcp->mc_fsbase = pcb->pcb_fsbase;
2295 	mcp->mc_gsbase = pcb->pcb_gsbase;
2296 	mcp->mc_xfpustate = 0;
2297 	mcp->mc_xfpustate_len = 0;
2298 	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
2299 	return (0);
2300 }
2301 
2302 /*
2303  * Set machine context.
2304  *
2305  * Note that only the user-modifiable flags are set, and the %cs
2306  * selector is not touched.
2307  */
2308 int
2309 set_mcontext(struct thread *td, mcontext_t *mcp)
2310 {
2311 	struct pcb *pcb;
2312 	struct trapframe *tp;
2313 	char *xfpustate;
2314 	long rflags;
2315 	int ret;
2316 
2317 	pcb = td->td_pcb;
2318 	tp = td->td_frame;
2319 	if (mcp->mc_len != sizeof(*mcp) ||
2320 	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
2321 		return (EINVAL);
2322 	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
2323 	    (tp->tf_rflags & ~PSL_USERCHANGE);
2324 	if (mcp->mc_flags & _MC_HASFPXSTATE) {
2325 		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
2326 		    sizeof(struct savefpu))
2327 			return (EINVAL);
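		/* The length was bounded above, so the on-stack buffer is safe. */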
2328 		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
2329 		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
2330 		    mcp->mc_xfpustate_len);
2331 		if (ret != 0)
2332 			return (ret);
2333 	} else
2334 		xfpustate = NULL;
2335 	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
2336 	if (ret != 0)
2337 		return (ret);
2338 	tp->tf_r15 = mcp->mc_r15;
2339 	tp->tf_r14 = mcp->mc_r14;
2340 	tp->tf_r13 = mcp->mc_r13;
2341 	tp->tf_r12 = mcp->mc_r12;
2342 	tp->tf_r11 = mcp->mc_r11;
2343 	tp->tf_r10 = mcp->mc_r10;
2344 	tp->tf_r9  = mcp->mc_r9;
2345 	tp->tf_r8  = mcp->mc_r8;
2346 	tp->tf_rdi = mcp->mc_rdi;
2347 	tp->tf_rsi = mcp->mc_rsi;
2348 	tp->tf_rbp = mcp->mc_rbp;
2349 	tp->tf_rbx = mcp->mc_rbx;
2350 	tp->tf_rdx = mcp->mc_rdx;
2351 	tp->tf_rcx = mcp->mc_rcx;
2352 	tp->tf_rax = mcp->mc_rax;
2353 	tp->tf_rip = mcp->mc_rip;
2354 	tp->tf_rflags = rflags;
2355 	tp->tf_rsp = mcp->mc_rsp;
2356 	tp->tf_ss = mcp->mc_ss;
2357 	tp->tf_flags = mcp->mc_flags;
2358 	if (tp->tf_flags & TF_HASSEGS) {
2359 		tp->tf_ds = mcp->mc_ds;
2360 		tp->tf_es = mcp->mc_es;
2361 		tp->tf_fs = mcp->mc_fs;
2362 		tp->tf_gs = mcp->mc_gs;
2363 	}
2364 	set_pcb_flags(pcb, PCB_FULL_IRET);
2365 	if (mcp->mc_flags & _MC_HASBASES) {
2366 		pcb->pcb_fsbase = mcp->mc_fsbase;
2367 		pcb->pcb_gsbase = mcp->mc_gsbase;
2368 	}
2369 	return (0);
2370 }
2371 
2372 static void
2373 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
2374     size_t xfpusave_len)
2375 {
2376 	size_t max_len, len;
2377 
2378 	mcp->mc_ownedfp = fpugetregs(td);
2379 	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
2380 	    sizeof(mcp->mc_fpstate));
2381 	mcp->mc_fpformat = fpuformat();
2382 	if (!use_xsave || xfpusave_len == 0)
2383 		return;
2384 	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
2385 	len = xfpusave_len;
2386 	if (len > max_len) {
2387 		bzero(xfpusave + max_len, len - max_len); /* zero unfilled tail */
2388 		len = max_len;
2389 	}
2390 	mcp->mc_flags |= _MC_HASFPXSTATE;
2391 	mcp->mc_xfpustate_len = len;
2392 	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
2393 }
2394 
2395 static int
2396 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
2397     size_t xfpustate_len)
2398 {
2399 	int error;
2400 
2401 	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
2402 		return (0);
2403 	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
2404 		return (EINVAL);
2405 	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
2406 		/* We don't care what state is left in the FPU or PCB. */
2407 		fpstate_drop(td);
2408 		error = 0;
2409 	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
2410 	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
2411 		error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
2412 		    xfpustate, xfpustate_len);
2413 	} else
2414 		return (EINVAL);
2415 	return (error);
2416 }
2417 
2418 void
2419 fpstate_drop(struct thread *td)
2420 {
2421 
2422 	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
2423 	critical_enter();
2424 	if (PCPU_GET(fpcurthread) == td)
2425 		fpudrop();
2426 	/*
2427 	 * XXX force a full drop of the fpu.  The above only drops it if we
2428 	 * owned it.
2429 	 *
2430 	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
2431 	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
2432 	 * We only need to drop to !PCB_INITDONE in sendsig().  But
2433 	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
2434 	 * have too many layers.
2435 	 */
2436 	clear_pcb_flags(curthread->td_pcb,
2437 	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
2438 	critical_exit();
2439 }
2440 
2441 int
2442 fill_dbregs(struct thread *td, struct dbreg *dbregs)
2443 {
2444 	struct pcb *pcb;
2445 
2446 	if (td == NULL) {
2447 		dbregs->dr[0] = rdr0();
2448 		dbregs->dr[1] = rdr1();
2449 		dbregs->dr[2] = rdr2();
2450 		dbregs->dr[3] = rdr3();
2451 		dbregs->dr[6] = rdr6();
2452 		dbregs->dr[7] = rdr7();
2453 	} else {
2454 		pcb = td->td_pcb;
2455 		dbregs->dr[0] = pcb->pcb_dr0;
2456 		dbregs->dr[1] = pcb->pcb_dr1;
2457 		dbregs->dr[2] = pcb->pcb_dr2;
2458 		dbregs->dr[3] = pcb->pcb_dr3;
2459 		dbregs->dr[6] = pcb->pcb_dr6;
2460 		dbregs->dr[7] = pcb->pcb_dr7;
2461 	}
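	/*
	 * dr4 and dr5 are reserved (they alias dr6/dr7 when CR4.DE is
	 * clear) and dr8-dr15 do not exist; report them as zero.
	 */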
2462 	dbregs->dr[4] = 0;
2463 	dbregs->dr[5] = 0;
2464 	dbregs->dr[8] = 0;
2465 	dbregs->dr[9] = 0;
2466 	dbregs->dr[10] = 0;
2467 	dbregs->dr[11] = 0;
2468 	dbregs->dr[12] = 0;
2469 	dbregs->dr[13] = 0;
2470 	dbregs->dr[14] = 0;
2471 	dbregs->dr[15] = 0;
2472 	return (0);
2473 }
2474 
2475 int
2476 set_dbregs(struct thread *td, struct dbreg *dbregs)
2477 {
2478 	struct pcb *pcb;
2479 	int i;
2480 
2481 	if (td == NULL) {
2482 		load_dr0(dbregs->dr[0]);
2483 		load_dr1(dbregs->dr[1]);
2484 		load_dr2(dbregs->dr[2]);
2485 		load_dr3(dbregs->dr[3]);
2486 		load_dr6(dbregs->dr[6]);
2487 		load_dr7(dbregs->dr[7]);
2488 	} else {
2489 		/*
2490 		 * Don't let an illegal value for dr7 get set.  Specifically,
2491 		 * check for undefined settings.  Setting these bit patterns
2492 		 * results in undefined behaviour and can lead to an unexpected
2493 		 * TRCTRAP or a general protection fault right here.
2494 		 * Upper bits of dr6 and dr7 must not be set.
2495 		 */
2496 		for (i = 0; i < 4; i++) {
2497 			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
2498 				return (EINVAL);
2499 			if (td->td_frame->tf_cs == _ucode32sel &&
2500 			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
2501 				return (EINVAL);
2502 		}
2503 		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
2504 		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
2505 			return (EINVAL);
2506 
2507 		pcb = td->td_pcb;
2508 
2509 		/*
2510 		 * Don't let a process set a breakpoint that is not within the
2511 		 * process's address space.  If a process could do this, it
2512 		 * could halt the system by setting a breakpoint in the kernel
2513 		 * (if ddb was enabled).  Thus, we need to check to make sure
2514 		 * that no breakpoints are being enabled for addresses outside
2515 		 * the process's address space.
2516 		 *
2517 		 * XXX - what about when the watched area of the user's
2518 		 * address space is written into from within the kernel
2519 		 * ... wouldn't that still cause a breakpoint to be generated
2520 		 * from within kernel mode?
2521 		 */
2522 
2523 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
2524 			/* dr0 is enabled */
2525 			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
2526 				return (EINVAL);
2527 		}
2528 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
2529 			/* dr1 is enabled */
2530 			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
2531 				return (EINVAL);
2532 		}
2533 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
2534 			/* dr2 is enabled */
2535 			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
2536 				return (EINVAL);
2537 		}
2538 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
2539 			/* dr3 is enabled */
2540 			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
2541 				return (EINVAL);
2542 		}
2543 
2544 		pcb->pcb_dr0 = dbregs->dr[0];
2545 		pcb->pcb_dr1 = dbregs->dr[1];
2546 		pcb->pcb_dr2 = dbregs->dr[2];
2547 		pcb->pcb_dr3 = dbregs->dr[3];
2548 		pcb->pcb_dr6 = dbregs->dr[6];
2549 		pcb->pcb_dr7 = dbregs->dr[7];
2550 
2551 		set_pcb_flags(pcb, PCB_DBREGS);
2552 	}
2553 
2554 	return (0);
2555 }
2556 
2557 void
2558 reset_dbregs(void)
2559 {
2560 
2561 	load_dr7(0);	/* Turn off the control bits first */
2562 	load_dr0(0);
2563 	load_dr1(0);
2564 	load_dr2(0);
2565 	load_dr3(0);
2566 	load_dr6(0);
2567 }
2568 
2569 /*
2570  * Return > 0 if a hardware breakpoint has been hit and the
2571  * breakpoint was in user space; return 0 otherwise.
2572  */
2573 int
2574 user_dbreg_trap(register_t dr6)
2575 {
2576 	uint64_t dr7;
2577 	uint64_t bp;	/* breakpoint bits extracted from dr6 */
2578 	int nbp;	/* number of breakpoints that triggered */
2579 	caddr_t addr[4];	/* breakpoint addresses */
2580 	int i;
2581 
2582 	bp = dr6 & DBREG_DR6_BMASK;
2583 	if (bp == 0) {
2584 		/*
2585 		 * None of the breakpoint bits are set, meaning this
2586 		 * trap was not caused by any of the debug registers.
2587 		 */
2588 		return (0);
2589 	}
2590 
2591 	dr7 = rdr7();
2592 	if ((dr7 & 0x000000ff) == 0) {
2593 		/*
2594 		 * None of the local or global enable bits in dr7 are
2595 		 * set, thus the trap couldn't have been caused by the
2596 		 * hardware debug registers.
2597 		 */
2598 		return (0);
2599 	}
2600 
2601 	nbp = 0;
2602 
2603 	/*
2604 	 * At least one of the breakpoints was hit; check to see
2605 	 * which ones and whether any of them are user space addresses.
2606 	 */
2607 
2608 	if (bp & 0x01) {
2609 		addr[nbp++] = (caddr_t)rdr0();
2610 	}
2611 	if (bp & 0x02) {
2612 		addr[nbp++] = (caddr_t)rdr1();
2613 	}
2614 	if (bp & 0x04) {
2615 		addr[nbp++] = (caddr_t)rdr2();
2616 	}
2617 	if (bp & 0x08) {
2618 		addr[nbp++] = (caddr_t)rdr3();
2619 	}
2620 
2621 	for (i = 0; i < nbp; i++) {
2622 		if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
2623 			/*
2624 			 * addr[i] is in user space.
2625 			 */
2626 			return (nbp);
2627 		}
2628 	}
2629 
2630 	/*
2631 	 * None of the breakpoints are in user space.
2632 	 */
2633 	return (0);
2634 }
2635 
2636 /*
2637  * The pcb_flags field is only modified by the current thread, or by
2638  * other threads when the current thread is stopped.  However, the current
2639  * thread may change it from interrupt context in cpu_switch(), or in the
2640  * trap handler.  When we read-modify-write pcb_flags from C sources, the
2641  * compiler may generate code that is not atomic with respect to interrupts.
2642  * If a trap or interrupt happens and any flag is modified from the handler,
2643  * it can be clobbered with the cached value later.  Therefore, we implement
2644  * setting and clearing flags with single-instruction functions, which do
2645  * not race with modification of the flags from trap or interrupt context,
2646  * because traps and interrupts are executed only on instruction boundaries.
2647  */
2648 void
2649 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
2650 {
2651 
2652 	__asm __volatile("orl %1,%0"
2653 	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
2654 	    : "cc", "memory");
2655 
2656 }
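/*
 * For contrast, a plain C read-modify-write such as
 *
 *	pcb->pcb_flags |= flags;
 *
 * may compile into separate load, or, and store instructions; a trap or
 * interrupt taken between the load and the store could update pcb_flags
 * and then be overwritten by the stale store.  The single "orl" above
 * cannot be split that way.
 */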
2657 
2658 /*
2659  * Support for the RDFSBASE, WRFSBASE and similar instructions for the
2660  * %gs base requires that the kernel save MSR_FSBASE and MSR_{K,}GSBASE
2661  * into the pcb if user space modified the bases.  We must save them on
2662  * the context switch or if the return to usermode happens through doreti.
2663  *
2664  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
2665  * which has the consequence that the base MSRs must be saved each time
2666  * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
2667  * context switches.
2668  */
2669 static void
2670 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
2671 {
2672 	register_t r;
2673 
2674 	if (curpcb == pcb &&
2675 	    (flags & PCB_FULL_IRET) != 0 &&
2676 	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
2677 		r = intr_disable();
2678 		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
2679 			if (rfs() == _ufssel)
2680 				pcb->pcb_fsbase = rdfsbase();
2681 			if (rgs() == _ugssel)
2682 				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
2683 		}
2684 		set_pcb_flags_raw(pcb, flags);
2685 		intr_restore(r);
2686 	} else {
2687 		set_pcb_flags_raw(pcb, flags);
2688 	}
2689 }
2690 
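/*
 * The resolver below runs once during early boot and binds set_pcb_flags()
 * to the FSGSBASE-aware variant only when the CPU advertises that feature.
 */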
2691 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
2692 {
2693 
2694 	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
2695 	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
2696 }
2697 
2698 void
2699 clear_pcb_flags(struct pcb *pcb, const u_int flags)
2700 {
2701 
2702 	__asm __volatile("andl %1,%0"
2703 	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
2704 	    : "cc", "memory");
2705 }
2706 
2707 #ifdef KDB
2708 
2709 /*
2710  * Provide inb() and outb() as functions.  They are normally only available as
2711  * inline functions and thus cannot be called from the debugger.
2712  */
2713 
2714 /* silence compiler warnings */
2715 u_char inb_(u_short);
2716 void outb_(u_short, u_char);
2717 
2718 u_char
2719 inb_(u_short port)
2720 {
2721 	return inb(port);
2722 }
2723 
2724 void
2725 outb_(u_short port, u_char data)
2726 {
2727 	outb(port, data);
2728 }
2729 
2730 #endif /* KDB */
2731 
2732 #undef memset
2733 #undef memmove
2734 #undef memcpy
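/*
 * These names may be macros (e.g. sanitizer interposers); drop any such
 * definitions so the real symbols can be provided below.
 */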
2735 
2736 void	*memset_std(void *buf, int c, size_t len);
2737 void	*memset_erms(void *buf, int c, size_t len);
2738 void	*memmove_std(void * _Nonnull dst, const void * _Nonnull src,
2739 	    size_t len);
2740 void	*memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
2741 	    size_t len);
2742 void	*memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
2743 	    size_t len);
2744 void	*memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
2745 	    size_t len);
2746 
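/*
 * On CPUs advertising ERMS (Enhanced REP MOVSB/STOSB), the rep-string
 * variants are the preferred bulk copy and zero primitives, so the
 * ifuncs below select between the _std and _erms implementations at boot.
 */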
2747 #ifdef KCSAN
2748 /*
2749  * These fail to build as ifuncs when used with KCSAN.
2750  */
2751 void *
2752 memset(void *buf, int c, size_t len)
2753 {
2754 
2755 	return (memset_std(buf, c, len));
2756 }
2757 
2758 void *
2759 memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
2760 {
2761 
2762 	return (memmove_std(dst, src, len));
2763 }
2764 
2765 void *
2766 memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
2767 {
2768 
2769 	return (memcpy_std(dst, src, len));
2770 }
2771 #else
2772 DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
2773 {
2774 
2775 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2776 	    memset_erms : memset_std);
2777 }
2778 
2779 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
2780     size_t))
2781 {
2782 
2783 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2784 	    memmove_erms : memmove_std);
2785 }
2786 
2787 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull, size_t))
2788 {
2789 
2790 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2791 	    memcpy_erms : memcpy_std);
2792 }
2793 #endif
2794 
2795 void	pagezero_std(void *addr);
2796 void	pagezero_erms(void *addr);
2797 DEFINE_IFUNC(, void, pagezero, (void *))
2798 {
2799 
2800 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2801 	    pagezero_erms : pagezero_std);
2802 }
2803