/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_atpic.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_pci.h"
#include "opt_platform.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/csan.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>
#include <vm/vm_phys.h>

#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif

#include <net/netisr.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/frame.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#include <machine/trap.h>
#include <machine/tss.h>
#include <x86/ucode.h>
#include <x86/ifunc.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef FDT
#include <x86/fdt.h>
#endif

#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <x86/apicvar.h>
#endif

#include <isa/isareg.h>
#include <isa/rtc.h>
#include <x86/init.h>

/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);

/*
 * The PTI trampoline stack needs enough space for a hardware trapframe and a
 * couple of scratch registers, as well as the trapframe left behind after an
 * iret fault.
 */
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
    offsetof(struct pti_frame, pti_rip));

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

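/*
 * Checks applied by sigreturn(): CS_SECURE() verifies that a
 * user-supplied %cs selector runs at user privilege, and EFL_SECURE()
 * verifies that only the user-changeable rflags bits differ from the
 * current trapframe value.
 */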
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)

static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpusave, size_t xfpusave_len);
static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpustate, size_t xfpustate_len);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/* Default init_ops implementation. */
struct init_ops init_ops = {
	.parse_preload_data =	native_parse_preload_data,
	.early_clock_source_init =	i8254_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
#ifdef SMP
	.mp_bootaddress =		mp_bootaddress,
	.start_all_aps =		native_start_all_aps,
#endif
#ifdef DEV_PCI
	.msi_init =			msi_init,
#endif
};

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;

/* Intel ICH registers */
#define ICH_PMBASE	0x400
#define ICH_SMI_EN	(ICH_PMBASE + 0x30)	/* SMI control and enable */

int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

int cold = 1;

long Maxmem = 0;
long realmem = 0;

struct kva_md_info kmi;

static struct trapframe proc0_tf;
struct region_descriptor r_idt;

struct pcpu *__pcpu;
struct pcpu temp_bsp_pcpu;

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

void (*vmm_resume_p)(void);

static void
cpu_startup(void *dummy)
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to prevent the legacy USB circuit from
	 * generating an SMI#, which can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by clearing a bit in the SMI_EN (SMI Control and
	 * Enable) register of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			/* Clear the LEGACY_USB_EN bit (bit 3). */
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();

	/*
	 * Display physical memory if SMBIOS reports a reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	if (memsize < ptoa((uintmax_t)vm_free_count()))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_free_count()),
	    ptoa((uintmax_t)vm_free_count()) / 1048576);
#ifdef DEV_PCI
	if (bootverbose && intel_graphics_stolen_base != 0)
		printf("intel stolen mem: base %#jx size %ju MB\n",
		    (uintmax_t)intel_graphics_stolen_base,
		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
#endif

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

/*
 * Send an interrupt to a process.
 *
 * The stack is set up so that the sigcode stored at its top calls the
 * signal handler, followed by a call to the sigreturn routine below.
 * After sigreturn resets the signal mask, the stack, and the frame
 * pointer, it returns to the user-specified pc and psl.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

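	/*
	 * If the CPU supports XSAVE and the extended state does not fit
	 * in the static FPU save area, stage it in an on-stack buffer;
	 * it is copied out to the user stack separately below.
	 */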
	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	update_pcb_bases(pcb);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else {
		/* Skip the 128-byte red zone mandated by the amd64 ABI. */
		sp = (char *)regs->tf_rsp - 128;
	}
	if (xfpusave != NULL) {
		sp -= xfpusave_len;
		/* The extended FPU state must be 64-byte aligned. */
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}

/*
 * System call to clean up state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
int
sys_sigreturn(struct thread *td, struct sigreturn_args *uap)
{
	ucontext_t uc;
	struct pcb *pcb;
	struct proc *p;
	struct trapframe *regs;
	ucontext_t *ucp;
	char *xfpustate;
	size_t xfpustate_len;
	long rflags;
	int cs, error, ret;
	ksiginfo_t ksi;

	pcb = td->td_pcb;
	p = td->td_proc;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0) {
		uprintf("pid %d (%s): sigreturn copyin failed\n",
		    p->p_pid, td->td_name);
		return (error);
	}
	ucp = &uc;
	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
		    td->td_name, ucp->uc_mcontext.mc_flags);
		return (EINVAL);
	}
	regs = td->td_frame;
	rflags = ucp->uc_mcontext.mc_rflags;
	/*
	 * Don't allow users to change privileged or reserved flags.
	 */
	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
		    td->td_name, rflags);
		return (EINVAL);
	}

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %rip's and invalid %rsp's.
	 */
	cs = ucp->uc_mcontext.mc_cs;
	if (!CS_SECURE(cs)) {
		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
		    td->td_name, cs);
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_rip;
		trapsignal(td, &ksi);
		return (EINVAL);
	}

	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
		if (xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu)) {
			uprintf(
			    "pid %d (%s): sigreturn xfpustate_len = 0x%zx\n",
			    p->p_pid, td->td_name, xfpustate_len);
			return (EINVAL);
		}
		xfpustate = __builtin_alloca(xfpustate_len);
		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
		    xfpustate, xfpustate_len);
		if (error != 0) {
			uprintf(
	"pid %d (%s): sigreturn copying xfpustate failed\n",
			    p->p_pid, td->td_name);
			return (error);
		}
	} else {
		xfpustate = NULL;
		xfpustate_len = 0;
	}
	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
	if (ret != 0) {
		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
		    p->p_pid, td->td_name, ret);
		return (ret);
	}
	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
	update_pcb_bases(pcb);
	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;

#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
	return (EJUSTRETURN);
}

#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{

	return (sys_sigreturn(td, (struct sigreturn_args *)uap));
}
#endif

/*
 * Reset registers to default values on exec.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack)
{
	struct trapframe *regs;
	struct pcb *pcb;
	register_t saved_rflags;

	regs = td->td_frame;
	pcb = td->td_pcb;

	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);

	update_pcb_bases(pcb);
	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	clear_pcb_flags(pcb, PCB_32BIT);
	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;

	saved_rflags = regs->tf_rflags & PSL_T;
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = imgp->entry_addr;
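	/*
	 * Enter the image with %rsp == 8 (mod 16), i.e. as if it had
	 * been reached by a call with the stack 16-byte aligned.
	 */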
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | saved_rflags;
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == curpcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		clear_pcb_flags(pcb, PCB_DBREGS);
	}

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set during FPU
	 * initialization for the BSP.  See the comments in fpu.c
	 * about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

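/* Dedicated kernel stacks for #DF, #MC, NMI and #DB, used via the TSS IST. */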
static char dblfault_stack[PAGE_SIZE] __aligned(16);
static char mce0_stack[PAGE_SIZE] __aligned(16);
static char nmi0_stack[PAGE_SIZE] __aligned(16);
static char dbg0_stack[PAGE_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* 10 The TSS is a double-size system descriptor; this is its upper half */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");

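/*
 * Install an interrupt gate in the IDT: 'func' is the handler entry
 * point, 'typ' the gate type (e.g. SDT_SYSIGT), 'dpl' the descriptor
 * privilege level (which controls whether user mode may invoke the
 * gate from software), and 'ist' the interrupt stack table slot to
 * switch to (0 selects the normal kernel stack).
 */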
void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func) >> 16;
}

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif

void
sdtossd(struct user_segment_descriptor *sd,
    struct soft_segment_descriptor *ssd)
{

	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_long  = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

void
ssdtosd(struct soft_segment_descriptor *ssd,
    struct user_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;
}

void
ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
}

#if !defined(DEV_ATPIC) && defined(DEV_ISA)
#include <isa/isavar.h>
#include <isa/isareg.h>
/*
 * Return a bitmap of the current interrupt requests.  This is 8259-specific
 * and is only suitable for use at probe time.
 * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
 * It shouldn't be here.  There should probably be an APIC-centric
 * implementation in the apic driver code, if at all.
 */
intrmask_t
isa_irq_pending(void)
{
	u_char irr1;
	u_char irr2;

	irr1 = inb(IO_ICU1);
	irr2 = inb(IO_ICU2);
	return ((irr2 << 8) | irr1);
}
#endif

u_int basemem;

static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
    vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
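	/* Round the header up to 16 bytes; the descriptor array follows. */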
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}

static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");

static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * i.e., a u_int32_t immediately precedes the smap data.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
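	/*
	 * parse_memmap leaves physmap_idx at the next free slot; step
	 * back so it indexes the last valid (base, bound) pair.
	 */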
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it\n");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
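	/* The MAXMEM option is specified in kilobytes; convert to pages. */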
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * Make hole for "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and the APs for the system
	 * start in real mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress)
		init_ops.mp_bootaddress(physmap, &physmap_idx);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,
			 * non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one past the last byte,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we are at or past the end, bump up
			 * Maxmem so that we keep going.  The first bad
			 * page will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ENTRIES) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == PHYS_AVAIL_ENTRIES) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}

static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

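	/* The loader hands us physical addresses; relocate into KVA. */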
	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/* Set up the fast syscall stuff */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
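	/*
	 * SYSCALL loads the kernel %cs/%ss from bits 47:32 of MSR_STAR,
	 * while SYSRET derives the user selectors from bits 63:48.
	 * MSR_SF_MASK lists the rflags bits to clear on kernel entry.
	 */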
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}

void
amd64_bsp_pcpu_init1(struct pcpu *pc)
{
	struct user_segment_descriptor *gdt;

	PCPU_SET(prvspace, pc);
	gdt = *PCPU_PTR(gdt);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(tssp, PCPU_PTR(common_tss));
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
}

void
amd64_bsp_pcpu_init2(uint64_t rsp0)
{

	PCPU_SET(rsp0, rsp0);
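	/* Point pti_rsp0 at the 16-byte aligned top of the PTI stack. */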
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);
}

void
amd64_bsp_ist_init(struct pcpu *pc)
{
	struct nmi_pcpu *np;
	struct amd64tss *tssp;

	tssp = &pc->pc_common_tss;

	/* doublefault stack space, runs on ist1 */
	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist1 = (long)np;

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist2 = (long)np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist3 = (long)np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist4 = (long)np;
}

u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	struct xstate_hdr *xhdr;
	u_int64_t rsp0;
	char *env;
	struct user_segment_descriptor *gdt;
	struct region_descriptor r_gdt;
	size_t kstack0_sz;
	int late_console;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	kmdp = init_ops.parse_preload_data(modulep);

	physfree += ucode_load_bsp(physfree + KERNBASE);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	identify_cpu_fixup_bsp();
	identify_cpu2();
	initializecpucache();

	/*
	 * Check for pti, pcid, and invpcid before ifuncs are
	 * resolved, to correctly select the implementation for
	 * pmap_activate_sw_mode().
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		invpcid_works = (cpu_stdext_feature &
		    CPUID_STDEXT_INVPCID) != 0;
	} else {
		pmap_pcid_enabled = 0;
	}

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	thread0.td_kstack = physfree + KERNBASE;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * Initialize enough of thread0 for delayed invalidation to
	 * work very early.  Rely on thread0.td_base_pri
	 * zero-initialization, it is reset to PVM at proc0_init().
	 */
	pmap_thread_init_invl_gen(&thread0);

	pc = &temp_bsp_pcpu;
	pcpu_init(pc, 0, sizeof(struct pcpu));
	gdt = &temp_bsp_pcpu.pc_gdt[0];

	/*
	 * Make gdt memory segments.  The TSS and LDT slots hold
	 * double-size system descriptors and are installed separately.
	 */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long)gdt;
	lgdt(&r_gdt);

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	dpcpu_init((void *)(physfree + KERNBASE), 0);
	physfree += DPCPU_SIZE;
	amd64_bsp_pcpu_init1(pc);
	/* Non-late cninit() and printf() can be moved up to here. */

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
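	/*
	 * #DB, NMI, #DF and #MC run on the dedicated IST stacks (slots
	 * 4, 2, 1 and 3) wired up in amd64_bsp_ist_init() above.
	 */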
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long)idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
	    != NULL)
		vty_set_preferred(VTY_VT);

	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);

	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);

	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
	    &syscall_ret_l1d_flush_mode);

	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);

	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */

	amd64_bsp_ist_init(pc);

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
	    IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * We initialize the PCB pointer early so that exception
	 * handlers will work.  Also set up td_critnest to short-cut
	 * the page fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	set_top_of_stack_td(&thread0);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

1834 	/* Now running on new page tables, configured, and u/iom is accessible. */
1835 
1836 #ifdef DEV_PCI
1837 	/* This call might adjust phys_avail[]. */
1838 	pci_early_quirks();
1839 #endif
1840 
1841 	if (late_console)
1842 		cninit();
1843 
1844 #ifdef DEV_ISA
1845 #ifdef DEV_ATPIC
1846 	elcr_probe();
1847 	atpic_startup();
1848 #else
1849 	/* Reset and mask the atpics and leave them shut down. */
1850 	atpic_reset();
1851 
1852 	/*
1853 	 * Point the ICU spurious interrupt vectors at the APIC spurious
1854 	 * interrupt handler.
1855 	 */
1856 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1857 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1858 #endif
1859 #else
1860 #error "have you forgotten the isa device?"
1861 #endif
1862 
1863 	if (late_console)
1864 		amd64_kdb_init();
1865 
1866 	msgbufinit(msgbufp, msgbufsize);
1867 	fpuinit();
1868 
1869 	/*
1870 	 * Set up thread0 pcb save area after fpuinit calculated fpu save
1871 	 * area size.  Zero out the extended state header in fpu save
1872 	 * area.
1873 	 */
1874 	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
1875 	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
1876 	if (use_xsave) {
1877 		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
1878 		    1);
1879 		xhdr->xstate_bv = xsave_mask;
1880 	}
1881 	/* Make an initial TSS so the CPU can get an interrupt stack on syscall. */
1882 	rsp0 = thread0.td_md.md_stack_base;
1883 	/* Ensure the stack is aligned to 16 bytes */
1884 	rsp0 &= ~0xFul;
1885 	__pcpu[0].pc_common_tss.tss_rsp0 = rsp0;
1886 	amd64_bsp_pcpu_init2(rsp0);
1887 
1888 	/* transfer to user mode */
1889 
1890 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1891 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1892 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1893 	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1894 	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1895 
1896 	load_ds(_udatasel);
1897 	load_es(_udatasel);
1898 	load_fs(_ufssel);
1899 
1900 	/* Set up proc 0's pcb. */
1901 	thread0.td_pcb->pcb_flags = 0;
1902 	thread0.td_frame = &proc0_tf;
1903 
1904 	env = kern_getenv("kernelname");
1905 	if (env != NULL)
1906 		strlcpy(kernelname, env, sizeof(kernelname));
1907 
1908 	cpu_probe_amdc1e();
1909 
1910 	kcsan_cpu_init(0);
1911 
1912 #ifdef FDT
1913 	x86_init_fdt();
1914 #endif
1915 	thread0.td_critnest = 0;
1916 
1917 	TSEXIT();
1918 
1919 	/* Location of kernel stack for locore */
1920 	return (thread0.td_md.md_stack_base);
1921 }
1922 
1923 void
1924 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1925 {
1926 
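	/* Start with an invalid ACPI id; CPU enumeration overwrites it later. */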
1927 	pcpu->pc_acpi_id = 0xffffffff;
1928 }
1929 
1930 static int
1931 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1932 {
1933 	struct bios_smap *smapbase;
1934 	struct bios_smap_xattr smap;
1935 	caddr_t kmdp;
1936 	uint32_t *smapattr;
1937 	int count, error, i;
1938 
1939 	/* Retrieve the system memory map from the loader. */
1940 	kmdp = preload_search_by_type("elf kernel");
1941 	if (kmdp == NULL)
1942 		kmdp = preload_search_by_type("elf64 kernel");
1943 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
1944 	    MODINFO_METADATA | MODINFOMD_SMAP);
1945 	if (smapbase == NULL)
1946 		return (0);
1947 	smapattr = (uint32_t *)preload_search_info(kmdp,
1948 	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
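	/* preload_search_info() data is preceded by a 32-bit length word. */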
1949 	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1950 	error = 0;
1951 	for (i = 0; i < count; i++) {
1952 		smap.base = smapbase[i].base;
1953 		smap.length = smapbase[i].length;
1954 		smap.type = smapbase[i].type;
1955 		if (smapattr != NULL)
1956 			smap.xattr = smapattr[i];
1957 		else
1958 			smap.xattr = 0;
1959 		error = SYSCTL_OUT(req, &smap, sizeof(smap));
1960 	}
1961 	return (error);
1962 }
1963 SYSCTL_PROC(_machdep, OID_AUTO, smap,
1964     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1965     smap_sysctl_handler, "S,bios_smap_xattr",
1966     "Raw BIOS SMAP data");
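
/*
 * Illustrative usage: the opaque table can be dumped from userland with
 * "sysctl -b machdep.smap | hexdump -C", or read programmatically with
 * sysctlbyname("machdep.smap", buf, &len, NULL, 0).
 */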
1967 
1968 static int
1969 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1970 {
1971 	struct efi_map_header *efihdr;
1972 	caddr_t kmdp;
1973 	uint32_t efisize;
1974 
1975 	kmdp = preload_search_by_type("elf kernel");
1976 	if (kmdp == NULL)
1977 		kmdp = preload_search_by_type("elf64 kernel");
1978 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1979 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1980 	if (efihdr == NULL)
1981 		return (0);
1982 	efisize = *((uint32_t *)efihdr - 1);
1983 	return (SYSCTL_OUT(req, efihdr, efisize));
1984 }
1985 SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
1986     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1987     efi_map_sysctl_handler, "S,efi_map_header",
1988     "Raw EFI Memory Map");
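
/* Likewise, "sysctl -b machdep.efi_map" dumps the raw EFI memory map. */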
1989 
1990 void
1991 spinlock_enter(void)
1992 {
1993 	struct thread *td;
1994 	register_t flags;
1995 
1996 	td = curthread;
1997 	if (td->td_md.md_spinlock_count == 0) {
1998 		flags = intr_disable();
1999 		td->td_md.md_spinlock_count = 1;
2000 		td->td_md.md_saved_flags = flags;
2001 		critical_enter();
2002 	} else
2003 		td->td_md.md_spinlock_count++;
2004 }
2005 
2006 void
2007 spinlock_exit(void)
2008 {
2009 	struct thread *td;
2010 	register_t flags;
2011 
2012 	td = curthread;
2013 	flags = td->td_md.md_saved_flags;
2014 	td->td_md.md_spinlock_count--;
2015 	if (td->td_md.md_spinlock_count == 0) {
2016 		critical_exit();
2017 		intr_restore(flags);
2018 	}
2019 }
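
/*
 * Illustrative nesting sketch: only the outermost enter/exit pair
 * actually toggles interrupts:
 *
 *	spinlock_enter();	disables interrupts, enters critical section
 *	spinlock_enter();	only increments md_spinlock_count
 *	spinlock_exit();	only decrements md_spinlock_count
 *	spinlock_exit();	restores saved %rflags, exits critical section
 */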
2020 
2021 /*
2022  * Construct a PCB from a trapframe. This is called from kdb_trap() where
2023  * we want to start a backtrace from the function that caused us to enter
2024  * the debugger. We have the context in the trapframe, but base the trace
2025  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
2026  * enough for a backtrace.
2027  */
2028 void
2029 makectx(struct trapframe *tf, struct pcb *pcb)
2030 {
2031 
2032 	pcb->pcb_r12 = tf->tf_r12;
2033 	pcb->pcb_r13 = tf->tf_r13;
2034 	pcb->pcb_r14 = tf->tf_r14;
2035 	pcb->pcb_r15 = tf->tf_r15;
2036 	pcb->pcb_rbp = tf->tf_rbp;
2037 	pcb->pcb_rbx = tf->tf_rbx;
2038 	pcb->pcb_rip = tf->tf_rip;
2039 	pcb->pcb_rsp = tf->tf_rsp;
2040 }
2041 
2042 int
2043 ptrace_set_pc(struct thread *td, unsigned long addr)
2044 {
2045 
2046 	td->td_frame->tf_rip = addr;
2047 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
2048 	return (0);
2049 }
2050 
2051 int
2052 ptrace_single_step(struct thread *td)
2053 {
2054 
2055 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
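	/* PSL_T is the %rflags trap flag; it raises #DB after every instruction. */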
2056 	if ((td->td_frame->tf_rflags & PSL_T) == 0) {
2057 		td->td_frame->tf_rflags |= PSL_T;
2058 		td->td_dbgflags |= TDB_STEP;
2059 	}
2060 	return (0);
2061 }
2062 
2063 int
2064 ptrace_clear_single_step(struct thread *td)
2065 {
2066 
2067 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2068 	td->td_frame->tf_rflags &= ~PSL_T;
2069 	td->td_dbgflags &= ~TDB_STEP;
2070 	return (0);
2071 }
2072 
2073 int
2074 fill_regs(struct thread *td, struct reg *regs)
2075 {
2076 	struct trapframe *tp;
2077 
2078 	tp = td->td_frame;
2079 	return (fill_frame_regs(tp, regs));
2080 }
2081 
2082 int
2083 fill_frame_regs(struct trapframe *tp, struct reg *regs)
2084 {
2085 
2086 	regs->r_r15 = tp->tf_r15;
2087 	regs->r_r14 = tp->tf_r14;
2088 	regs->r_r13 = tp->tf_r13;
2089 	regs->r_r12 = tp->tf_r12;
2090 	regs->r_r11 = tp->tf_r11;
2091 	regs->r_r10 = tp->tf_r10;
2092 	regs->r_r9  = tp->tf_r9;
2093 	regs->r_r8  = tp->tf_r8;
2094 	regs->r_rdi = tp->tf_rdi;
2095 	regs->r_rsi = tp->tf_rsi;
2096 	regs->r_rbp = tp->tf_rbp;
2097 	regs->r_rbx = tp->tf_rbx;
2098 	regs->r_rdx = tp->tf_rdx;
2099 	regs->r_rcx = tp->tf_rcx;
2100 	regs->r_rax = tp->tf_rax;
2101 	regs->r_rip = tp->tf_rip;
2102 	regs->r_cs = tp->tf_cs;
2103 	regs->r_rflags = tp->tf_rflags;
2104 	regs->r_rsp = tp->tf_rsp;
2105 	regs->r_ss = tp->tf_ss;
2106 	if (tp->tf_flags & TF_HASSEGS) {
2107 		regs->r_ds = tp->tf_ds;
2108 		regs->r_es = tp->tf_es;
2109 		regs->r_fs = tp->tf_fs;
2110 		regs->r_gs = tp->tf_gs;
2111 	} else {
2112 		regs->r_ds = 0;
2113 		regs->r_es = 0;
2114 		regs->r_fs = 0;
2115 		regs->r_gs = 0;
2116 	}
2117 	regs->r_err = 0;
2118 	regs->r_trapno = 0;
2119 	return (0);
2120 }
2121 
2122 int
2123 set_regs(struct thread *td, struct reg *regs)
2124 {
2125 	struct trapframe *tp;
2126 	register_t rflags;
2127 
2128 	tp = td->td_frame;
2129 	rflags = regs->r_rflags & 0xffffffff;
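	/*
	 * EFL_SECURE() refuses changes to privileged %rflags bits and
	 * CS_SECURE() rejects code selectors that do not request ring 3.
	 */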
2130 	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
2131 		return (EINVAL);
2132 	tp->tf_r15 = regs->r_r15;
2133 	tp->tf_r14 = regs->r_r14;
2134 	tp->tf_r13 = regs->r_r13;
2135 	tp->tf_r12 = regs->r_r12;
2136 	tp->tf_r11 = regs->r_r11;
2137 	tp->tf_r10 = regs->r_r10;
2138 	tp->tf_r9  = regs->r_r9;
2139 	tp->tf_r8  = regs->r_r8;
2140 	tp->tf_rdi = regs->r_rdi;
2141 	tp->tf_rsi = regs->r_rsi;
2142 	tp->tf_rbp = regs->r_rbp;
2143 	tp->tf_rbx = regs->r_rbx;
2144 	tp->tf_rdx = regs->r_rdx;
2145 	tp->tf_rcx = regs->r_rcx;
2146 	tp->tf_rax = regs->r_rax;
2147 	tp->tf_rip = regs->r_rip;
2148 	tp->tf_cs = regs->r_cs;
2149 	tp->tf_rflags = rflags;
2150 	tp->tf_rsp = regs->r_rsp;
2151 	tp->tf_ss = regs->r_ss;
2152 	if (0) {	/* XXXKIB */
2153 		tp->tf_ds = regs->r_ds;
2154 		tp->tf_es = regs->r_es;
2155 		tp->tf_fs = regs->r_fs;
2156 		tp->tf_gs = regs->r_gs;
2157 		tp->tf_flags = TF_HASSEGS;
2158 	}
2159 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
2160 	return (0);
2161 }
2162 
2163 /* XXX check all this stuff! */
2164 /* externalize from sv_xmm */
2165 static void
2166 fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
2167 {
2168 	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2169 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
2170 	int i;
2171 
2172 	/* pcb -> fpregs */
2173 	bzero(fpregs, sizeof(*fpregs));
2174 
2175 	/* FPU control/status */
2176 	penv_fpreg->en_cw = penv_xmm->en_cw;
2177 	penv_fpreg->en_sw = penv_xmm->en_sw;
2178 	penv_fpreg->en_tw = penv_xmm->en_tw;
2179 	penv_fpreg->en_opcode = penv_xmm->en_opcode;
2180 	penv_fpreg->en_rip = penv_xmm->en_rip;
2181 	penv_fpreg->en_rdp = penv_xmm->en_rdp;
2182 	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
2183 	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
2184 
2185 	/* FPU registers: 8 x87 registers of 10 bytes (80 bits) each */
2186 	for (i = 0; i < 8; ++i)
2187 		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
2188 
2189 	/* SSE registers: 16 XMM registers of 16 bytes each */
2190 	for (i = 0; i < 16; ++i)
2191 		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
2192 }
2193 
2194 /* internalize from fpregs into sv_xmm */
2195 static void
2196 set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
2197 {
2198 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
2199 	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2200 	int i;
2201 
2202 	/* fpregs -> pcb */
2203 	/* FPU control/status */
2204 	penv_xmm->en_cw = penv_fpreg->en_cw;
2205 	penv_xmm->en_sw = penv_fpreg->en_sw;
2206 	penv_xmm->en_tw = penv_fpreg->en_tw;
2207 	penv_xmm->en_opcode = penv_fpreg->en_opcode;
2208 	penv_xmm->en_rip = penv_fpreg->en_rip;
2209 	penv_xmm->en_rdp = penv_fpreg->en_rdp;
2210 	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
2211 	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
2212 
2213 	/* FPU registers */
2214 	for (i = 0; i < 8; ++i)
2215 		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
2216 
2217 	/* SSE registers */
2218 	for (i = 0; i < 16; ++i)
2219 		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
2220 }
2221 
2222 /* externalize from td->pcb */
2223 int
2224 fill_fpregs(struct thread *td, struct fpreg *fpregs)
2225 {
2226 
2227 	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
2228 	    P_SHOULDSTOP(td->td_proc),
2229 	    ("not suspended thread %p", td));
2230 	fpugetregs(td);
2231 	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
2232 	return (0);
2233 }
2234 
2235 /* internalize to td->pcb */
2236 int
2237 set_fpregs(struct thread *td, struct fpreg *fpregs)
2238 {
2239 
2240 	critical_enter();
2241 	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
2242 	fpuuserinited(td);
2243 	critical_exit();
2244 	return (0);
2245 }
2246 
2247 /*
2248  * Get machine context.
2249  */
2250 int
2251 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
2252 {
2253 	struct pcb *pcb;
2254 	struct trapframe *tp;
2255 
2256 	pcb = td->td_pcb;
2257 	tp = td->td_frame;
2258 	PROC_LOCK(curthread->td_proc);
2259 	mcp->mc_onstack = sigonstack(tp->tf_rsp);
2260 	PROC_UNLOCK(curthread->td_proc);
2261 	mcp->mc_r15 = tp->tf_r15;
2262 	mcp->mc_r14 = tp->tf_r14;
2263 	mcp->mc_r13 = tp->tf_r13;
2264 	mcp->mc_r12 = tp->tf_r12;
2265 	mcp->mc_r11 = tp->tf_r11;
2266 	mcp->mc_r10 = tp->tf_r10;
2267 	mcp->mc_r9  = tp->tf_r9;
2268 	mcp->mc_r8  = tp->tf_r8;
2269 	mcp->mc_rdi = tp->tf_rdi;
2270 	mcp->mc_rsi = tp->tf_rsi;
2271 	mcp->mc_rbp = tp->tf_rbp;
2272 	mcp->mc_rbx = tp->tf_rbx;
2273 	mcp->mc_rcx = tp->tf_rcx;
2274 	mcp->mc_rflags = tp->tf_rflags;
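	/*
	 * GET_MC_CLEAR_RET scrubs the syscall return values (%rax, %rdx
	 * and the carry bit) so a resumed context observes a clean return.
	 */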
2275 	if (flags & GET_MC_CLEAR_RET) {
2276 		mcp->mc_rax = 0;
2277 		mcp->mc_rdx = 0;
2278 		mcp->mc_rflags &= ~PSL_C;
2279 	} else {
2280 		mcp->mc_rax = tp->tf_rax;
2281 		mcp->mc_rdx = tp->tf_rdx;
2282 	}
2283 	mcp->mc_rip = tp->tf_rip;
2284 	mcp->mc_cs = tp->tf_cs;
2285 	mcp->mc_rsp = tp->tf_rsp;
2286 	mcp->mc_ss = tp->tf_ss;
2287 	mcp->mc_ds = tp->tf_ds;
2288 	mcp->mc_es = tp->tf_es;
2289 	mcp->mc_fs = tp->tf_fs;
2290 	mcp->mc_gs = tp->tf_gs;
2291 	mcp->mc_flags = tp->tf_flags;
2292 	mcp->mc_len = sizeof(*mcp);
2293 	get_fpcontext(td, mcp, NULL, 0);
2294 	update_pcb_bases(pcb);
2295 	mcp->mc_fsbase = pcb->pcb_fsbase;
2296 	mcp->mc_gsbase = pcb->pcb_gsbase;
2297 	mcp->mc_xfpustate = 0;
2298 	mcp->mc_xfpustate_len = 0;
2299 	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
2300 	return (0);
2301 }
2302 
2303 /*
2304  * Set machine context.
2305  *
2306  * Note that we only set the user-modifiable %rflags bits, and we never
2307  * touch the %cs selector.
2308  */
2309 int
2310 set_mcontext(struct thread *td, mcontext_t *mcp)
2311 {
2312 	struct pcb *pcb;
2313 	struct trapframe *tp;
2314 	char *xfpustate;
2315 	long rflags;
2316 	int ret;
2317 
2318 	pcb = td->td_pcb;
2319 	tp = td->td_frame;
2320 	if (mcp->mc_len != sizeof(*mcp) ||
2321 	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
2322 		return (EINVAL);
2323 	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
2324 	    (tp->tf_rflags & ~PSL_USERCHANGE);
2325 	if (mcp->mc_flags & _MC_HASFPXSTATE) {
2326 		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
2327 		    sizeof(struct savefpu))
2328 			return (EINVAL);
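		/*
		 * The length was bounded above, so this stack allocation
		 * cannot exceed the maximum extended-state size.
		 */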
2329 		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
2330 		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
2331 		    mcp->mc_xfpustate_len);
2332 		if (ret != 0)
2333 			return (ret);
2334 	} else
2335 		xfpustate = NULL;
2336 	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
2337 	if (ret != 0)
2338 		return (ret);
2339 	tp->tf_r15 = mcp->mc_r15;
2340 	tp->tf_r14 = mcp->mc_r14;
2341 	tp->tf_r13 = mcp->mc_r13;
2342 	tp->tf_r12 = mcp->mc_r12;
2343 	tp->tf_r11 = mcp->mc_r11;
2344 	tp->tf_r10 = mcp->mc_r10;
2345 	tp->tf_r9  = mcp->mc_r9;
2346 	tp->tf_r8  = mcp->mc_r8;
2347 	tp->tf_rdi = mcp->mc_rdi;
2348 	tp->tf_rsi = mcp->mc_rsi;
2349 	tp->tf_rbp = mcp->mc_rbp;
2350 	tp->tf_rbx = mcp->mc_rbx;
2351 	tp->tf_rdx = mcp->mc_rdx;
2352 	tp->tf_rcx = mcp->mc_rcx;
2353 	tp->tf_rax = mcp->mc_rax;
2354 	tp->tf_rip = mcp->mc_rip;
2355 	tp->tf_rflags = rflags;
2356 	tp->tf_rsp = mcp->mc_rsp;
2357 	tp->tf_ss = mcp->mc_ss;
2358 	tp->tf_flags = mcp->mc_flags;
2359 	if (tp->tf_flags & TF_HASSEGS) {
2360 		tp->tf_ds = mcp->mc_ds;
2361 		tp->tf_es = mcp->mc_es;
2362 		tp->tf_fs = mcp->mc_fs;
2363 		tp->tf_gs = mcp->mc_gs;
2364 	}
2365 	set_pcb_flags(pcb, PCB_FULL_IRET);
2366 	if (mcp->mc_flags & _MC_HASBASES) {
2367 		pcb->pcb_fsbase = mcp->mc_fsbase;
2368 		pcb->pcb_gsbase = mcp->mc_gsbase;
2369 	}
2370 	return (0);
2371 }
2372 
2373 static void
2374 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
2375     size_t xfpusave_len)
2376 {
2377 	size_t max_len, len;
2378 
2379 	mcp->mc_ownedfp = fpugetregs(td);
2380 	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
2381 	    sizeof(mcp->mc_fpstate));
2382 	mcp->mc_fpformat = fpuformat();
2383 	if (!use_xsave || xfpusave_len == 0)
2384 		return;
2385 	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
2386 	len = xfpusave_len;
2387 	if (len > max_len) {
2388 		bzero(xfpusave + max_len, len - max_len);
2389 		len = max_len;
2390 	}
2391 	mcp->mc_flags |= _MC_HASFPXSTATE;
2392 	mcp->mc_xfpustate_len = len;
2393 	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
2394 }
2395 
2396 static int
2397 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
2398     size_t xfpustate_len)
2399 {
2400 	int error;
2401 
2402 	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
2403 		return (0);
2404 	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
2405 		return (EINVAL);
2406 	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
2407 		/* We don't care what state is left in the FPU or PCB. */
2408 		fpstate_drop(td);
2409 		error = 0;
2410 	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
2411 	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
2412 		error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
2413 		    xfpustate, xfpustate_len);
2414 	} else
2415 		return (EINVAL);
2416 	return (error);
2417 }
2418 
2419 void
2420 fpstate_drop(struct thread *td)
2421 {
2422 
2423 	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
2424 	critical_enter();
2425 	if (PCPU_GET(fpcurthread) == td)
2426 		fpudrop();
2427 	/*
2428 	 * XXX force a full drop of the fpu.  The above only drops it if we
2429 	 * owned it.
2430 	 *
2431 	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
2432 	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
2433 	 * We only need to drop to !PCB_INITDONE in sendsig().  But
2434 	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
2435 	 * have too many layers.
2436 	 */
2437 	clear_pcb_flags(curthread->td_pcb,
2438 	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
2439 	critical_exit();
2440 }
2441 
2442 int
2443 fill_dbregs(struct thread *td, struct dbreg *dbregs)
2444 {
2445 	struct pcb *pcb;
2446 
2447 	if (td == NULL) {
2448 		dbregs->dr[0] = rdr0();
2449 		dbregs->dr[1] = rdr1();
2450 		dbregs->dr[2] = rdr2();
2451 		dbregs->dr[3] = rdr3();
2452 		dbregs->dr[6] = rdr6();
2453 		dbregs->dr[7] = rdr7();
2454 	} else {
2455 		pcb = td->td_pcb;
2456 		dbregs->dr[0] = pcb->pcb_dr0;
2457 		dbregs->dr[1] = pcb->pcb_dr1;
2458 		dbregs->dr[2] = pcb->pcb_dr2;
2459 		dbregs->dr[3] = pcb->pcb_dr3;
2460 		dbregs->dr[6] = pcb->pcb_dr6;
2461 		dbregs->dr[7] = pcb->pcb_dr7;
2462 	}
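	/* %dr4/%dr5 are reserved and %dr8-%dr15 do not exist; report zeroes. */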
2463 	dbregs->dr[4] = 0;
2464 	dbregs->dr[5] = 0;
2465 	dbregs->dr[8] = 0;
2466 	dbregs->dr[9] = 0;
2467 	dbregs->dr[10] = 0;
2468 	dbregs->dr[11] = 0;
2469 	dbregs->dr[12] = 0;
2470 	dbregs->dr[13] = 0;
2471 	dbregs->dr[14] = 0;
2472 	dbregs->dr[15] = 0;
2473 	return (0);
2474 }
2475 
2476 int
2477 set_dbregs(struct thread *td, struct dbreg *dbregs)
2478 {
2479 	struct pcb *pcb;
2480 	int i;
2481 
2482 	if (td == NULL) {
2483 		load_dr0(dbregs->dr[0]);
2484 		load_dr1(dbregs->dr[1]);
2485 		load_dr2(dbregs->dr[2]);
2486 		load_dr3(dbregs->dr[3]);
2487 		load_dr6(dbregs->dr[6]);
2488 		load_dr7(dbregs->dr[7]);
2489 	} else {
2490 		/*
2491 		 * Don't let an illegal value for dr7 get set.  Specifically,
2492 		 * check for undefined settings.  Setting these bit patterns
2493 		 * results in undefined behaviour and can lead to an unexpected
2494 		 * TRCTRAP or a general protection fault right here.
2495 		 * The upper 32 bits of dr6 and dr7 must not be set.
2496 		 */
2497 		for (i = 0; i < 4; i++) {
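			/*
			 * R/W pattern 0x02 selects I/O breakpoints, which
			 * are only defined when CR4.DE is set.
			 */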
2498 			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
2499 				return (EINVAL);
2500 			if (td->td_frame->tf_cs == _ucode32sel &&
2501 			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
2502 				return (EINVAL);
2503 		}
2504 		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
2505 		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
2506 			return (EINVAL);
2507 
2508 		pcb = td->td_pcb;
2509 
2510 		/*
2511 		 * Don't let a process set a breakpoint that is not within the
2512 		 * process's address space.  If a process could do this, it
2513 		 * could halt the system by setting a breakpoint in the kernel
2514 		 * (if ddb was enabled).  Thus, we need to check to make sure
2515 		 * that no breakpoints are being enabled for addresses outside
2516 		 * process's address space.
2517 		 *
2518 		 * XXX - what about when the watched area of the user's
2519 		 * address space is written into from within the kernel
2520 		 * ... wouldn't that still cause a breakpoint to be generated
2521 		 * from within kernel mode?
2522 		 */
2523 
2524 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
2525 			/* dr0 is enabled */
2526 			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
2527 				return (EINVAL);
2528 		}
2529 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
2530 			/* dr1 is enabled */
2531 			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
2532 				return (EINVAL);
2533 		}
2534 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
2535 			/* dr2 is enabled */
2536 			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
2537 				return (EINVAL);
2538 		}
2539 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
2540 			/* dr3 is enabled */
2541 			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
2542 				return (EINVAL);
2543 		}
2544 
2545 		pcb->pcb_dr0 = dbregs->dr[0];
2546 		pcb->pcb_dr1 = dbregs->dr[1];
2547 		pcb->pcb_dr2 = dbregs->dr[2];
2548 		pcb->pcb_dr3 = dbregs->dr[3];
2549 		pcb->pcb_dr6 = dbregs->dr[6];
2550 		pcb->pcb_dr7 = dbregs->dr[7];
2551 
2552 		set_pcb_flags(pcb, PCB_DBREGS);
2553 	}
2554 
2555 	return (0);
2556 }
2557 
2558 void
2559 reset_dbregs(void)
2560 {
2561 
2562 	load_dr7(0);	/* Turn off the control bits first */
2563 	load_dr0(0);
2564 	load_dr1(0);
2565 	load_dr2(0);
2566 	load_dr3(0);
2567 	load_dr6(0);
2568 }
2569 
2570 /*
2571  * Return > 0 if a hardware breakpoint has been hit and the
2572  * breakpoint was in user space.  Return 0 otherwise.
2573  */
2574 int
2575 user_dbreg_trap(register_t dr6)
2576 {
2577 	uint64_t dr7;
2578 	uint64_t bp;		/* breakpoint bits extracted from dr6 */
2579 	int nbp;		/* number of breakpoints that triggered */
2580 	caddr_t addr[4];	/* breakpoint addresses */
2581 	int i;
2582 
2583 	bp = dr6 & DBREG_DR6_BMASK;
2584 	if (bp == 0) {
2585 		/*
2586 		 * None of the breakpoint bits are set, meaning this
2587 		 * trap was not caused by any of the debug registers.
2588 		 */
2589 		return (0);
2590 	}
2591 
2592 	dr7 = rdr7();
2593 	if ((dr7 & 0x000000ff) == 0) {
2594 		/*
2595 		 * None of the local or global enable bits in %dr7 are
2596 		 * set, so the trap cannot have been caused by the
2597 		 * hardware debug registers.
2598 		 */
2599 		return (0);
2600 	}
2601 
2602 	nbp = 0;
2603 
2604 	/*
2605 	 * At least one of the breakpoints was hit; check which ones
2606 	 * triggered and whether any of them are user space addresses.
2607 	 */
2608 
2609 	if (bp & 0x01) {
2610 		addr[nbp++] = (caddr_t)rdr0();
2611 	}
2612 	if (bp & 0x02) {
2613 		addr[nbp++] = (caddr_t)rdr1();
2614 	}
2615 	if (bp & 0x04) {
2616 		addr[nbp++] = (caddr_t)rdr2();
2617 	}
2618 	if (bp & 0x08) {
2619 		addr[nbp++] = (caddr_t)rdr3();
2620 	}
2621 
2622 	for (i = 0; i < nbp; i++) {
2623 		if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
2624 			/*
2625 			 * addr[i] is in user space.
2626 			 */
2627 			return (nbp);
2628 		}
2629 	}
2630 
2631 	/*
2632 	 * None of the breakpoints are in user space.
2633 	 */
2634 	return (0);
2635 }
2636 
2637 /*
2638  * The pcb_flags field is only modified by the current thread, or by other
2639  * threads when the current thread is stopped.  However, the current thread
2640  * may change it from the interrupt context in cpu_switch(), or in the trap
2641  * handler.  When we read-modify-write pcb_flags from C sources, the compiler
2642  * may generate code that is not atomic with respect to the interrupt
2643  * handler.  If a trap or interrupt happens and a flag is modified from the
2644  * handler, it can be clobbered by the cached value later.  Therefore, we
2645  * implement setting and clearing flags with single-instruction functions,
2646  * which do not race with such modification because traps and interrupts
2647  * are delivered only on instruction boundaries.
2648  */
2649 void
2650 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
2651 {
2652 
2653 	__asm __volatile("orl %1,%0"
2654 	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
2655 	    : "cc", "memory");
2657 }
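
/*
 * An illustrative sketch (not actual emitted code) of the race the
 * single-instruction update avoids: a plain C "pcb->pcb_flags |= flags"
 * may compile to a load/modify/store sequence such as
 *
 *	movl	pcb_flags(%rdi), %eax
 *	orl	%esi, %eax		<- an interrupt here loses handler updates
 *	movl	%eax, pcb_flags(%rdi)
 *
 * whereas the single "orl" above cannot be split by an interrupt.
 */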
2658 
2659 /*
2660  * Support for the RDFSBASE, WRFSBASE and similar instructions for the %fs
2661  * and %gs bases requires that the kernel save MSR_FSBASE and MSR_{K,}GSBASE
2662  * into the pcb if user space modified the bases.  We must save them on a
2663  * context switch or if the return to usermode happens through doreti.
2664  *
2665  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
2666  * which has the consequence that the base MSRs must be saved each time
2667  * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
2668  * context switches.
2669  */
2670 static void
2671 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
2672 {
2673 	register_t r;
2674 
2675 	if (curpcb == pcb &&
2676 	    (flags & PCB_FULL_IRET) != 0 &&
2677 	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
2678 		r = intr_disable();
2679 		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
2680 			if (rfs() == _ufssel)
2681 				pcb->pcb_fsbase = rdfsbase();
2682 			if (rgs() == _ugssel)
2683 				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
2684 		}
2685 		set_pcb_flags_raw(pcb, flags);
2686 		intr_restore(r);
2687 	} else {
2688 		set_pcb_flags_raw(pcb, flags);
2689 	}
2690 }
2691 
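/*
 * set_pcb_flags is an ifunc; the resolver below runs once at boot, so
 * the FSGSBASE capability check adds no per-call overhead.
 */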
2692 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
2693 {
2694 
2695 	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
2696 	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
2697 }
2698 
2699 void
2700 clear_pcb_flags(struct pcb *pcb, const u_int flags)
2701 {
2702 
2703 	__asm __volatile("andl %1,%0"
2704 	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
2705 	    : "cc", "memory");
2706 }
2707 
2708 #ifdef KDB
2709 
2710 /*
2711  * Provide inb() and outb() as functions.  They are normally only available as
2712  * inline functions and thus cannot be called from the debugger.
2713  */
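
/* For example, from ddb: "call inb_(0x64)" or "call outb_(0x80, 0)". */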
2714 
2715 /* silence compiler warnings */
2716 u_char inb_(u_short);
2717 void outb_(u_short, u_char);
2718 
2719 u_char
2720 inb_(u_short port)
2721 {
2722 	return inb(port);
2723 }
2724 
2725 void
2726 outb_(u_short port, u_char data)
2727 {
2728 	outb(port, data);
2729 }
2730 
2731 #endif /* KDB */
2732 
2733 #undef memset
2734 #undef memmove
2735 #undef memcpy
2736 
2737 void	*memset_std(void *buf, int c, size_t len);
2738 void	*memset_erms(void *buf, int c, size_t len);
2739 void    *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
2740 	    size_t len);
2741 void    *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
2742 	    size_t len);
2743 void    *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
2744 	    size_t len);
2745 void    *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
2746 	    size_t len);
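
/*
 * The _erms variants presumably exploit the "Enhanced REP MOVSB/STOSB"
 * feature advertised by CPUID_STDEXT_ERMS; the ifuncs below select an
 * implementation once at boot based on that CPUID bit.
 */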
2747 
2748 #ifdef KCSAN
2749 /*
2750  * These fail to build as ifuncs when used with KCSAN.
2751  */
2752 void *
2753 memset(void *buf, int c, size_t len)
2754 {
2755 
2756 	return (memset_std(buf, c, len));
2757 }
2758 
2759 void *
2760 memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
2761 {
2762 
2763 	return (memmove_std(dst, src, len));
2764 }
2765 
2766 void *
2767 memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
2768 {
2769 
2770 	return (memcpy_std(dst, src, len));
2771 }
2772 #else
2773 DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
2774 {
2775 
2776 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2777 	    memset_erms : memset_std);
2778 }
2779 
2780 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
2781     size_t))
2782 {
2783 
2784 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2785 	    memmove_erms : memmove_std);
2786 }
2787 
2788 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
2789 {
2790 
2791 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2792 	    memcpy_erms : memcpy_std);
2793 }
2794 #endif
2795 
2796 void	pagezero_std(void *addr);
2797 void	pagezero_erms(void *addr);
2798 DEFINE_IFUNC(, void , pagezero, (void *))
2799 {
2800 
2801 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2802 	    pagezero_erms : pagezero_std);
2803 }
2804