xref: /freebsd/sys/amd64/amd64/machdep.c (revision c1d255d3)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 2003 Peter Wemm.
5  * Copyright (c) 1992 Terrence R. Lambert.
6  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
7  * All rights reserved.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * William Jolitz.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. All advertising materials mentioning features or use of this software
21  *    must display the following acknowledgement:
22  *	This product includes software developed by the University of
23  *	California, Berkeley and its contributors.
24  * 4. Neither the name of the University nor the names of its contributors
25  *    may be used to endorse or promote products derived from this software
26  *    without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  *
40  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
41  */
42 
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD$");
45 
46 #include "opt_atpic.h"
47 #include "opt_cpu.h"
48 #include "opt_ddb.h"
49 #include "opt_inet.h"
50 #include "opt_isa.h"
51 #include "opt_kstack_pages.h"
52 #include "opt_maxmem.h"
53 #include "opt_mp_watchdog.h"
54 #include "opt_pci.h"
55 #include "opt_platform.h"
56 #include "opt_sched.h"
57 
58 #include <sys/param.h>
59 #include <sys/proc.h>
60 #include <sys/systm.h>
61 #include <sys/asan.h>
62 #include <sys/bio.h>
63 #include <sys/buf.h>
64 #include <sys/bus.h>
65 #include <sys/callout.h>
66 #include <sys/cons.h>
67 #include <sys/cpu.h>
68 #include <sys/csan.h>
69 #include <sys/efi.h>
70 #include <sys/eventhandler.h>
71 #include <sys/exec.h>
72 #include <sys/imgact.h>
73 #include <sys/kdb.h>
74 #include <sys/kernel.h>
75 #include <sys/ktr.h>
76 #include <sys/linker.h>
77 #include <sys/lock.h>
78 #include <sys/malloc.h>
79 #include <sys/memrange.h>
80 #include <sys/msan.h>
81 #include <sys/msgbuf.h>
82 #include <sys/mutex.h>
83 #include <sys/pcpu.h>
84 #include <sys/ptrace.h>
85 #include <sys/reboot.h>
86 #include <sys/reg.h>
87 #include <sys/rwlock.h>
88 #include <sys/sched.h>
89 #include <sys/signalvar.h>
90 #ifdef SMP
91 #include <sys/smp.h>
92 #endif
93 #include <sys/syscallsubr.h>
94 #include <sys/sysctl.h>
95 #include <sys/sysent.h>
96 #include <sys/sysproto.h>
97 #include <sys/ucontext.h>
98 #include <sys/vmmeter.h>
99 
100 #include <vm/vm.h>
101 #include <vm/vm_param.h>
102 #include <vm/vm_extern.h>
103 #include <vm/vm_kern.h>
104 #include <vm/vm_page.h>
105 #include <vm/vm_map.h>
106 #include <vm/vm_object.h>
107 #include <vm/vm_pager.h>
108 #include <vm/vm_phys.h>
109 #include <vm/vm_dumpset.h>
110 
111 #ifdef DDB
112 #ifndef KDB
113 #error KDB must be enabled in order for DDB to work!
114 #endif
115 #include <ddb/ddb.h>
116 #include <ddb/db_sym.h>
117 #endif
118 
119 #include <net/netisr.h>
120 
121 #include <machine/clock.h>
122 #include <machine/cpu.h>
123 #include <machine/cputypes.h>
124 #include <machine/frame.h>
125 #include <machine/intr_machdep.h>
126 #include <x86/mca.h>
127 #include <machine/md_var.h>
128 #include <machine/metadata.h>
129 #include <machine/mp_watchdog.h>
130 #include <machine/pc/bios.h>
131 #include <machine/pcb.h>
132 #include <machine/proc.h>
133 #include <machine/sigframe.h>
134 #include <machine/specialreg.h>
135 #include <machine/trap.h>
136 #include <machine/tss.h>
137 #include <x86/ucode.h>
138 #include <x86/ifunc.h>
139 #ifdef SMP
140 #include <machine/smp.h>
141 #endif
142 #ifdef FDT
143 #include <x86/fdt.h>
144 #endif
145 
146 #ifdef DEV_ATPIC
147 #include <x86/isa/icu.h>
148 #else
149 #include <x86/apicvar.h>
150 #endif
151 
152 #include <isa/isareg.h>
153 #include <isa/rtc.h>
154 #include <x86/init.h>
155 
156 /* Sanity check for __curthread() */
157 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
158 
159 /*
160  * The PTI trampoline stack needs enough space for a hardware trapframe and a
161  * couple of scratch registers, as well as the trapframe left behind after an
162  * iret fault.
163  */
164 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
165     offsetof(struct pti_frame, pti_rip));
166 
167 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
168 
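/*
 * Checks applied to a user-supplied signal context: the new %cs must be a
 * user (ring 3) selector, and only the PSL_USERCHANGE bits of rflags may
 * differ from the current trapframe values.
 */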
169 #define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
170 #define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
171 
172 static void cpu_startup(void *);
173 static void get_fpcontext(struct thread *td, mcontext_t *mcp,
174     char *xfpusave, size_t xfpusave_len);
175 static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
176     char *xfpustate, size_t xfpustate_len);
177 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
178 
179 /* Preload data parse function */
180 static caddr_t native_parse_preload_data(u_int64_t);
181 
182 /* Native function to fetch and parse the e820 map */
183 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
184 
185 /* Default init_ops implementation. */
186 struct init_ops init_ops = {
187 	.parse_preload_data =	native_parse_preload_data,
188 	.early_clock_source_init =	i8254_init,
189 	.early_delay =			i8254_delay,
190 	.parse_memmap =			native_parse_memmap,
191 };
192 
193 /*
194  * Physical address of the EFI System Table. Stashed from the metadata hints
195  * passed into the kernel and used by the EFI code to call runtime services.
196  */
197 vm_paddr_t efi_systbl_phys;
198 
199 /* Intel ICH registers */
200 #define ICH_PMBASE	0x400
201 #define ICH_SMI_EN	(ICH_PMBASE + 0x30)
202 
203 int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
204 
205 int cold = 1;
206 
207 long Maxmem = 0;
208 long realmem = 0;
209 
210 struct kva_md_info kmi;
211 
212 static struct trapframe proc0_tf;
213 struct region_descriptor r_idt;
214 
215 struct pcpu *__pcpu;
216 struct pcpu temp_bsp_pcpu;
217 
218 struct mtx icu_lock;
219 
220 struct mem_range_softc mem_range_softc;
221 
222 struct mtx dt_lock;	/* lock for GDT and LDT */
223 
224 void (*vmm_resume_p)(void);
225 
226 bool efi_boot;
227 
228 static void
229 cpu_startup(dummy)
230 	void *dummy;
231 {
232 	uintmax_t memsize;
233 	char *sysenv;
234 
235 	/*
236 	 * On MacBooks, we need to prevent the legacy USB circuit from
237 	 * generating an SMI# because this can cause several problems,
238 	 * namely: incorrect CPU frequency detection and failure to
239 	 * start the APs.
240 	 * We do this by disabling a bit in the SMI_EN (SMI Control and
241 	 * Enable register) of the Intel ICH LPC Interface Bridge.
242 	 */
243 	sysenv = kern_getenv("smbios.system.product");
244 	if (sysenv != NULL) {
245 		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
246 		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
247 		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
248 		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
249 		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
250 		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
251 		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
252 		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
253 			if (bootverbose)
254 				printf("Disabling LEGACY_USB_EN bit on "
255 				    "Intel ICH.\n");
256 			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
257 		}
258 		freeenv(sysenv);
259 	}
260 
261 	/*
262 	 * Good {morning,afternoon,evening,night}.
263 	 */
264 	startrtclock();
265 	printcpuinfo();
266 
267 	/*
268 	 * Display physical memory if SMBIOS reports reasonable amount.
269 	 */
270 	memsize = 0;
271 	sysenv = kern_getenv("smbios.memory.enabled");
272 	if (sysenv != NULL) {
273 		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
274 		freeenv(sysenv);
275 	}
276 	if (memsize < ptoa((uintmax_t)vm_free_count()))
277 		memsize = ptoa((uintmax_t)Maxmem);
278 	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
279 	realmem = atop(memsize);
280 
281 	/*
282 	 * Display any holes after the first chunk of extended memory.
283 	 */
284 	if (bootverbose) {
285 		int indx;
286 
287 		printf("Physical memory chunk(s):\n");
288 		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
289 			vm_paddr_t size;
290 
291 			size = phys_avail[indx + 1] - phys_avail[indx];
292 			printf(
293 			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
294 			    (uintmax_t)phys_avail[indx],
295 			    (uintmax_t)phys_avail[indx + 1] - 1,
296 			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
297 		}
298 	}
299 
300 	vm_ksubmap_init(&kmi);
301 
302 	printf("avail memory = %ju (%ju MB)\n",
303 	    ptoa((uintmax_t)vm_free_count()),
304 	    ptoa((uintmax_t)vm_free_count()) / 1048576);
305 #ifdef DEV_PCI
306 	if (bootverbose && intel_graphics_stolen_base != 0)
307 		printf("intel stolen mem: base %#jx size %ju MB\n",
308 		    (uintmax_t)intel_graphics_stolen_base,
309 		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
310 #endif
311 
312 	/*
313 	 * Set up buffers, so they can be used to read disk labels.
314 	 */
315 	bufinit();
316 	vm_pager_bufferinit();
317 
318 	cpu_setregs();
319 }
320 
321 static void
322 late_ifunc_resolve(void *dummy __unused)
323 {
324 	link_elf_late_ireloc();
325 }
326 SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);
327 
328 /*
329  * Send an interrupt to process.
330  *
331  * Stack is set up to allow sigcode stored
332  * at top to call routine, followed by call
333  * to sigreturn routine below.  After sigreturn
334  * resets the signal mask, the stack, and the
335  * frame pointer, it returns to the user
336  * specified pc, psl.
337  */
338 void
339 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
340 {
341 	struct sigframe sf, *sfp;
342 	struct pcb *pcb;
343 	struct proc *p;
344 	struct thread *td;
345 	struct sigacts *psp;
346 	char *sp;
347 	struct trapframe *regs;
348 	char *xfpusave;
349 	size_t xfpusave_len;
350 	int sig;
351 	int oonstack;
352 
353 	td = curthread;
354 	pcb = td->td_pcb;
355 	p = td->td_proc;
356 	PROC_LOCK_ASSERT(p, MA_OWNED);
357 	sig = ksi->ksi_signo;
358 	psp = p->p_sigacts;
359 	mtx_assert(&psp->ps_mtx, MA_OWNED);
360 	regs = td->td_frame;
361 	oonstack = sigonstack(regs->tf_rsp);
362 
363 	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
364 		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
365 		xfpusave = __builtin_alloca(xfpusave_len);
366 	} else {
367 		xfpusave_len = 0;
368 		xfpusave = NULL;
369 	}
370 
371 	/* Save user context. */
372 	bzero(&sf, sizeof(sf));
373 	sf.sf_uc.uc_sigmask = *mask;
374 	sf.sf_uc.uc_stack = td->td_sigstk;
375 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
376 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
377 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
378 	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
379 	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
380 	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
381 	fpstate_drop(td);
382 	update_pcb_bases(pcb);
383 	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
384 	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
385 	bzero(sf.sf_uc.uc_mcontext.mc_spare,
386 	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
387 
388 	/* Allocate space for the signal handler context. */
389 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
390 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
391 		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
392 #if defined(COMPAT_43)
393 		td->td_sigstk.ss_flags |= SS_ONSTACK;
394 #endif
395 	} else
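		/* Leave the 128-byte ABI red zone below the user %rsp untouched. */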
396 		sp = (char *)regs->tf_rsp - 128;
397 	if (xfpusave != NULL) {
398 		sp -= xfpusave_len;
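		/* The extended FPU (XSAVE) state must be stored 64-byte aligned. */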
399 		sp = (char *)((unsigned long)sp & ~0x3Ful);
400 		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
401 	}
402 	sp -= sizeof(struct sigframe);
403 	/* Align to 16 bytes. */
404 	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
405 
406 	/* Build the argument list for the signal handler. */
407 	regs->tf_rdi = sig;			/* arg 1 in %rdi */
408 	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
409 	bzero(&sf.sf_si, sizeof(sf.sf_si));
410 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
411 		/* Signal handler installed with SA_SIGINFO. */
412 		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
413 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
414 
415 		/* Fill in POSIX parts */
416 		sf.sf_si = ksi->ksi_info;
417 		sf.sf_si.si_signo = sig; /* maybe a translated signal */
418 		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
419 	} else {
420 		/* Old FreeBSD-style arguments. */
421 		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
422 		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
423 		sf.sf_ahu.sf_handler = catcher;
424 	}
425 	mtx_unlock(&psp->ps_mtx);
426 	PROC_UNLOCK(p);
427 
428 	/*
429 	 * Copy the sigframe out to the user's stack.
430 	 */
431 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
432 	    (xfpusave != NULL && copyout(xfpusave,
433 	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
434 	    != 0)) {
435 #ifdef DEBUG
436 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
437 #endif
438 		PROC_LOCK(p);
439 		sigexit(td, SIGILL);
440 	}
441 
442 	regs->tf_rsp = (long)sfp;
443 	regs->tf_rip = p->p_sysent->sv_sigcode_base;
444 	regs->tf_rflags &= ~(PSL_T | PSL_D);
445 	regs->tf_cs = _ucodesel;
446 	regs->tf_ds = _udatasel;
447 	regs->tf_ss = _udatasel;
448 	regs->tf_es = _udatasel;
449 	regs->tf_fs = _ufssel;
450 	regs->tf_gs = _ugssel;
451 	regs->tf_flags = TF_HASSEGS;
452 	PROC_LOCK(p);
453 	mtx_lock(&psp->ps_mtx);
454 }
455 
456 /*
457  * System call to cleanup state after a signal
458  * has been taken.  Reset signal mask and
459  * stack state from context left by sendsig (above).
460  * Return to previous pc and psl as specified by
461  * context left by sendsig. Check carefully to
462  * make sure that the user has not modified the
463  * state to gain improper privileges.
464  *
465  * MPSAFE
466  */
467 int
468 sys_sigreturn(td, uap)
469 	struct thread *td;
470 	struct sigreturn_args /* {
471 		const struct __ucontext *sigcntxp;
472 	} */ *uap;
473 {
474 	ucontext_t uc;
475 	struct pcb *pcb;
476 	struct proc *p;
477 	struct trapframe *regs;
478 	ucontext_t *ucp;
479 	char *xfpustate;
480 	size_t xfpustate_len;
481 	long rflags;
482 	int cs, error, ret;
483 	ksiginfo_t ksi;
484 
485 	pcb = td->td_pcb;
486 	p = td->td_proc;
487 
488 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
489 	if (error != 0) {
490 		uprintf("pid %d (%s): sigreturn copyin failed\n",
491 		    p->p_pid, td->td_name);
492 		return (error);
493 	}
494 	ucp = &uc;
495 	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
496 		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
497 		    td->td_name, ucp->uc_mcontext.mc_flags);
498 		return (EINVAL);
499 	}
500 	regs = td->td_frame;
501 	rflags = ucp->uc_mcontext.mc_rflags;
502 	/*
503 	 * Don't allow users to change privileged or reserved flags.
504 	 */
505 	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
506 		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
507 		    td->td_name, rflags);
508 		return (EINVAL);
509 	}
510 
511 	/*
512 	 * Don't allow users to load a valid privileged %cs.  Let the
513 	 * hardware check for invalid selectors, excess privilege in
514 	 * other selectors, invalid %eip's and invalid %esp's.
515 	 */
516 	cs = ucp->uc_mcontext.mc_cs;
517 	if (!CS_SECURE(cs)) {
518 		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
519 		    td->td_name, cs);
520 		ksiginfo_init_trap(&ksi);
521 		ksi.ksi_signo = SIGBUS;
522 		ksi.ksi_code = BUS_OBJERR;
523 		ksi.ksi_trapno = T_PROTFLT;
524 		ksi.ksi_addr = (void *)regs->tf_rip;
525 		trapsignal(td, &ksi);
526 		return (EINVAL);
527 	}
528 
529 	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
530 		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
531 		if (xfpustate_len > cpu_max_ext_state_size -
532 		    sizeof(struct savefpu)) {
533 			uprintf("pid %d (%s): sigreturn xfpustate_len = 0x%zx\n",
534 			    p->p_pid, td->td_name, xfpustate_len);
535 			return (EINVAL);
536 		}
537 		xfpustate = __builtin_alloca(xfpustate_len);
538 		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
539 		    xfpustate, xfpustate_len);
540 		if (error != 0) {
541 			uprintf(
542 	"pid %d (%s): sigreturn copying xfpustate failed\n",
543 			    p->p_pid, td->td_name);
544 			return (error);
545 		}
546 	} else {
547 		xfpustate = NULL;
548 		xfpustate_len = 0;
549 	}
550 	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
551 	if (ret != 0) {
552 		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
553 		    p->p_pid, td->td_name, ret);
554 		return (ret);
555 	}
556 	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
557 	update_pcb_bases(pcb);
558 	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
559 	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
560 
561 #if defined(COMPAT_43)
562 	if (ucp->uc_mcontext.mc_onstack & 1)
563 		td->td_sigstk.ss_flags |= SS_ONSTACK;
564 	else
565 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
566 #endif
567 
568 	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
569 	return (EJUSTRETURN);
570 }
571 
572 #ifdef COMPAT_FREEBSD4
573 int
574 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
575 {
576 
577 	return sys_sigreturn(td, (struct sigreturn_args *)uap);
578 }
579 #endif
580 
581 /*
582  * Reset the hardware debug registers if they were in use.
583  * They won't have any meaning for the newly exec'd process.
584  */
585 void
586 x86_clear_dbregs(struct pcb *pcb)
587 {
588 	if ((pcb->pcb_flags & PCB_DBREGS) == 0)
589 		return;
590 
591 	pcb->pcb_dr0 = 0;
592 	pcb->pcb_dr1 = 0;
593 	pcb->pcb_dr2 = 0;
594 	pcb->pcb_dr3 = 0;
595 	pcb->pcb_dr6 = 0;
596 	pcb->pcb_dr7 = 0;
597 
598 	if (pcb == curpcb) {
599 		/*
600 		 * Clear the debug registers on the running CPU,
601 		 * otherwise they will end up affecting the next
602 		 * process we switch to.
603 		 */
604 		reset_dbregs();
605 	}
606 	clear_pcb_flags(pcb, PCB_DBREGS);
607 }
608 
609 /*
610  * Reset registers to default values on exec.
611  */
612 void
613 exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack)
614 {
615 	struct trapframe *regs;
616 	struct pcb *pcb;
617 	register_t saved_rflags;
618 
619 	regs = td->td_frame;
620 	pcb = td->td_pcb;
621 
622 	if (td->td_proc->p_md.md_ldt != NULL)
623 		user_ldt_free(td);
624 
625 	update_pcb_bases(pcb);
626 	pcb->pcb_fsbase = 0;
627 	pcb->pcb_gsbase = 0;
628 	clear_pcb_flags(pcb, PCB_32BIT);
629 	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
630 
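	/* Preserve the trace flag (PSL_T); everything else is reset to PSL_USER below. */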
631 	saved_rflags = regs->tf_rflags & PSL_T;
632 	bzero((char *)regs, sizeof(struct trapframe));
633 	regs->tf_rip = imgp->entry_addr;
634 	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
635 	regs->tf_rdi = stack;		/* argv */
636 	regs->tf_rflags = PSL_USER | saved_rflags;
637 	regs->tf_ss = _udatasel;
638 	regs->tf_cs = _ucodesel;
639 	regs->tf_ds = _udatasel;
640 	regs->tf_es = _udatasel;
641 	regs->tf_fs = _ufssel;
642 	regs->tf_gs = _ugssel;
643 	regs->tf_flags = TF_HASSEGS;
644 
645 	x86_clear_dbregs(pcb);
646 
647 	/*
648 	 * Drop the FP state if we hold it, so that the process gets a
649 	 * clean FP state if it uses the FPU again.
650 	 */
651 	fpstate_drop(td);
652 }
653 
654 void
655 cpu_setregs(void)
656 {
657 	register_t cr0;
658 
659 	cr0 = rcr0();
660 	/*
661 	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
662 	 * BSP.  See the comments there about why we set them.
663 	 */
664 	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
665 	load_cr0(cr0);
666 }
667 
668 /*
669  * Initialize amd64 and configure to run kernel
670  */
671 
672 /*
673  * Initialize segments & interrupt table
674  */
675 static struct gate_descriptor idt0[NIDT];
676 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
677 
678 static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
679 static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
680 static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
681 static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
682 CTASSERT(sizeof(struct nmi_pcpu) == 16);
683 
684 /*
685  * Software prototypes -- in more palatable form.
686  *
687  * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
688  * slots as corresponding segments for i386 kernel.
689  */
690 struct soft_segment_descriptor gdt_segs[] = {
691 /* GNULL_SEL	0 Null Descriptor */
692 {	.ssd_base = 0x0,
693 	.ssd_limit = 0x0,
694 	.ssd_type = 0,
695 	.ssd_dpl = 0,
696 	.ssd_p = 0,
697 	.ssd_long = 0,
698 	.ssd_def32 = 0,
699 	.ssd_gran = 0		},
700 /* GNULL2_SEL	1 Null Descriptor */
701 {	.ssd_base = 0x0,
702 	.ssd_limit = 0x0,
703 	.ssd_type = 0,
704 	.ssd_dpl = 0,
705 	.ssd_p = 0,
706 	.ssd_long = 0,
707 	.ssd_def32 = 0,
708 	.ssd_gran = 0		},
709 /* GUFS32_SEL	2 32 bit %gs Descriptor for user */
710 {	.ssd_base = 0x0,
711 	.ssd_limit = 0xfffff,
712 	.ssd_type = SDT_MEMRWA,
713 	.ssd_dpl = SEL_UPL,
714 	.ssd_p = 1,
715 	.ssd_long = 0,
716 	.ssd_def32 = 1,
717 	.ssd_gran = 1		},
718 /* GUGS32_SEL	3 32 bit %fs Descriptor for user */
719 {	.ssd_base = 0x0,
720 	.ssd_limit = 0xfffff,
721 	.ssd_type = SDT_MEMRWA,
722 	.ssd_dpl = SEL_UPL,
723 	.ssd_p = 1,
724 	.ssd_long = 0,
725 	.ssd_def32 = 1,
726 	.ssd_gran = 1		},
727 /* GCODE_SEL	4 Code Descriptor for kernel */
728 {	.ssd_base = 0x0,
729 	.ssd_limit = 0xfffff,
730 	.ssd_type = SDT_MEMERA,
731 	.ssd_dpl = SEL_KPL,
732 	.ssd_p = 1,
733 	.ssd_long = 1,
734 	.ssd_def32 = 0,
735 	.ssd_gran = 1		},
736 /* GDATA_SEL	5 Data Descriptor for kernel */
737 {	.ssd_base = 0x0,
738 	.ssd_limit = 0xfffff,
739 	.ssd_type = SDT_MEMRWA,
740 	.ssd_dpl = SEL_KPL,
741 	.ssd_p = 1,
742 	.ssd_long = 1,
743 	.ssd_def32 = 0,
744 	.ssd_gran = 1		},
745 /* GUCODE32_SEL	6 32 bit Code Descriptor for user */
746 {	.ssd_base = 0x0,
747 	.ssd_limit = 0xfffff,
748 	.ssd_type = SDT_MEMERA,
749 	.ssd_dpl = SEL_UPL,
750 	.ssd_p = 1,
751 	.ssd_long = 0,
752 	.ssd_def32 = 1,
753 	.ssd_gran = 1		},
754 /* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
755 {	.ssd_base = 0x0,
756 	.ssd_limit = 0xfffff,
757 	.ssd_type = SDT_MEMRWA,
758 	.ssd_dpl = SEL_UPL,
759 	.ssd_p = 1,
760 	.ssd_long = 0,
761 	.ssd_def32 = 1,
762 	.ssd_gran = 1		},
763 /* GUCODE_SEL	8 64 bit Code Descriptor for user */
764 {	.ssd_base = 0x0,
765 	.ssd_limit = 0xfffff,
766 	.ssd_type = SDT_MEMERA,
767 	.ssd_dpl = SEL_UPL,
768 	.ssd_p = 1,
769 	.ssd_long = 1,
770 	.ssd_def32 = 0,
771 	.ssd_gran = 1		},
772 /* GPROC0_SEL	9 Proc 0 Tss Descriptor */
773 {	.ssd_base = 0x0,
774 	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
775 	.ssd_type = SDT_SYSTSS,
776 	.ssd_dpl = SEL_KPL,
777 	.ssd_p = 1,
778 	.ssd_long = 0,
779 	.ssd_def32 = 0,
780 	.ssd_gran = 0		},
781 /* Actually, the TSS is a system descriptor which is double size */
782 {	.ssd_base = 0x0,
783 	.ssd_limit = 0x0,
784 	.ssd_type = 0,
785 	.ssd_dpl = 0,
786 	.ssd_p = 0,
787 	.ssd_long = 0,
788 	.ssd_def32 = 0,
789 	.ssd_gran = 0		},
790 /* GUSERLDT_SEL	11 LDT Descriptor */
791 {	.ssd_base = 0x0,
792 	.ssd_limit = 0x0,
793 	.ssd_type = 0,
794 	.ssd_dpl = 0,
795 	.ssd_p = 0,
796 	.ssd_long = 0,
797 	.ssd_def32 = 0,
798 	.ssd_gran = 0		},
799 /* GUSERLDT_SEL	12 LDT Descriptor, double size */
800 {	.ssd_base = 0x0,
801 	.ssd_limit = 0x0,
802 	.ssd_type = 0,
803 	.ssd_dpl = 0,
804 	.ssd_p = 0,
805 	.ssd_long = 0,
806 	.ssd_def32 = 0,
807 	.ssd_gran = 0		},
808 };
809 _Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");
810 
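/*
 * Install one IDT gate: idx is the vector, func the handler, typ/dpl the
 * gate type and maximum caller privilege, and ist selects an Interrupt
 * Stack Table entry (0 means no IST stack is used).
 */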
811 void
812 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
813 {
814 	struct gate_descriptor *ip;
815 
816 	ip = idt + idx;
817 	ip->gd_looffset = (uintptr_t)func;
818 	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
819 	ip->gd_ist = ist;
820 	ip->gd_xx = 0;
821 	ip->gd_type = typ;
822 	ip->gd_dpl = dpl;
823 	ip->gd_p = 1;
824 	ip->gd_hioffset = ((uintptr_t)func) >> 16;
825 }
826 
827 extern inthand_t
828 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
829 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
830 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
831 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
832 	IDTVEC(xmm), IDTVEC(dblfault),
833 	IDTVEC(div_pti), IDTVEC(bpt_pti),
834 	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
835 	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
836 	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
837 	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
838 	IDTVEC(xmm_pti),
839 #ifdef KDTRACE_HOOKS
840 	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
841 #endif
842 #ifdef XENHVM
843 	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
844 #endif
845 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
846 	IDTVEC(fast_syscall_pti);
847 
848 #ifdef DDB
849 /*
850  * Display the index and function name of any IDT entries that don't use
851  * the default 'rsvd' entry point.
852  */
853 DB_SHOW_COMMAND(idt, db_show_idt)
854 {
855 	struct gate_descriptor *ip;
856 	int idx;
857 	uintptr_t func;
858 
859 	ip = idt;
860 	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
861 		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
862 		if (func != (uintptr_t)&IDTVEC(rsvd)) {
863 			db_printf("%3d\t", idx);
864 			db_printsym(func, DB_STGY_PROC);
865 			db_printf("\n");
866 		}
867 		ip++;
868 	}
869 }
870 
871 /* Show privileged registers. */
872 DB_SHOW_COMMAND(sysregs, db_show_sysregs)
873 {
874 	struct {
875 		uint16_t limit;
876 		uint64_t base;
877 	} __packed idtr, gdtr;
878 	uint16_t ldt, tr;
879 
880 	__asm __volatile("sidt %0" : "=m" (idtr));
881 	db_printf("idtr\t0x%016lx/%04x\n",
882 	    (u_long)idtr.base, (u_int)idtr.limit);
883 	__asm __volatile("sgdt %0" : "=m" (gdtr));
884 	db_printf("gdtr\t0x%016lx/%04x\n",
885 	    (u_long)gdtr.base, (u_int)gdtr.limit);
886 	__asm __volatile("sldt %0" : "=r" (ldt));
887 	db_printf("ldtr\t0x%04x\n", ldt);
888 	__asm __volatile("str %0" : "=r" (tr));
889 	db_printf("tr\t0x%04x\n", tr);
890 	db_printf("cr0\t0x%016lx\n", rcr0());
891 	db_printf("cr2\t0x%016lx\n", rcr2());
892 	db_printf("cr3\t0x%016lx\n", rcr3());
893 	db_printf("cr4\t0x%016lx\n", rcr4());
894 	if (rcr4() & CR4_XSAVE)
895 		db_printf("xcr0\t0x%016lx\n", rxcr(0));
896 	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
897 	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
898 		db_printf("FEATURES_CTL\t%016lx\n",
899 		    rdmsr(MSR_IA32_FEATURE_CONTROL));
900 	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
901 	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
902 	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
903 }
904 
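/* Show the x86 hardware debug registers. */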
905 DB_SHOW_COMMAND(dbregs, db_show_dbregs)
906 {
907 
908 	db_printf("dr0\t0x%016lx\n", rdr0());
909 	db_printf("dr1\t0x%016lx\n", rdr1());
910 	db_printf("dr2\t0x%016lx\n", rdr2());
911 	db_printf("dr3\t0x%016lx\n", rdr3());
912 	db_printf("dr6\t0x%016lx\n", rdr6());
913 	db_printf("dr7\t0x%016lx\n", rdr7());
914 }
915 #endif
916 
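/* Convert a hardware user segment descriptor to its software form. */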
917 void
918 sdtossd(sd, ssd)
919 	struct user_segment_descriptor *sd;
920 	struct soft_segment_descriptor *ssd;
921 {
922 
923 	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
924 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
925 	ssd->ssd_type  = sd->sd_type;
926 	ssd->ssd_dpl   = sd->sd_dpl;
927 	ssd->ssd_p     = sd->sd_p;
928 	ssd->ssd_long  = sd->sd_long;
929 	ssd->ssd_def32 = sd->sd_def32;
930 	ssd->ssd_gran  = sd->sd_gran;
931 }
932 
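/* Convert a software segment descriptor to a hardware user segment descriptor. */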
933 void
934 ssdtosd(ssd, sd)
935 	struct soft_segment_descriptor *ssd;
936 	struct user_segment_descriptor *sd;
937 {
938 
939 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
940 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
941 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
942 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
943 	sd->sd_type  = ssd->ssd_type;
944 	sd->sd_dpl   = ssd->ssd_dpl;
945 	sd->sd_p     = ssd->ssd_p;
946 	sd->sd_long  = ssd->ssd_long;
947 	sd->sd_def32 = ssd->ssd_def32;
948 	sd->sd_gran  = ssd->ssd_gran;
949 }
950 
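/* Convert a software segment descriptor to a hardware system (TSS/LDT) descriptor. */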
951 void
952 ssdtosyssd(ssd, sd)
953 	struct soft_segment_descriptor *ssd;
954 	struct system_segment_descriptor *sd;
955 {
956 
957 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
958 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
959 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
960 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
961 	sd->sd_type  = ssd->ssd_type;
962 	sd->sd_dpl   = ssd->ssd_dpl;
963 	sd->sd_p     = ssd->ssd_p;
964 	sd->sd_gran  = ssd->ssd_gran;
965 }
966 
967 u_int basemem;
968 
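/*
 * Add the range [base, base + length) to the physmap array, merging it with
 * an adjacent entry where possible.  Overlapping and zero-length regions are
 * ignored.  Returns 0 only when the array is full, telling the caller to stop.
 */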
969 static int
970 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
971     int *physmap_idxp)
972 {
973 	int i, insert_idx, physmap_idx;
974 
975 	physmap_idx = *physmap_idxp;
976 
977 	if (length == 0)
978 		return (1);
979 
980 	/*
981 	 * Find insertion point while checking for overlap.  Start off by
982 	 * assuming the new entry will be added to the end.
983 	 *
984 	 * NB: physmap_idx points to the next free slot.
985 	 */
986 	insert_idx = physmap_idx;
987 	for (i = 0; i <= physmap_idx; i += 2) {
988 		if (base < physmap[i + 1]) {
989 			if (base + length <= physmap[i]) {
990 				insert_idx = i;
991 				break;
992 			}
993 			if (boothowto & RB_VERBOSE)
994 				printf(
995 		    "Overlapping memory regions, ignoring second region\n");
996 			return (1);
997 		}
998 	}
999 
1000 	/* See if we can prepend to the next entry. */
1001 	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
1002 		physmap[insert_idx] = base;
1003 		return (1);
1004 	}
1005 
1006 	/* See if we can append to the previous entry. */
1007 	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
1008 		physmap[insert_idx - 1] += length;
1009 		return (1);
1010 	}
1011 
1012 	physmap_idx += 2;
1013 	*physmap_idxp = physmap_idx;
1014 	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
1015 		printf(
1016 		"Too many segments in the physical address map, giving up\n");
1017 		return (0);
1018 	}
1019 
1020 	/*
1021 	 * Move the last 'N' entries down to make room for the new
1022 	 * entry if needed.
1023 	 */
1024 	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
1025 		physmap[i] = physmap[i - 2];
1026 		physmap[i + 1] = physmap[i - 1];
1027 	}
1028 
1029 	/* Insert the new entry. */
1030 	physmap[insert_idx] = base;
1031 	physmap[insert_idx + 1] = base + length;
1032 	return (1);
1033 }
1034 
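/*
 * Walk the BIOS SMAP (INT 15h/E820) array supplied by the loader and add
 * each SMAP_TYPE_MEMORY range to the physmap array.
 */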
1035 void
1036 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
1037                       vm_paddr_t *physmap, int *physmap_idx)
1038 {
1039 	struct bios_smap *smap, *smapend;
1040 
1041 	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
1042 
1043 	for (smap = smapbase; smap < smapend; smap++) {
1044 		if (boothowto & RB_VERBOSE)
1045 			printf("SMAP type=%02x base=%016lx len=%016lx\n",
1046 			    smap->type, smap->base, smap->length);
1047 
1048 		if (smap->type != SMAP_TYPE_MEMORY)
1049 			continue;
1050 
1051 		if (!add_physmap_entry(smap->base, smap->length, physmap,
1052 		    physmap_idx))
1053 			break;
1054 	}
1055 }
1056 
1057 static void
1058 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
1059     int *physmap_idx)
1060 {
1061 	struct efi_md *map, *p;
1062 	const char *type;
1063 	size_t efisz;
1064 	int ndesc, i;
1065 
1066 	static const char *types[] = {
1067 		"Reserved",
1068 		"LoaderCode",
1069 		"LoaderData",
1070 		"BootServicesCode",
1071 		"BootServicesData",
1072 		"RuntimeServicesCode",
1073 		"RuntimeServicesData",
1074 		"ConventionalMemory",
1075 		"UnusableMemory",
1076 		"ACPIReclaimMemory",
1077 		"ACPIMemoryNVS",
1078 		"MemoryMappedIO",
1079 		"MemoryMappedIOPortSpace",
1080 		"PalCode",
1081 		"PersistentMemory"
1082 	};
1083 
1084 	/*
1085 	 * Memory map data provided by UEFI via the GetMemoryMap
1086 	 * Boot Services API.
1087 	 */
1088 	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
1089 	map = (struct efi_md *)((uint8_t *)efihdr + efisz);
1090 
1091 	if (efihdr->descriptor_size == 0)
1092 		return;
1093 	ndesc = efihdr->memory_size / efihdr->descriptor_size;
1094 
1095 	if (boothowto & RB_VERBOSE)
1096 		printf("%23s %12s %12s %8s %4s\n",
1097 		    "Type", "Physical", "Virtual", "#Pages", "Attr");
1098 
1099 	for (i = 0, p = map; i < ndesc; i++,
1100 	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
1101 		if (boothowto & RB_VERBOSE) {
1102 			if (p->md_type < nitems(types))
1103 				type = types[p->md_type];
1104 			else
1105 				type = "<INVALID>";
1106 			printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
1107 			    p->md_virt, p->md_pages);
1108 			if (p->md_attr & EFI_MD_ATTR_UC)
1109 				printf("UC ");
1110 			if (p->md_attr & EFI_MD_ATTR_WC)
1111 				printf("WC ");
1112 			if (p->md_attr & EFI_MD_ATTR_WT)
1113 				printf("WT ");
1114 			if (p->md_attr & EFI_MD_ATTR_WB)
1115 				printf("WB ");
1116 			if (p->md_attr & EFI_MD_ATTR_UCE)
1117 				printf("UCE ");
1118 			if (p->md_attr & EFI_MD_ATTR_WP)
1119 				printf("WP ");
1120 			if (p->md_attr & EFI_MD_ATTR_RP)
1121 				printf("RP ");
1122 			if (p->md_attr & EFI_MD_ATTR_XP)
1123 				printf("XP ");
1124 			if (p->md_attr & EFI_MD_ATTR_NV)
1125 				printf("NV ");
1126 			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
1127 				printf("MORE_RELIABLE ");
1128 			if (p->md_attr & EFI_MD_ATTR_RO)
1129 				printf("RO ");
1130 			if (p->md_attr & EFI_MD_ATTR_RT)
1131 				printf("RUNTIME");
1132 			printf("\n");
1133 		}
1134 
1135 		switch (p->md_type) {
1136 		case EFI_MD_TYPE_CODE:
1137 		case EFI_MD_TYPE_DATA:
1138 		case EFI_MD_TYPE_BS_CODE:
1139 		case EFI_MD_TYPE_BS_DATA:
1140 		case EFI_MD_TYPE_FREE:
1141 			/*
1142 			 * We're allowed to use any entry with these types.
1143 			 */
1144 			break;
1145 		default:
1146 			continue;
1147 		}
1148 
1149 		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
1150 		    physmap, physmap_idx))
1151 			break;
1152 	}
1153 }
1154 
1155 static void
1156 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
1157 {
1158 	struct bios_smap *smap;
1159 	struct efi_map_header *efihdr;
1160 	u_int32_t size;
1161 
1162 	/*
1163 	 * Memory map from INT 15:E820.
1164 	 *
1165 	 * subr_module.c says:
1166 	 * "Consumer may safely assume that size value precedes data."
1167 	 * i.e., a u_int32_t immediately precedes smap.
1168 	 */
1169 
1170 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1171 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1172 	smap = (struct bios_smap *)preload_search_info(kmdp,
1173 	    MODINFO_METADATA | MODINFOMD_SMAP);
1174 	if (efihdr == NULL && smap == NULL)
1175 		panic("No BIOS smap or EFI map info from loader!");
1176 
1177 	if (efihdr != NULL) {
1178 		add_efi_map_entries(efihdr, physmap, physmap_idx);
1179 		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
1180 	} else {
1181 		size = *((u_int32_t *)smap - 1);
1182 		bios_add_smap_entries(smap, size, physmap, physmap_idx);
1183 		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
1184 	}
1185 }
1186 
1187 #define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)
1188 
1189 /*
1190  * Populate the (physmap) array with base/bound pairs describing the
1191  * available physical memory in the system, then test this memory and
1192  * build the phys_avail array describing the actually-available memory.
1193  *
1194  * Total memory size may be set by the kernel environment variable
1195  * hw.physmem or the compile-time define MAXMEM.
1196  *
1197  * XXX first should be vm_paddr_t.
1198  */
1199 static void
1200 getmemsize(caddr_t kmdp, u_int64_t first)
1201 {
1202 	int i, physmap_idx, pa_indx, da_indx;
1203 	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
1204 	u_long physmem_start, physmem_tunable, memtest;
1205 	pt_entry_t *pte;
1206 	quad_t dcons_addr, dcons_size;
1207 	int page_counter;
1208 
1209 	/*
1210 	 * Tell the physical memory allocator about pages used to store
1211 	 * the kernel and preloaded data.  See kmem_bootstrap_free().
1212 	 */
1213 	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));
1214 
1215 	bzero(physmap, sizeof(physmap));
1216 	physmap_idx = 0;
1217 
1218 	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
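	/*
	 * parse_memmap leaves physmap_idx pointing at the next free slot;
	 * back it up so that it indexes the last base/end pair.
	 */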
1219 	physmap_idx -= 2;
1220 
1221 	/*
1222 	 * Find the 'base memory' segment for SMP
1223 	 */
1224 	basemem = 0;
1225 	for (i = 0; i <= physmap_idx; i += 2) {
1226 		if (physmap[i] <= 0xA0000) {
1227 			basemem = physmap[i + 1] / 1024;
1228 			break;
1229 		}
1230 	}
1231 	if (basemem == 0 || basemem > 640) {
1232 		if (bootverbose)
1233 			printf(
1234 		"Memory map doesn't contain a basemem segment, faking it\n");
1235 		basemem = 640;
1236 	}
1237 
1238 	/*
1239 	 * Maxmem isn't the "maximum memory", it's one larger than the
1240 	 * highest page of the physical address space.  It should be
1241 	 * called something like "Maxphyspage".  We may adjust this
1242 	 * based on ``hw.physmem'' and the results of the memory test.
1243 	 */
1244 	Maxmem = atop(physmap[physmap_idx + 1]);
1245 
1246 #ifdef MAXMEM
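	/* MAXMEM is given in kilobytes; with 4KB pages, /4 yields a page count. */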
1247 	Maxmem = MAXMEM / 4;
1248 #endif
1249 
1250 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
1251 		Maxmem = atop(physmem_tunable);
1252 
1253 	/*
1254 	 * The boot memory test is disabled by default, as it takes a
1255 	 * significant amount of time on large-memory systems, and is
1256 	 * unfriendly to virtual machines as it unnecessarily touches all
1257 	 * pages.
1258 	 *
1259 	 * A general name is used as the code may be extended to support
1260 	 * additional tests beyond the current "page present" test.
1261 	 */
1262 	memtest = 0;
1263 	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
1264 
1265 	/*
1266 	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
1267 	 * in the system.
1268 	 */
1269 	if (Maxmem > atop(physmap[physmap_idx + 1]))
1270 		Maxmem = atop(physmap[physmap_idx + 1]);
1271 
1272 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
1273 	    (boothowto & RB_VERBOSE))
1274 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
1275 
1276 	/* call pmap initialization to make new kernel address space */
1277 	pmap_bootstrap(&first);
1278 
1279 	/*
1280 	 * Size up each available chunk of physical memory.
1281 	 *
1282 	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
1283 	 * By default, mask off the first 16 pages unless we appear to be
1284 	 * running in a VM.
1285 	 */
1286 	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
1287 	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
1288 	if (physmap[0] < physmem_start) {
1289 		if (physmem_start < PAGE_SIZE)
1290 			physmap[0] = PAGE_SIZE;
1291 		else if (physmem_start >= physmap[1])
1292 			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
1293 		else
1294 			physmap[0] = round_page(physmem_start);
1295 	}
1296 	pa_indx = 0;
1297 	da_indx = 1;
1298 	phys_avail[pa_indx++] = physmap[0];
1299 	phys_avail[pa_indx] = physmap[0];
1300 	dump_avail[da_indx] = physmap[0];
1301 	pte = CMAP1;
1302 
1303 	/*
1304 	 * Get dcons buffer address
1305 	 */
1306 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
1307 	    getenv_quad("dcons.size", &dcons_size) == 0)
1308 		dcons_addr = 0;
1309 
1310 	/*
1311 	 * physmap is in bytes, so when converting to page boundaries,
1312 	 * round up the start address and round down the end address.
1313 	 */
1314 	page_counter = 0;
1315 	if (memtest != 0)
1316 		printf("Testing system memory");
1317 	for (i = 0; i <= physmap_idx; i += 2) {
1318 		vm_paddr_t end;
1319 
1320 		end = ptoa((vm_paddr_t)Maxmem);
1321 		if (physmap[i + 1] < end)
1322 			end = trunc_page(physmap[i + 1]);
1323 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
1324 			int tmp, page_bad, full;
1325 			int *ptr = (int *)CADDR1;
1326 
1327 			full = FALSE;
1328 			/*
1329 			 * block out kernel memory as not available.
1330 			 */
1331 			if (pa >= (vm_paddr_t)kernphys && pa < first)
1332 				goto do_dump_avail;
1333 
1334 			/*
1335 			 * block out dcons buffer
1336 			 */
1337 			if (dcons_addr > 0
1338 			    && pa >= trunc_page(dcons_addr)
1339 			    && pa < dcons_addr + dcons_size)
1340 				goto do_dump_avail;
1341 
1342 			page_bad = FALSE;
1343 			if (memtest == 0)
1344 				goto skip_memtest;
1345 
1346 			/*
1347 			 * Print a "." every GB to show we're making
1348 			 * progress.
1349 			 */
1350 			page_counter++;
1351 			if ((page_counter % PAGES_PER_GB) == 0)
1352 				printf(".");
1353 
1354 			/*
1355 			 * map page into kernel: valid, read/write, non-cacheable
1356 			 */
1357 			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
1358 			invltlb();
1359 
1360 			tmp = *(int *)ptr;
1361 			/*
1362 			 * Test for alternating 1's and 0's
1363 			 */
1364 			*(volatile int *)ptr = 0xaaaaaaaa;
1365 			if (*(volatile int *)ptr != 0xaaaaaaaa)
1366 				page_bad = TRUE;
1367 			/*
1368 			 * Test for alternating 0's and 1's
1369 			 */
1370 			*(volatile int *)ptr = 0x55555555;
1371 			if (*(volatile int *)ptr != 0x55555555)
1372 				page_bad = TRUE;
1373 			/*
1374 			 * Test for all 1's
1375 			 */
1376 			*(volatile int *)ptr = 0xffffffff;
1377 			if (*(volatile int *)ptr != 0xffffffff)
1378 				page_bad = TRUE;
1379 			/*
1380 			 * Test for all 0's
1381 			 */
1382 			*(volatile int *)ptr = 0x0;
1383 			if (*(volatile int *)ptr != 0x0)
1384 				page_bad = TRUE;
1385 			/*
1386 			 * Restore original value.
1387 			 */
1388 			*(int *)ptr = tmp;
1389 
1390 skip_memtest:
1391 			/*
1392 			 * Adjust array of valid/good pages.
1393 			 */
1394 			if (page_bad == TRUE)
1395 				continue;
1396 			/*
1397 			 * If this good page is a continuation of the
1398 			 * previous set of good pages, then just increase
1399 			 * the end pointer. Otherwise start a new chunk.
1400 			 * Note that the recorded "end" points one page past the
1401 			 * last included page, making the range >= start and < end.
1402 			 * If we're also doing a speculative memory
1403 			 * test and we are at or past the end, bump up Maxmem
1404 			 * so that we keep going. The first bad page
1405 			 * will terminate the loop.
1406 			 */
1407 			if (phys_avail[pa_indx] == pa) {
1408 				phys_avail[pa_indx] += PAGE_SIZE;
1409 			} else {
1410 				pa_indx++;
1411 				if (pa_indx == PHYS_AVAIL_ENTRIES) {
1412 					printf(
1413 		"Too many holes in the physical address space, giving up\n");
1414 					pa_indx--;
1415 					full = TRUE;
1416 					goto do_dump_avail;
1417 				}
1418 				phys_avail[pa_indx++] = pa;	/* start */
1419 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
1420 			}
1421 			physmem++;
1422 do_dump_avail:
1423 			if (dump_avail[da_indx] == pa) {
1424 				dump_avail[da_indx] += PAGE_SIZE;
1425 			} else {
1426 				da_indx++;
1427 				if (da_indx == PHYS_AVAIL_ENTRIES) {
1428 					da_indx--;
1429 					goto do_next;
1430 				}
1431 				dump_avail[da_indx++] = pa; /* start */
1432 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
1433 			}
1434 do_next:
1435 			if (full)
1436 				break;
1437 		}
1438 	}
1439 	*pte = 0;
1440 	invltlb();
1441 	if (memtest != 0)
1442 		printf("\n");
1443 
1444 	/*
1445 	 * XXX
1446 	 * The last chunk must contain at least one page plus the message
1447 	 * buffer to avoid complicating other code (message buffer address
1448 	 * calculation, etc.).
1449 	 */
1450 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
1451 	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
1452 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1453 		phys_avail[pa_indx--] = 0;
1454 		phys_avail[pa_indx--] = 0;
1455 	}
1456 
1457 	Maxmem = atop(phys_avail[pa_indx]);
1458 
1459 	/* Trim off space for the message buffer. */
1460 	phys_avail[pa_indx] -= round_page(msgbufsize);
1461 
1462 	/* Map the message buffer. */
1463 	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
1464 }
1465 
1466 static caddr_t
1467 native_parse_preload_data(u_int64_t modulep)
1468 {
1469 	caddr_t kmdp;
1470 	char *envp;
1471 #ifdef DDB
1472 	vm_offset_t ksym_start;
1473 	vm_offset_t ksym_end;
1474 #endif
1475 
1476 	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
1477 	preload_bootstrap_relocate(KERNBASE);
1478 	kmdp = preload_search_by_type("elf kernel");
1479 	if (kmdp == NULL)
1480 		kmdp = preload_search_by_type("elf64 kernel");
1481 	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
1482 	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
1483 	if (envp != NULL)
1484 		envp += KERNBASE;
1485 	init_static_kenv(envp, 0);
1486 #ifdef DDB
1487 	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
1488 	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
1489 	db_fetch_ksymtab(ksym_start, ksym_end, 0);
1490 #endif
1491 	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
1492 
1493 	return (kmdp);
1494 }
1495 
1496 static void
1497 amd64_kdb_init(void)
1498 {
1499 	kdb_init();
1500 #ifdef KDB
1501 	if (boothowto & RB_KDB)
1502 		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
1503 #endif
1504 }
1505 
1506 /* Set up the fast syscall stuff */
1507 void
1508 amd64_conf_fast_syscall(void)
1509 {
1510 	uint64_t msr;
1511 
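	/*
	 * Enable SYSCALL/SYSRET in EFER, point LSTAR and CSTAR at the 64-bit
	 * and 32-bit (compat) entry points, load the kernel and user segment
	 * selectors into STAR, and list in SF_MASK the rflags bits cleared on
	 * syscall entry.
	 */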
1512 	msr = rdmsr(MSR_EFER) | EFER_SCE;
1513 	wrmsr(MSR_EFER, msr);
1514 	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1515 	    (u_int64_t)IDTVEC(fast_syscall));
1516 	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
1517 	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1518 	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1519 	wrmsr(MSR_STAR, msr);
1520 	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
1521 }
1522 
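/*
 * First-stage BSP per-CPU setup: record the pcpu pointer and curthread, and
 * point the TSS, LDT and 32-bit %fs/%gs descriptor pointers at the entries
 * in this CPU's GDT.
 */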
1523 void
1524 amd64_bsp_pcpu_init1(struct pcpu *pc)
1525 {
1526 	struct user_segment_descriptor *gdt;
1527 
1528 	PCPU_SET(prvspace, pc);
1529 	gdt = *PCPU_PTR(gdt);
1530 	PCPU_SET(curthread, &thread0);
1531 	PCPU_SET(tssp, PCPU_PTR(common_tss));
1532 	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1533 	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
1534 	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
1535 	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
1536 	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
1537 	PCPU_SET(smp_tlb_gen, 1);
1538 }
1539 
1540 void
1541 amd64_bsp_pcpu_init2(uint64_t rsp0)
1542 {
1543 
1544 	PCPU_SET(rsp0, rsp0);
1545 	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1546 	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1547 	PCPU_SET(curpcb, thread0.td_pcb);
1548 }
1549 
1550 void
1551 amd64_bsp_ist_init(struct pcpu *pc)
1552 {
1553 	struct nmi_pcpu *np;
1554 	struct amd64tss *tssp;
1555 
1556 	tssp = &pc->pc_common_tss;
1557 
1558 	/* doublefault stack space, runs on ist1 */
1559 	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
1560 	np->np_pcpu = (register_t)pc;
1561 	tssp->tss_ist1 = (long)np;
1562 
1563 	/*
1564 	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
1565 	 * above the start of the ist2 stack.
1566 	 */
1567 	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
1568 	np->np_pcpu = (register_t)pc;
1569 	tssp->tss_ist2 = (long)np;
1570 
1571 	/*
1572 	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
1573 	 * above the start of the ist3 stack.
1574 	 */
1575 	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
1576 	np->np_pcpu = (register_t)pc;
1577 	tssp->tss_ist3 = (long)np;
1578 
1579 	/*
1580 	 * DB# stack, runs on ist4.
1581 	 */
1582 	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
1583 	np->np_pcpu = (register_t)pc;
1584 	tssp->tss_ist4 = (long)np;
1585 }
1586 
1587 u_int64_t
1588 hammer_time(u_int64_t modulep, u_int64_t physfree)
1589 {
1590 	caddr_t kmdp;
1591 	int gsel_tss, x;
1592 	struct pcpu *pc;
1593 	struct xstate_hdr *xhdr;
1594 	uint64_t cr3, rsp0;
1595 	pml4_entry_t *pml4e;
1596 	pdp_entry_t *pdpe;
1597 	pd_entry_t *pde;
1598 	char *env;
1599 	struct user_segment_descriptor *gdt;
1600 	struct region_descriptor r_gdt;
1601 	size_t kstack0_sz;
1602 	int late_console;
1603 
1604 	TSRAW(&thread0, TS_ENTER, __func__, NULL);
1605 
1606 	/*
1607 	 * Calculate kernphys by inspecting page table created by loader.
1608 	 * The assumptions:
1609 	 * - kernel is mapped at KERNBASE, backed by contiguous phys memory
1610 	 *   aligned at 2M, below 4G (the latter is important for AP startup)
1611 	 * - there is a 2M hole at KERNBASE
1612 	 * - kernel is mapped with 2M superpages
1613 	 * - all participating memory, i.e. kernel, modules, metadata,
1614 	 *   page table is accessible by pre-created 1:1 mapping
1615 	 *   (right now loader creates 1:1 mapping for lower 4G, and all
1616 	 *   memory is from there)
1617 	 * - there is a usable memory block right after the end of the
1618 	 *   mapped kernel and all modules/metadata, pointed to by
1619 	 *   physfree, for early allocations
1620 	 */
1621 	cr3 = rcr3();
1622 	pml4e = (pml4_entry_t *)(cr3 & ~PAGE_MASK) + pmap_pml4e_index(
1623 	    (vm_offset_t)hammer_time);
1624 	pdpe = (pdp_entry_t *)(*pml4e & ~PAGE_MASK) + pmap_pdpe_index(
1625 	    (vm_offset_t)hammer_time);
1626 	pde = (pd_entry_t *)(*pdpe & ~PAGE_MASK) + pmap_pde_index(
1627 	    (vm_offset_t)hammer_time);
1628 	kernphys = (vm_paddr_t)(*pde & ~PDRMASK) -
1629 	    (vm_paddr_t)(((vm_offset_t)hammer_time - KERNBASE) & ~PDRMASK);
1630 
1631 	/* Fix-up for 2M hole */
1632 	physfree += kernphys;
1633 	kernphys += NBPDR;
1634 
1635 	kmdp = init_ops.parse_preload_data(modulep);
1636 
1637 	efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
1638 	    MODINFOMD_EFI_MAP) != NULL;
1639 
1640 	if (!efi_boot) {
1641 		/* Tell the bios to warmboot next time */
1642 		atomic_store_short((u_short *)0x472, 0x1234);
1643 	}
1644 
1645 	physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
1646 	physfree = roundup2(physfree, PAGE_SIZE);
1647 
1648 	identify_cpu1();
1649 	identify_hypervisor();
1650 	identify_cpu_fixup_bsp();
1651 	identify_cpu2();
1652 	initializecpucache();
1653 
1654 	/*
1655 	 * Check for pti, pcid, and invpcid before ifuncs are
1656 	 * resolved, to correctly select the implementation for
1657 	 * pmap_activate_sw_mode().
1658 	 */
1659 	pti = pti_get_default();
1660 	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1661 	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
1662 	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
1663 		invpcid_works = (cpu_stdext_feature &
1664 		    CPUID_STDEXT_INVPCID) != 0;
1665 	} else {
1666 		pmap_pcid_enabled = 0;
1667 	}
1668 
1669 	link_elf_ireloc(kmdp);
1670 
1671 	/*
1672 	 * This may be done better later if it gets more high level
1673 	 * components in it. If so just link td->td_proc here.
1674 	 */
1675 	proc_linkup0(&proc0, &thread0);
1676 
1677 	/* Init basic tunables, hz etc */
1678 	init_param1();
1679 
1680 	thread0.td_kstack = physfree - kernphys + KERNSTART;
1681 	thread0.td_kstack_pages = kstack_pages;
1682 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1683 	bzero((void *)thread0.td_kstack, kstack0_sz);
1684 	physfree += kstack0_sz;
1685 
1686 	/*
1687 	 * Initialize enough of thread0 for delayed invalidation to
1688 	 * work very early.  Rely on thread0.td_base_pri
1689 	 * zero-initialization, it is reset to PVM at proc0_init().
1690 	 */
1691 	pmap_thread_init_invl_gen(&thread0);
1692 
1693 	pc = &temp_bsp_pcpu;
1694 	pcpu_init(pc, 0, sizeof(struct pcpu));
1695 	gdt = &temp_bsp_pcpu.pc_gdt[0];
1696 
1697 	/*
1698 	 * make gdt memory segments
1699 	 */
1700 	for (x = 0; x < NGDT; x++) {
1701 		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1702 		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
1703 			ssdtosd(&gdt_segs[x], &gdt[x]);
1704 	}
1705 	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
1706 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
1707 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1708 
1709 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1710 	r_gdt.rd_base = (long)gdt;
1711 	lgdt(&r_gdt);
1712 
1713 	wrmsr(MSR_FSBASE, 0);		/* User value */
1714 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
1715 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
1716 
1717 	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
1718 	physfree += DPCPU_SIZE;
1719 	amd64_bsp_pcpu_init1(pc);
1720 	/* Non-late cninit() and printf() can be moved up to here. */
1721 
1722 	/*
1723 	 * Initialize mutexes.
1724 	 *
1725 	 * icu_lock: in order to allow an interrupt to occur in a critical
1726 	 * 	     section, to set pcpu->ipending (etc...) properly, we
1727 	 *	     must be able to get the icu lock, so it can't be
1728 	 *	     under witness.
1729 	 */
1730 	mutex_init();
1731 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1732 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1733 
1734 	/* exceptions */
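	/*
	 * The final setidt() argument selects an IST stack: 1 = double fault,
	 * 2 = NMI, 3 = MC#, 4 = DB#, matching the stacks set up by
	 * amd64_bsp_ist_init(); 0 means the normal kernel stack is used.
	 */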
1735 	for (x = 0; x < NIDT; x++)
1736 		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1737 		    SEL_KPL, 0);
1738 	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1739 	    SEL_KPL, 0);
1740 	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1741 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
1742 	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1743 	    SEL_UPL, 0);
1744 	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1745 	    SEL_UPL, 0);
1746 	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1747 	    SEL_KPL, 0);
1748 	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1749 	    SEL_KPL, 0);
1750 	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1751 	    SEL_KPL, 0);
1752 	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1753 	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1754 	    SDT_SYSIGT, SEL_KPL, 0);
1755 	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1756 	    SEL_KPL, 0);
1757 	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1758 	    SDT_SYSIGT, SEL_KPL, 0);
1759 	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1760 	    SEL_KPL, 0);
1761 	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1762 	    SEL_KPL, 0);
1763 	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1764 	    SEL_KPL, 0);
1765 	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1766 	    SEL_KPL, 0);
1767 	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1768 	    SEL_KPL, 0);
1769 	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1770 	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1771 	    SEL_KPL, 0);
1772 #ifdef KDTRACE_HOOKS
1773 	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1774 	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1775 #endif
1776 #ifdef XENHVM
1777 	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1778 	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1779 #endif
1780 	r_idt.rd_limit = sizeof(idt0) - 1;
1781 	r_idt.rd_base = (long) idt;
1782 	lidt(&r_idt);
1783 
1784 	/*
1785 	 * Initialize the clock before the console so that console
1786 	 * initialization can use DELAY().
1787 	 */
1788 	clock_init();
1789 
1790 	/*
1791 	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
1792 	 * transition).
1793 	 * Once bootblocks have been updated, we can test directly for
1794 	 * efi_systbl != NULL here...
1795 	 */
1796 	if (efi_boot)
1797 		vty_set_preferred(VTY_VT);
1798 
1799 	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1800 	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);
1801 
1802 	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1803 	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);
1804 
1805 	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
1806 	    &syscall_ret_l1d_flush_mode);
1807 
1808 	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1809 	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);
1810 
1811 	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
1812 
1813 	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
1814 	    &x86_rngds_mitg_enable);
1815 
1816 	finishidentcpu();	/* Final stage of CPU initialization */
1817 	initializecpu();	/* Initialize CPU registers */
1818 
1819 	amd64_bsp_ist_init(pc);
1820 
1821 	/* Set the IO permission bitmap (empty due to tss seg limit) */
1822 	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
1823 	    IOPERM_BITMAP_SIZE;
1824 
1825 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1826 	ltr(gsel_tss);
1827 
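	/*
	 * Configure the SYSCALL/SYSRET fast system call path (syscall entry
	 * points and flag mask in the STAR/LSTAR/CSTAR/SF_MASK MSRs).
	 */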
1828 	amd64_conf_fast_syscall();
1829 
1830 	/*
1831 	 * We initialize the PCB pointer early so that exception
1832 	 * handlers will work.  Also set up td_critnest to short-cut
1833 	 * the page fault handler.
1834 	 */
1835 	cpu_max_ext_state_size = sizeof(struct savefpu);
1836 	set_top_of_stack_td(&thread0);
1837 	thread0.td_pcb = get_pcb_td(&thread0);
1838 	thread0.td_critnest = 1;
1839 
1840 	/*
1841 	 * The console and kdb should be initialized even earlier than here,
1842 	 * but some console drivers don't work until after getmemsize().
1843 	 * Default to late console initialization to support these drivers.
1844 	 * This loses mainly printf()s in getmemsize() and early debugging.
1845 	 */
1846 	late_console = 1;
1847 	TUNABLE_INT_FETCH("debug.late_console", &late_console);
1848 	if (!late_console) {
1849 		cninit();
1850 		amd64_kdb_init();
1851 	}
1852 
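	/*
	 * Discover the physical memory layout from the loader-supplied
	 * memory map and recompute the tunables that are sized from the
	 * amount of physical memory.
	 */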
1853 	getmemsize(kmdp, physfree);
1854 	init_param2(physmem);
1855 
1856 	/* Now running on new page tables, configured, and u/iom is accessible. */
1857 
1858 #ifdef DEV_PCI
1859 	/* This call might adjust phys_avail[]. */
1860 	pci_early_quirks();
1861 #endif
1862 
1863 	if (late_console)
1864 		cninit();
1865 
1866 	/*
1867 	 * Dump the boot metadata. We have to wait for cninit() since console
1868 	 * output is required. If it's grossly incorrect the kernel will never
1869 	 * make it this far.
1870 	 */
1871 	if (getenv_is_true("debug.dump_modinfo_at_boot"))
1872 		preload_dump();
1873 
1874 #ifdef DEV_ISA
1875 #ifdef DEV_ATPIC
1876 	elcr_probe();
1877 	atpic_startup();
1878 #else
1879 	/* Reset and mask the atpics and leave them shut down. */
1880 	atpic_reset();
1881 
1882 	/*
1883 	 * Point the ICU spurious interrupt vectors at the APIC spurious
1884 	 * interrupt handler.
1885 	 */
1886 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1887 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1888 #endif
1889 #else
1890 #error "have you forgotten the isa device?"
1891 #endif
1892 
1893 	if (late_console)
1894 		amd64_kdb_init();
1895 
1896 	msgbufinit(msgbufp, msgbufsize);
1897 	fpuinit();
1898 
1899 	/*
1900 	 * Reinitialize thread0's stack base now that the xsave area size is
1901 	 * known.  Set up thread0's pcb save area after fpuinit() has calculated
1902 	 * the FPU save area size.  Zero out the extended state header in it.
1903 	 */
1904 	set_top_of_stack_td(&thread0);
1905 	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
1906 	bzero(thread0.td_pcb->pcb_save, cpu_max_ext_state_size);
1907 	if (use_xsave) {
1908 		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
1909 		    1);
1910 		xhdr->xstate_bv = xsave_mask;
1911 	}
1912 	/* Make an initial TSS so the CPU can get an interrupt stack on syscall. */
1913 	rsp0 = thread0.td_md.md_stack_base;
1914 	/* Ensure the stack is aligned to 16 bytes */
1915 	rsp0 &= ~0xFul;
1916 	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
1917 	amd64_bsp_pcpu_init2(rsp0);
1918 
1919 	/* Set up the segment selectors used when returning to user mode. */
1920 
1921 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1922 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1923 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1924 	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1925 	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1926 
1927 	load_ds(_udatasel);
1928 	load_es(_udatasel);
1929 	load_fs(_ufssel);
1930 
1931 	/* Set up proc 0's pcb. */
1932 	thread0.td_pcb->pcb_flags = 0;
1933 	thread0.td_frame = &proc0_tf;
1934 
1935 	env = kern_getenv("kernelname");
1936 	if (env != NULL)
1937 		strlcpy(kernelname, env, sizeof(kernelname));
1938 
1939 	kcsan_cpu_init(0);
1940 
1941 #ifdef FDT
1942 	x86_init_fdt();
1943 #endif
1944 	thread0.td_critnest = 0;
1945 
1946 	kasan_init();
1947 	kmsan_init();
1948 
1949 	TSEXIT();
1950 
1951 	/* Location of kernel stack for locore */
1952 	return (thread0.td_md.md_stack_base);
1953 }
1954 
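/*
 * Machine-dependent per-CPU data initialization.  The ACPI id starts out as
 * "unknown" and is expected to be filled in later by the platform
 * enumeration code.
 */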
1955 void
1956 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1957 {
1958 
1959 	pcpu->pc_acpi_id = 0xffffffff;
1960 }
1961 
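/*
 * Export the raw BIOS SMAP (e820) memory map passed in by the loader as the
 * machdep.smap sysctl.
 */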
1962 static int
1963 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1964 {
1965 	struct bios_smap *smapbase;
1966 	struct bios_smap_xattr smap;
1967 	caddr_t kmdp;
1968 	uint32_t *smapattr;
1969 	int count, error, i;
1970 
1971 	/* Retrieve the system memory map from the loader. */
1972 	kmdp = preload_search_by_type("elf kernel");
1973 	if (kmdp == NULL)
1974 		kmdp = preload_search_by_type("elf64 kernel");
1975 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
1976 	    MODINFO_METADATA | MODINFOMD_SMAP);
1977 	if (smapbase == NULL)
1978 		return (0);
1979 	smapattr = (uint32_t *)preload_search_info(kmdp,
1980 	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1981 	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1982 	error = 0;
1983 	for (i = 0; i < count; i++) {
1984 		smap.base = smapbase[i].base;
1985 		smap.length = smapbase[i].length;
1986 		smap.type = smapbase[i].type;
1987 		if (smapattr != NULL)
1988 			smap.xattr = smapattr[i];
1989 		else
1990 			smap.xattr = 0;
1991 		error = SYSCTL_OUT(req, &smap, sizeof(smap));
1992 	}
1993 	return (error);
1994 }
1995 SYSCTL_PROC(_machdep, OID_AUTO, smap,
1996     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1997     smap_sysctl_handler, "S,bios_smap_xattr",
1998     "Raw BIOS SMAP data");
1999 
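/*
 * Likewise, export the raw EFI memory map from the loader metadata as the
 * machdep.efi_map sysctl.
 */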
2000 static int
2001 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
2002 {
2003 	struct efi_map_header *efihdr;
2004 	caddr_t kmdp;
2005 	uint32_t efisize;
2006 
2007 	kmdp = preload_search_by_type("elf kernel");
2008 	if (kmdp == NULL)
2009 		kmdp = preload_search_by_type("elf64 kernel");
2010 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
2011 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
2012 	if (efihdr == NULL)
2013 		return (0);
2014 	efisize = *((uint32_t *)efihdr - 1);
2015 	return (SYSCTL_OUT(req, efihdr, efisize));
2016 }
2017 SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
2018     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
2019     efi_map_sysctl_handler, "S,efi_map_header",
2020     "Raw EFI Memory Map");
2021 
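/*
 * MD spin lock hooks: the first spin lock acquired by a thread disables
 * interrupts and enters a critical section; further acquisitions only bump
 * the per-thread nesting count, and the saved interrupt state is restored
 * when the outermost spin lock is released.
 */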
2022 void
2023 spinlock_enter(void)
2024 {
2025 	struct thread *td;
2026 	register_t flags;
2027 
2028 	td = curthread;
2029 	if (td->td_md.md_spinlock_count == 0) {
2030 		flags = intr_disable();
2031 		td->td_md.md_spinlock_count = 1;
2032 		td->td_md.md_saved_flags = flags;
2033 		critical_enter();
2034 	} else
2035 		td->td_md.md_spinlock_count++;
2036 }
2037 
2038 void
2039 spinlock_exit(void)
2040 {
2041 	struct thread *td;
2042 	register_t flags;
2043 
2044 	td = curthread;
2045 	flags = td->td_md.md_saved_flags;
2046 	td->td_md.md_spinlock_count--;
2047 	if (td->td_md.md_spinlock_count == 0) {
2048 		critical_exit();
2049 		intr_restore(flags);
2050 	}
2051 }
2052 
2053 /*
2054  * Construct a PCB from a trapframe. This is called from kdb_trap() where
2055  * we want to start a backtrace from the function that caused us to enter
2056  * the debugger. We have the context in the trapframe, but base the trace
2057  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
2058  * enough for a backtrace.
2059  */
2060 void
2061 makectx(struct trapframe *tf, struct pcb *pcb)
2062 {
2063 
2064 	pcb->pcb_r12 = tf->tf_r12;
2065 	pcb->pcb_r13 = tf->tf_r13;
2066 	pcb->pcb_r14 = tf->tf_r14;
2067 	pcb->pcb_r15 = tf->tf_r15;
2068 	pcb->pcb_rbp = tf->tf_rbp;
2069 	pcb->pcb_rbx = tf->tf_rbx;
2070 	pcb->pcb_rip = tf->tf_rip;
2071 	pcb->pcb_rsp = tf->tf_rsp;
2072 }
2073 
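/*
 * ptrace(2) machine-dependent helpers: set the user %rip, and enable or
 * disable single stepping by toggling the trace flag (PSL_T) in the saved
 * rflags.
 */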
2074 int
2075 ptrace_set_pc(struct thread *td, unsigned long addr)
2076 {
2077 
2078 	td->td_frame->tf_rip = addr;
2079 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
2080 	return (0);
2081 }
2082 
2083 int
2084 ptrace_single_step(struct thread *td)
2085 {
2086 
2087 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2088 	if ((td->td_frame->tf_rflags & PSL_T) == 0) {
2089 		td->td_frame->tf_rflags |= PSL_T;
2090 		td->td_dbgflags |= TDB_STEP;
2091 	}
2092 	return (0);
2093 }
2094 
2095 int
2096 ptrace_clear_single_step(struct thread *td)
2097 {
2098 
2099 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2100 	td->td_frame->tf_rflags &= ~PSL_T;
2101 	td->td_dbgflags &= ~TDB_STEP;
2102 	return (0);
2103 }
2104 
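/*
 * Copy the general purpose registers from a trapframe into the exported
 * struct reg layout used by ptrace(2) and core dumps.
 */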
2105 int
2106 fill_regs(struct thread *td, struct reg *regs)
2107 {
2108 	struct trapframe *tp;
2109 
2110 	tp = td->td_frame;
2111 	return (fill_frame_regs(tp, regs));
2112 }
2113 
2114 int
2115 fill_frame_regs(struct trapframe *tp, struct reg *regs)
2116 {
2117 
2118 	regs->r_r15 = tp->tf_r15;
2119 	regs->r_r14 = tp->tf_r14;
2120 	regs->r_r13 = tp->tf_r13;
2121 	regs->r_r12 = tp->tf_r12;
2122 	regs->r_r11 = tp->tf_r11;
2123 	regs->r_r10 = tp->tf_r10;
2124 	regs->r_r9  = tp->tf_r9;
2125 	regs->r_r8  = tp->tf_r8;
2126 	regs->r_rdi = tp->tf_rdi;
2127 	regs->r_rsi = tp->tf_rsi;
2128 	regs->r_rbp = tp->tf_rbp;
2129 	regs->r_rbx = tp->tf_rbx;
2130 	regs->r_rdx = tp->tf_rdx;
2131 	regs->r_rcx = tp->tf_rcx;
2132 	regs->r_rax = tp->tf_rax;
2133 	regs->r_rip = tp->tf_rip;
2134 	regs->r_cs = tp->tf_cs;
2135 	regs->r_rflags = tp->tf_rflags;
2136 	regs->r_rsp = tp->tf_rsp;
2137 	regs->r_ss = tp->tf_ss;
2138 	if (tp->tf_flags & TF_HASSEGS) {
2139 		regs->r_ds = tp->tf_ds;
2140 		regs->r_es = tp->tf_es;
2141 		regs->r_fs = tp->tf_fs;
2142 		regs->r_gs = tp->tf_gs;
2143 	} else {
2144 		regs->r_ds = 0;
2145 		regs->r_es = 0;
2146 		regs->r_fs = 0;
2147 		regs->r_gs = 0;
2148 	}
2149 	regs->r_err = 0;
2150 	regs->r_trapno = 0;
2151 	return (0);
2152 }
2153 
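/*
 * Install a register set supplied via ptrace(2).  The new rflags and %cs
 * values are validated first so that a traced process cannot be granted
 * privileged flag bits or a kernel code selector.
 */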
2154 int
2155 set_regs(struct thread *td, struct reg *regs)
2156 {
2157 	struct trapframe *tp;
2158 	register_t rflags;
2159 
2160 	tp = td->td_frame;
2161 	rflags = regs->r_rflags & 0xffffffff;
2162 	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
2163 		return (EINVAL);
2164 	tp->tf_r15 = regs->r_r15;
2165 	tp->tf_r14 = regs->r_r14;
2166 	tp->tf_r13 = regs->r_r13;
2167 	tp->tf_r12 = regs->r_r12;
2168 	tp->tf_r11 = regs->r_r11;
2169 	tp->tf_r10 = regs->r_r10;
2170 	tp->tf_r9  = regs->r_r9;
2171 	tp->tf_r8  = regs->r_r8;
2172 	tp->tf_rdi = regs->r_rdi;
2173 	tp->tf_rsi = regs->r_rsi;
2174 	tp->tf_rbp = regs->r_rbp;
2175 	tp->tf_rbx = regs->r_rbx;
2176 	tp->tf_rdx = regs->r_rdx;
2177 	tp->tf_rcx = regs->r_rcx;
2178 	tp->tf_rax = regs->r_rax;
2179 	tp->tf_rip = regs->r_rip;
2180 	tp->tf_cs = regs->r_cs;
2181 	tp->tf_rflags = rflags;
2182 	tp->tf_rsp = regs->r_rsp;
2183 	tp->tf_ss = regs->r_ss;
2184 	if (0) {	/* XXXKIB */
2185 		tp->tf_ds = regs->r_ds;
2186 		tp->tf_es = regs->r_es;
2187 		tp->tf_fs = regs->r_fs;
2188 		tp->tf_gs = regs->r_gs;
2189 		tp->tf_flags = TF_HASSEGS;
2190 	}
2191 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
2192 	return (0);
2193 }
2194 
2195 /* XXX check all this stuff! */
2196 /* externalize from sv_xmm */
2197 static void
2198 fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
2199 {
2200 	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2201 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
2202 	int i;
2203 
2204 	/* pcb -> fpregs */
2205 	bzero(fpregs, sizeof(*fpregs));
2206 
2207 	/* FPU control/status */
2208 	penv_fpreg->en_cw = penv_xmm->en_cw;
2209 	penv_fpreg->en_sw = penv_xmm->en_sw;
2210 	penv_fpreg->en_tw = penv_xmm->en_tw;
2211 	penv_fpreg->en_opcode = penv_xmm->en_opcode;
2212 	penv_fpreg->en_rip = penv_xmm->en_rip;
2213 	penv_fpreg->en_rdp = penv_xmm->en_rdp;
2214 	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
2215 	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
2216 
2217 	/* FPU registers */
2218 	for (i = 0; i < 8; ++i)
2219 		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
2220 
2221 	/* SSE registers */
2222 	for (i = 0; i < 16; ++i)
2223 		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
2224 }
2225 
2226 /* internalize from fpregs into sv_xmm */
2227 static void
2228 set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
2229 {
2230 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
2231 	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2232 	int i;
2233 
2234 	/* fpregs -> pcb */
2235 	/* FPU control/status */
2236 	penv_xmm->en_cw = penv_fpreg->en_cw;
2237 	penv_xmm->en_sw = penv_fpreg->en_sw;
2238 	penv_xmm->en_tw = penv_fpreg->en_tw;
2239 	penv_xmm->en_opcode = penv_fpreg->en_opcode;
2240 	penv_xmm->en_rip = penv_fpreg->en_rip;
2241 	penv_xmm->en_rdp = penv_fpreg->en_rdp;
2242 	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
2243 	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
2244 
2245 	/* FPU registers */
2246 	for (i = 0; i < 8; ++i)
2247 		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
2248 
2249 	/* SSE registers */
2250 	for (i = 0; i < 16; ++i)
2251 		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
2252 }
2253 
2254 /* externalize from td->pcb */
2255 int
2256 fill_fpregs(struct thread *td, struct fpreg *fpregs)
2257 {
2258 
2259 	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
2260 	    P_SHOULDSTOP(td->td_proc),
2261 	    ("not suspended thread %p", td));
2262 	fpugetregs(td);
2263 	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
2264 	return (0);
2265 }
2266 
2267 /* internalize to td->pcb */
2268 int
2269 set_fpregs(struct thread *td, struct fpreg *fpregs)
2270 {
2271 
2272 	critical_enter();
2273 	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
2274 	fpuuserinited(td);
2275 	critical_exit();
2276 	return (0);
2277 }
2278 
2279 /*
2280  * Get machine context.
2281  */
2282 int
2283 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
2284 {
2285 	struct pcb *pcb;
2286 	struct trapframe *tp;
2287 
2288 	pcb = td->td_pcb;
2289 	tp = td->td_frame;
2290 	PROC_LOCK(curthread->td_proc);
2291 	mcp->mc_onstack = sigonstack(tp->tf_rsp);
2292 	PROC_UNLOCK(curthread->td_proc);
2293 	mcp->mc_r15 = tp->tf_r15;
2294 	mcp->mc_r14 = tp->tf_r14;
2295 	mcp->mc_r13 = tp->tf_r13;
2296 	mcp->mc_r12 = tp->tf_r12;
2297 	mcp->mc_r11 = tp->tf_r11;
2298 	mcp->mc_r10 = tp->tf_r10;
2299 	mcp->mc_r9  = tp->tf_r9;
2300 	mcp->mc_r8  = tp->tf_r8;
2301 	mcp->mc_rdi = tp->tf_rdi;
2302 	mcp->mc_rsi = tp->tf_rsi;
2303 	mcp->mc_rbp = tp->tf_rbp;
2304 	mcp->mc_rbx = tp->tf_rbx;
2305 	mcp->mc_rcx = tp->tf_rcx;
2306 	mcp->mc_rflags = tp->tf_rflags;
2307 	if (flags & GET_MC_CLEAR_RET) {
2308 		mcp->mc_rax = 0;
2309 		mcp->mc_rdx = 0;
2310 		mcp->mc_rflags &= ~PSL_C;
2311 	} else {
2312 		mcp->mc_rax = tp->tf_rax;
2313 		mcp->mc_rdx = tp->tf_rdx;
2314 	}
2315 	mcp->mc_rip = tp->tf_rip;
2316 	mcp->mc_cs = tp->tf_cs;
2317 	mcp->mc_rsp = tp->tf_rsp;
2318 	mcp->mc_ss = tp->tf_ss;
2319 	mcp->mc_ds = tp->tf_ds;
2320 	mcp->mc_es = tp->tf_es;
2321 	mcp->mc_fs = tp->tf_fs;
2322 	mcp->mc_gs = tp->tf_gs;
2323 	mcp->mc_flags = tp->tf_flags;
2324 	mcp->mc_len = sizeof(*mcp);
2325 	get_fpcontext(td, mcp, NULL, 0);
2326 	update_pcb_bases(pcb);
2327 	mcp->mc_fsbase = pcb->pcb_fsbase;
2328 	mcp->mc_gsbase = pcb->pcb_gsbase;
2329 	mcp->mc_xfpustate = 0;
2330 	mcp->mc_xfpustate_len = 0;
2331 	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
2332 	return (0);
2333 }
2334 
2335 /*
2336  * Set machine context.
2337  *
2338  * However, we don't set any but the user modifiable flags, and we won't
2339  * touch the cs selector.
2340  */
2341 int
2342 set_mcontext(struct thread *td, mcontext_t *mcp)
2343 {
2344 	struct pcb *pcb;
2345 	struct trapframe *tp;
2346 	char *xfpustate;
2347 	long rflags;
2348 	int ret;
2349 
2350 	pcb = td->td_pcb;
2351 	tp = td->td_frame;
2352 	if (mcp->mc_len != sizeof(*mcp) ||
2353 	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
2354 		return (EINVAL);
2355 	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
2356 	    (tp->tf_rflags & ~PSL_USERCHANGE);
2357 	if (mcp->mc_flags & _MC_HASFPXSTATE) {
2358 		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
2359 		    sizeof(struct savefpu))
2360 			return (EINVAL);
2361 		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
2362 		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
2363 		    mcp->mc_xfpustate_len);
2364 		if (ret != 0)
2365 			return (ret);
2366 	} else
2367 		xfpustate = NULL;
2368 	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
2369 	if (ret != 0)
2370 		return (ret);
2371 	tp->tf_r15 = mcp->mc_r15;
2372 	tp->tf_r14 = mcp->mc_r14;
2373 	tp->tf_r13 = mcp->mc_r13;
2374 	tp->tf_r12 = mcp->mc_r12;
2375 	tp->tf_r11 = mcp->mc_r11;
2376 	tp->tf_r10 = mcp->mc_r10;
2377 	tp->tf_r9  = mcp->mc_r9;
2378 	tp->tf_r8  = mcp->mc_r8;
2379 	tp->tf_rdi = mcp->mc_rdi;
2380 	tp->tf_rsi = mcp->mc_rsi;
2381 	tp->tf_rbp = mcp->mc_rbp;
2382 	tp->tf_rbx = mcp->mc_rbx;
2383 	tp->tf_rdx = mcp->mc_rdx;
2384 	tp->tf_rcx = mcp->mc_rcx;
2385 	tp->tf_rax = mcp->mc_rax;
2386 	tp->tf_rip = mcp->mc_rip;
2387 	tp->tf_rflags = rflags;
2388 	tp->tf_rsp = mcp->mc_rsp;
2389 	tp->tf_ss = mcp->mc_ss;
2390 	tp->tf_flags = mcp->mc_flags;
2391 	if (tp->tf_flags & TF_HASSEGS) {
2392 		tp->tf_ds = mcp->mc_ds;
2393 		tp->tf_es = mcp->mc_es;
2394 		tp->tf_fs = mcp->mc_fs;
2395 		tp->tf_gs = mcp->mc_gs;
2396 	}
2397 	set_pcb_flags(pcb, PCB_FULL_IRET);
2398 	if (mcp->mc_flags & _MC_HASBASES) {
2399 		pcb->pcb_fsbase = mcp->mc_fsbase;
2400 		pcb->pcb_gsbase = mcp->mc_gsbase;
2401 	}
2402 	return (0);
2403 }
2404 
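/*
 * Copy the thread's FPU state into the mcontext: the legacy XMM save area
 * goes into mc_fpstate, and, when XSAVE is in use and the caller provided a
 * buffer, the extended state that follows the legacy area is copied out as
 * well and _MC_HASFPXSTATE is set.
 */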
2405 static void
2406 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
2407     size_t xfpusave_len)
2408 {
2409 	size_t max_len, len;
2410 
2411 	mcp->mc_ownedfp = fpugetregs(td);
2412 	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
2413 	    sizeof(mcp->mc_fpstate));
2414 	mcp->mc_fpformat = fpuformat();
2415 	if (!use_xsave || xfpusave_len == 0)
2416 		return;
2417 	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
2418 	len = xfpusave_len;
2419 	if (len > max_len) {
2420 		len = max_len;
2421 		bzero(xfpusave + max_len, xfpusave_len - max_len);
2422 	}
2423 	mcp->mc_flags |= _MC_HASFPXSTATE;
2424 	mcp->mc_xfpustate_len = len;
2425 	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
2426 }
2427 
2428 static int
2429 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
2430     size_t xfpustate_len)
2431 {
2432 	int error;
2433 
2434 	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
2435 		return (0);
2436 	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
2437 		return (EINVAL);
2438 	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
2439 		/* We don't care what state is left in the FPU or PCB. */
2440 		fpstate_drop(td);
2441 		error = 0;
2442 	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
2443 	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
2444 		error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
2445 		    xfpustate, xfpustate_len);
2446 	} else
2447 		return (EINVAL);
2448 	return (error);
2449 }
2450 
2451 void
2452 fpstate_drop(struct thread *td)
2453 {
2454 
2455 	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
2456 	critical_enter();
2457 	if (PCPU_GET(fpcurthread) == td)
2458 		fpudrop();
2459 	/*
2460 	 * XXX force a full drop of the fpu.  The above only drops it if we
2461 	 * owned it.
2462 	 *
2463 	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
2464 	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
2465 	 * We only need to drop to !PCB_INITDONE in sendsig().  But
2466 	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
2467 	 * have too many layers.
2468 	 */
2469 	clear_pcb_flags(curthread->td_pcb,
2470 	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
2471 	critical_exit();
2472 }
2473 
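/*
 * Read the debug registers, either directly from the CPU when td is NULL or
 * from the software copy kept in the thread's PCB.
 */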
2474 int
2475 fill_dbregs(struct thread *td, struct dbreg *dbregs)
2476 {
2477 	struct pcb *pcb;
2478 
2479 	if (td == NULL) {
2480 		dbregs->dr[0] = rdr0();
2481 		dbregs->dr[1] = rdr1();
2482 		dbregs->dr[2] = rdr2();
2483 		dbregs->dr[3] = rdr3();
2484 		dbregs->dr[6] = rdr6();
2485 		dbregs->dr[7] = rdr7();
2486 	} else {
2487 		pcb = td->td_pcb;
2488 		dbregs->dr[0] = pcb->pcb_dr0;
2489 		dbregs->dr[1] = pcb->pcb_dr1;
2490 		dbregs->dr[2] = pcb->pcb_dr2;
2491 		dbregs->dr[3] = pcb->pcb_dr3;
2492 		dbregs->dr[6] = pcb->pcb_dr6;
2493 		dbregs->dr[7] = pcb->pcb_dr7;
2494 	}
2495 	dbregs->dr[4] = 0;
2496 	dbregs->dr[5] = 0;
2497 	dbregs->dr[8] = 0;
2498 	dbregs->dr[9] = 0;
2499 	dbregs->dr[10] = 0;
2500 	dbregs->dr[11] = 0;
2501 	dbregs->dr[12] = 0;
2502 	dbregs->dr[13] = 0;
2503 	dbregs->dr[14] = 0;
2504 	dbregs->dr[15] = 0;
2505 	return (0);
2506 }
2507 
2508 int
2509 set_dbregs(struct thread *td, struct dbreg *dbregs)
2510 {
2511 	struct pcb *pcb;
2512 	int i;
2513 
2514 	if (td == NULL) {
2515 		load_dr0(dbregs->dr[0]);
2516 		load_dr1(dbregs->dr[1]);
2517 		load_dr2(dbregs->dr[2]);
2518 		load_dr3(dbregs->dr[3]);
2519 		load_dr6(dbregs->dr[6]);
2520 		load_dr7(dbregs->dr[7]);
2521 	} else {
2522 		/*
2523 		 * Don't let an illegal value for dr7 get set.  Specifically,
2524 		 * check for undefined settings.  Setting these bit patterns
2525 		 * results in undefined behaviour and can lead to an unexpected
2526 		 * TRCTRAP or a general protection fault right here.
2527 		 * The upper bits of dr6 and dr7 must not be set.
2528 		 */
2529 		for (i = 0; i < 4; i++) {
2530 			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
2531 				return (EINVAL);
2532 			if (td->td_frame->tf_cs == _ucode32sel &&
2533 			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
2534 				return (EINVAL);
2535 		}
2536 		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
2537 		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
2538 			return (EINVAL);
2539 
2540 		pcb = td->td_pcb;
2541 
2542 		/*
2543 		 * Don't let a process set a breakpoint that is not within the
2544 		 * process's address space.  If a process could do this, it
2545 		 * could halt the system by setting a breakpoint in the kernel
2546 		 * (if ddb was enabled).  Thus, we need to check to make sure
2547 		 * that no breakpoints are being enabled for addresses outside
2548 		 * the process's address space.
2549 		 *
2550 		 * XXX - what about when the watched area of the user's
2551 		 * address space is written into from within the kernel
2552 		 * ... wouldn't that still cause a breakpoint to be generated
2553 		 * from within kernel mode?
2554 		 */
2555 
2556 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
2557 			/* dr0 is enabled */
2558 			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
2559 				return (EINVAL);
2560 		}
2561 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
2562 			/* dr1 is enabled */
2563 			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
2564 				return (EINVAL);
2565 		}
2566 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
2567 			/* dr2 is enabled */
2568 			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
2569 				return (EINVAL);
2570 		}
2571 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
2572 			/* dr3 is enabled */
2573 			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
2574 				return (EINVAL);
2575 		}
2576 
2577 		pcb->pcb_dr0 = dbregs->dr[0];
2578 		pcb->pcb_dr1 = dbregs->dr[1];
2579 		pcb->pcb_dr2 = dbregs->dr[2];
2580 		pcb->pcb_dr3 = dbregs->dr[3];
2581 		pcb->pcb_dr6 = dbregs->dr[6];
2582 		pcb->pcb_dr7 = dbregs->dr[7];
2583 
2584 		set_pcb_flags(pcb, PCB_DBREGS);
2585 	}
2586 
2587 	return (0);
2588 }
2589 
2590 void
2591 reset_dbregs(void)
2592 {
2593 
2594 	load_dr7(0);	/* Turn off the control bits first */
2595 	load_dr0(0);
2596 	load_dr1(0);
2597 	load_dr2(0);
2598 	load_dr3(0);
2599 	load_dr6(0);
2600 }
2601 
2602 /*
2603  * Return > 0 if a hardware breakpoint has been hit, and the
2604  * breakpoint was in user space.  Return 0, otherwise.
2605  */
2606 int
2607 user_dbreg_trap(register_t dr6)
2608 {
2609         u_int64_t dr7;
2610         u_int64_t bp;       /* breakpoint bits extracted from dr6 */
2611         int nbp;            /* number of breakpoints that triggered */
2612         caddr_t addr[4];    /* breakpoint addresses */
2613         int i;
2614 
2615         bp = dr6 & DBREG_DR6_BMASK;
2616         if (bp == 0) {
2617                 /*
2618                  * None of the breakpoint bits are set, meaning this
2619                  * trap was not caused by any of the debug registers.
2620                  */
2621                 return 0;
2622         }
2623 
2624         dr7 = rdr7();
2625         if ((dr7 & 0x000000ff) == 0) {
2626                 /*
2627                  * All of the local and global breakpoint enable bits in
2628                  * dr7 are zero, thus the trap couldn't have been caused
2629                  * by the hardware debug registers.
2630                  */
2631                 return 0;
2632         }
2633 
2634         nbp = 0;
2635 
2636         /*
2637          * At least one of the breakpoints was hit; check to see
2638          * which ones, and whether any of them are user space addresses.
2639          */
2640 
2641         if (bp & 0x01) {
2642                 addr[nbp++] = (caddr_t)rdr0();
2643         }
2644         if (bp & 0x02) {
2645                 addr[nbp++] = (caddr_t)rdr1();
2646         }
2647         if (bp & 0x04) {
2648                 addr[nbp++] = (caddr_t)rdr2();
2649         }
2650         if (bp & 0x08) {
2651                 addr[nbp++] = (caddr_t)rdr3();
2652         }
2653 
2654         for (i = 0; i < nbp; i++) {
2655                 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
2656                         /*
2657                          * addr[i] is in user space
2658                          */
2659                         return nbp;
2660                 }
2661         }
2662 
2663         /*
2664          * None of the breakpoints are in user space.
2665          */
2666         return 0;
2667 }
2668 
2669 /*
2670  * The pcb_flags is only modified by current thread, or by other threads
2671  * when current thread is stopped.  However, current thread may change it
2672  * from the interrupt context in cpu_switch(), or in the trap handler.
2673  * When we read-modify-write pcb_flags from C sources, compiler may generate
2674  * code that is not atomic regarding the interrupt handler.  If a trap or
2675  * interrupt happens and any flag is modified from the handler, it can be
2676  * clobbered with the cached value later.  Therefore, we implement setting
2677  * and clearing flags with single-instruction functions, which do not race
2678  * with possible modification of the flags from the trap or interrupt context,
2679  * because traps and interrupts are executed only on instruction boundary.
2680  */
2681 void
2682 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
2683 {
2684 
2685 	__asm __volatile("orl %1,%0"
2686 	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
2687 	    : "cc", "memory");
2688 
2689 }
2690 
2691 /*
2692  * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
2693  * base requires that the kernel save MSR_FSBASE and MSR_{K,}GSBASE into
2694  * the pcb if user space modified the bases.  We must save them on the
2695  * context switch or when the return to usermode happens through doreti.
2696  *
2697  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
2698  * which has the consequence that the base MSRs must be saved each time
2699  * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
2700  * context switches.
2701  */
2702 static void
2703 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
2704 {
2705 	register_t r;
2706 
2707 	if (curpcb == pcb &&
2708 	    (flags & PCB_FULL_IRET) != 0 &&
2709 	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
2710 		r = intr_disable();
2711 		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
2712 			if (rfs() == _ufssel)
2713 				pcb->pcb_fsbase = rdfsbase();
2714 			if (rgs() == _ugssel)
2715 				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
2716 		}
2717 		set_pcb_flags_raw(pcb, flags);
2718 		intr_restore(r);
2719 	} else {
2720 		set_pcb_flags_raw(pcb, flags);
2721 	}
2722 }
2723 
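/*
 * set_pcb_flags() is resolved through an ifunc: CPUs with the FSGSBASE
 * instructions get the variant that snapshots the user %fs/%gs bases,
 * everything else gets the plain read-modify-write version.
 */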
2724 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
2725 {
2726 
2727 	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
2728 	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
2729 }
2730 
2731 void
2732 clear_pcb_flags(struct pcb *pcb, const u_int flags)
2733 {
2734 
2735 	__asm __volatile("andl %1,%0"
2736 	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
2737 	    : "cc", "memory");
2738 }
2739 
2740 #ifdef KDB
2741 
2742 /*
2743  * Provide inb() and outb() as functions.  They are normally only available as
2744  * inline functions, thus cannot be called from the debugger.
2745  */
2746 
2747 /* silence compiler warnings */
2748 u_char inb_(u_short);
2749 void outb_(u_short, u_char);
2750 
2751 u_char
2752 inb_(u_short port)
2753 {
2754 	return inb(port);
2755 }
2756 
2757 void
2758 outb_(u_short port, u_char data)
2759 {
2760 	outb(port, data);
2761 }
2762 
2763 #endif /* KDB */
2764 
2765 #undef memset
2766 #undef memmove
2767 #undef memcpy
2768 
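/*
 * memset, memmove, memcpy and pagezero are also resolved through ifuncs:
 * CPUs advertising Enhanced REP MOVSB/STOSB (CPUID_STDEXT_ERMS) get the
 * *_erms variants, everything else the *_std variants; both sets are
 * implemented in assembly.
 */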
2769 void	*memset_std(void *buf, int c, size_t len);
2770 void	*memset_erms(void *buf, int c, size_t len);
2771 void    *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
2772 	    size_t len);
2773 void    *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
2774 	    size_t len);
2775 void    *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
2776 	    size_t len);
2777 void    *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
2778 	    size_t len);
2779 
2780 #ifdef KCSAN
2781 /*
2782  * These fail to build as ifuncs when used with KCSAN.
2783  */
2784 void *
2785 memset(void *buf, int c, size_t len)
2786 {
2787 
2788 	return (memset_std(buf, c, len));
2789 }
2790 
2791 void *
2792 memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
2793 {
2794 
2795 	return (memmove_std(dst, src, len));
2796 }
2797 
2798 void *
2799 memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
2800 {
2801 
2802 	return (memcpy_std(dst, src, len));
2803 }
2804 #else
2805 DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
2806 {
2807 
2808 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2809 	    memset_erms : memset_std);
2810 }
2811 
2812 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
2813     size_t))
2814 {
2815 
2816 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2817 	    memmove_erms : memmove_std);
2818 }
2819 
2820 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull, size_t))
2821 {
2822 
2823 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2824 	    memcpy_erms : memcpy_std);
2825 }
2826 #endif
2827 
2828 void	pagezero_std(void *addr);
2829 void	pagezero_erms(void *addr);
2830 DEFINE_IFUNC(, void , pagezero, (void *))
2831 {
2832 
2833 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2834 	    pagezero_erms : pagezero_std);
2835 }
2836