/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_atpic.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_pci.h"
#include "opt_platform.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/csan.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>
#include <vm/vm_phys.h>

#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif

#include <net/netisr.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/frame.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#include <machine/trap.h>
#include <machine/tss.h>
#include <x86/ucode.h>
#include <x86/ifunc.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef FDT
#include <x86/fdt.h>
#endif

#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <x86/apicvar.h>
#endif

#include <isa/isareg.h>
#include <isa/rtc.h>
#include <x86/init.h>

/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);

/*
 * The PTI trampoline stack needs enough space for a hardware trapframe and a
 * couple of scratch registers, as well as the trapframe left behind after an
 * iret fault.
 */
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
    offsetof(struct pti_frame, pti_rip));

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

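/*
 * CS_SECURE() checks that a user-supplied %cs selects ring 3, and
 * EFL_SECURE() that only the user-changeable %rflags bits differ from the
 * current values; both guard sigreturn-style paths against a forged
 * context escalating privilege.
 */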
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)

static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpusave, size_t xfpusave_len);
static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpustate, size_t xfpustate_len);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/* Default init_ops implementation. */
struct init_ops init_ops = {
	.parse_preload_data =	native_parse_preload_data,
	.early_clock_source_init =	i8254_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
#ifdef SMP
	.mp_bootaddress =		mp_bootaddress,
	.start_all_aps =		native_start_all_aps,
#endif
#ifdef DEV_PCI
	.msi_init =			msi_init,
#endif
};
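
/*
 * Platforms that boot through a different early environment (Xen PVH,
 * for example) are expected to substitute their own hooks for these
 * defaults before hammer_time() consumes them.
 */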

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;

/* Intel ICH registers */
#define ICH_PMBASE	0x400
#define ICH_SMI_EN	(ICH_PMBASE + 0x30)

int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

int cold = 1;

long Maxmem = 0;
long realmem = 0;

struct kva_md_info kmi;

static struct trapframe proc0_tf;
struct region_descriptor r_idt;

struct pcpu *__pcpu;
struct pcpu temp_bsp_pcpu;

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

void (*vmm_resume_p)(void);

static void
cpu_startup(void *dummy)
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to disallow the legacy USB circuit to
	 * generate an SMI# because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();

	/*
	 * Display physical memory if SMBIOS reports reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	if (memsize < ptoa((uintmax_t)vm_free_count()))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_free_count()),
	    ptoa((uintmax_t)vm_free_count()) / 1048576);
#ifdef DEV_PCI
	if (bootverbose && intel_graphics_stolen_base != 0)
		printf("intel stolen mem: base %#jx size %ju MB\n",
		    (uintmax_t)intel_graphics_stolen_base,
		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
#endif

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

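/*
 * A second ifunc resolution pass, run after CPU identification has
 * settled; presumably this catches relocations (e.g. in preloaded
 * modules) that the early link_elf_ireloc() pass could not handle.
 */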
static void
late_ifunc_resolve(void *dummy __unused)
{
	link_elf_late_ireloc();
}
SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);

/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by call
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

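	/*
	 * With XSAVE in use the extended FPU state does not fit into the
	 * fixed-size savefpu area, so reserve stack space for it here; it
	 * is copied out to the user stack separately below.
	 */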
	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	update_pcb_bases(pcb);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
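		/* Skip over the 128-byte red zone below %rsp (amd64 ABI). */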
		sp = (char *)regs->tf_rsp - 128;
	if (xfpusave != NULL) {
		sp -= xfpusave_len;
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}

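/*
 * For reference, the userland view of the frame built above: a handler
 * installed with SA_SIGINFO receives the three arguments that sendsig()
 * loads into %rdi, %rsi and %rdx.  A minimal (hypothetical) consumer:
 *
 *	void
 *	handler(int sig, siginfo_t *si, void *ucp)
 *	{
 *		// ucp points at the ucontext_t copied out in sf_uc
 *	}
 *
 *	struct sigaction sa;
 *	memset(&sa, 0, sizeof(sa));
 *	sa.sa_sigaction = handler;
 *	sa.sa_flags = SA_SIGINFO;
 *	sigaction(SIGUSR1, &sa, NULL);
 *
 * Returning from the handler runs the sigcode trampoline, which calls
 * sigreturn(2) (below) on the saved context.
 */
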
/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig. Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
int
sys_sigreturn(struct thread *td, struct sigreturn_args *uap)
{
	ucontext_t uc;
	struct pcb *pcb;
	struct proc *p;
	struct trapframe *regs;
	ucontext_t *ucp;
	char *xfpustate;
	size_t xfpustate_len;
	long rflags;
	int cs, error, ret;
	ksiginfo_t ksi;

	pcb = td->td_pcb;
	p = td->td_proc;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0) {
		uprintf("pid %d (%s): sigreturn copyin failed\n",
		    p->p_pid, td->td_name);
		return (error);
	}
	ucp = &uc;
	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
		    td->td_name, ucp->uc_mcontext.mc_flags);
		return (EINVAL);
	}
	regs = td->td_frame;
	rflags = ucp->uc_mcontext.mc_rflags;
	/*
	 * Don't allow users to change privileged or reserved flags.
	 */
	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
		    td->td_name, rflags);
		return (EINVAL);
	}

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
	cs = ucp->uc_mcontext.mc_cs;
	if (!CS_SECURE(cs)) {
		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
		    td->td_name, cs);
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_rip;
		trapsignal(td, &ksi);
		return (EINVAL);
	}

	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
		if (xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu)) {
			uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
			    p->p_pid, td->td_name, xfpustate_len);
			return (EINVAL);
		}
		xfpustate = __builtin_alloca(xfpustate_len);
		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
		    xfpustate, xfpustate_len);
		if (error != 0) {
			uprintf(
	"pid %d (%s): sigreturn copying xfpustate failed\n",
			    p->p_pid, td->td_name);
			return (error);
		}
	} else {
		xfpustate = NULL;
		xfpustate_len = 0;
	}
	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
	if (ret != 0) {
		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
		    p->p_pid, td->td_name, ret);
		return (ret);
	}
	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
	update_pcb_bases(pcb);
	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;

#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
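	/*
	 * EJUSTRETURN keeps the syscall return path from overwriting the
	 * register state we just restored from the user context.
	 */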
	return (EJUSTRETURN);
}

#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{

	return sys_sigreturn(td, (struct sigreturn_args *)uap);
}
#endif

/*
 * Reset registers to default values on exec.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack)
{
	struct trapframe *regs;
	struct pcb *pcb;
	register_t saved_rflags;

	regs = td->td_frame;
	pcb = td->td_pcb;

	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);

	update_pcb_bases(pcb);
	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	clear_pcb_flags(pcb, PCB_32BIT);
	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;

	saved_rflags = regs->tf_rflags & PSL_T;
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = imgp->entry_addr;
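	/*
	 * Align the initial stack so that %rsp is congruent to 8 modulo 16,
	 * mirroring the alignment the amd64 ABI prescribes at a function
	 * entry point (as if a return address had just been pushed).
	 */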
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | saved_rflags;
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == curpcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		clear_pcb_flags(pcb, PCB_DBREGS);
	}

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP.  See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

static char dblfault_stack[PAGE_SIZE] __aligned(16);
static char mce0_stack[PAGE_SIZE] __aligned(16);
static char nmi0_stack[PAGE_SIZE] __aligned(16);
static char dbg0_stack[PAGE_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Actually, the TSS is a system descriptor which is double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");

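/*
 * Install an interrupt gate for vector 'idx'.  'ist' selects an Interrupt
 * Stack Table slot (0 means no stack switch) and 'dpl' is the privilege
 * level required to raise the vector with int $n from software.
 */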
void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func) >> 16;
}

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif

void
sdtossd(struct user_segment_descriptor *sd,
    struct soft_segment_descriptor *ssd)
{
	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_long  = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

void
ssdtosd(struct soft_segment_descriptor *ssd,
    struct user_segment_descriptor *sd)
{
	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;
}

void
ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
{
	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
}

#if !defined(DEV_ATPIC) && defined(DEV_ISA)
#include <isa/isavar.h>
#include <isa/isareg.h>
/*
 * Return a bitmap of the current interrupt requests.  This is 8259-specific
 * and is only suitable for use at probe time.
 * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
 * It shouldn't be here.  There should probably be an APIC centric
 * implementation in the apic driver code, if at all.
 */
intrmask_t
isa_irq_pending(void)
{
	u_char irr1;
	u_char irr2;

	irr1 = inb(IO_ICU1);
	irr2 = inb(IO_ICU2);
	return ((irr2 << 8) | irr1);
}
#endif

u_int basemem;

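/*
 * Insert a physical address range into the sorted physmap table, merging
 * it with an adjacent entry where possible.  Returns 0 only when the
 * table is full; overlapping ranges are ignored (with a warning) but
 * still return 1.
 */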
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
                      vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}

static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");

static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
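	/* Any non-zero hw.memtest.tests loader tunable value enables it. */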
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * Make hole for "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and APs for the system start
	 * in real mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress)
		init_ops.mp_bootaddress(physmap, &physmap_idx);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write, non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that the recorded "end" is exclusive: it
			 * points one page past the last good page, making
			 * the range >= start and < end.
			 * If we're also doing a speculative memory test and
			 * we are at or past the end, bump up Maxmem so that
			 * we keep going.  The first bad page will terminate
			 * the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ENTRIES) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == PHYS_AVAIL_ENTRIES) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}

static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end, 0);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/* Set up the fast syscall stuff */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
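	/*
	 * MSR_STAR holds the kernel selector base in bits 32..47 (used by
	 * SYSCALL) and the user selector base in bits 48..63 (used by
	 * SYSRET, which loads 64-bit %cs from base + 16 and %ss from
	 * base + 8 -- hence GUCODE32_SEL, with GUDATA_SEL and GUCODE_SEL
	 * placed immediately after it in the GDT).
	 */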
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}

void
amd64_bsp_pcpu_init1(struct pcpu *pc)
{
	struct user_segment_descriptor *gdt;

	PCPU_SET(prvspace, pc);
	gdt = *PCPU_PTR(gdt);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(tssp, PCPU_PTR(common_tss));
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
	PCPU_SET(smp_tlb_gen, 1);
}

void
amd64_bsp_pcpu_init2(uint64_t rsp0)
{

	PCPU_SET(rsp0, rsp0);
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);
}

void
amd64_bsp_ist_init(struct pcpu *pc)
{
	struct nmi_pcpu *np;
	struct amd64tss *tssp;

	tssp = &pc->pc_common_tss;

	/* doublefault stack space, runs on ist1 */
	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist1 = (long)np;

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist2 = (long)np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist3 = (long)np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist4 = (long)np;
}

u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	struct xstate_hdr *xhdr;
	u_int64_t rsp0;
	char *env;
	struct user_segment_descriptor *gdt;
	struct region_descriptor r_gdt;
	size_t kstack0_sz;
	int late_console;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	kmdp = init_ops.parse_preload_data(modulep);

	physfree += ucode_load_bsp(physfree + KERNBASE);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	identify_cpu_fixup_bsp();
	identify_cpu2();
	initializecpucache();

	/*
	 * Check for pti, pcid, and invpcid before ifuncs are
	 * resolved, to correctly select the implementation for
	 * pmap_activate_sw_mode().
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		invpcid_works = (cpu_stdext_feature &
		    CPUID_STDEXT_INVPCID) != 0;
	} else {
		pmap_pcid_enabled = 0;
	}

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	thread0.td_kstack = physfree + KERNBASE;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * Initialize enough of thread0 for delayed invalidation to
	 * work very early.  Rely on thread0.td_base_pri
	 * zero-initialization, it is reset to PVM at proc0_init().
	 */
	pmap_thread_init_invl_gen(&thread0);

	pc = &temp_bsp_pcpu;
	pcpu_init(pc, 0, sizeof(struct pcpu));
	gdt = &temp_bsp_pcpu.pc_gdt[0];

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long)gdt;
	lgdt(&r_gdt);

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	dpcpu_init((void *)(physfree + KERNBASE), 0);
	physfree += DPCPU_SIZE;
	amd64_bsp_pcpu_init1(pc);
	/* Non-late cninit() and printf() can be moved up to here. */

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 * 	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
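	/*
	 * Vectors that can fire while the kernel stack is unreliable
	 * (#DB, NMI, #DF, #MC) run on the dedicated IST stacks set up in
	 * amd64_bsp_ist_init(); everything else keeps ist 0.
	 */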
1724 	for (x = 0; x < NIDT; x++)
1725 		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1726 		    SEL_KPL, 0);
1727 	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1728 	    SEL_KPL, 0);
1729 	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1730 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
1731 	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1732 	    SEL_UPL, 0);
1733 	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1734 	    SEL_UPL, 0);
1735 	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1736 	    SEL_KPL, 0);
1737 	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1738 	    SEL_KPL, 0);
1739 	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1740 	    SEL_KPL, 0);
1741 	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1742 	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1743 	    SDT_SYSIGT, SEL_KPL, 0);
1744 	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1745 	    SEL_KPL, 0);
1746 	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1747 	    SDT_SYSIGT, SEL_KPL, 0);
1748 	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1749 	    SEL_KPL, 0);
1750 	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1751 	    SEL_KPL, 0);
1752 	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1753 	    SEL_KPL, 0);
1754 	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1755 	    SEL_KPL, 0);
1756 	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1757 	    SEL_KPL, 0);
1758 	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1759 	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1760 	    SEL_KPL, 0);
1761 #ifdef KDTRACE_HOOKS
1762 	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1763 	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1764 #endif
1765 #ifdef XENHVM
1766 	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1767 	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1768 #endif
1769 	r_idt.rd_limit = sizeof(idt0) - 1;
1770 	r_idt.rd_base = (long) idt;
1771 	lidt(&r_idt);
1772 
1773 	/*
1774 	 * Initialize the clock before the console so that console
1775 	 * initialization can use DELAY().
1776 	 */
1777 	clock_init();
1778 
1779 	/*
1780 	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
1781 	 * transition).
1782 	 * Once bootblocks have updated, we can test directly for
1783 	 * efi_systbl != NULL here...
1784 	 */
1785 	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
1786 	    != NULL)
1787 		vty_set_preferred(VTY_VT);
1788 
1789 	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1790 	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);
1791 
1792 	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1793 	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);
1794 
1795 	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
1796 	    &syscall_ret_l1d_flush_mode);
1797 
1798 	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1799 	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);
1800 
1801 	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
1802 
1803 	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
1804 	    &x86_rngds_mitg_enable);
1805 
1806 	finishidentcpu();	/* Final stage of CPU initialization */
1807 	initializecpu();	/* Initialize CPU registers */
1808 
1809 	amd64_bsp_ist_init(pc);
1810 
1811 	/* Set the IO permission bitmap (empty due to tss seg limit) */
1812 	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
1813 	    IOPERM_BITMAP_SIZE;
1814 
1815 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1816 	ltr(gsel_tss);
1817 
1818 	amd64_conf_fast_syscall();
1819 
1820 	/*
1821 	 * We initialize the PCB pointer early so that exception
1822 	 * handlers will work.  Also set up td_critnest to short-cut
1823 	 * the page fault handler.
1824 	 */
1825 	cpu_max_ext_state_size = sizeof(struct savefpu);
1826 	set_top_of_stack_td(&thread0);
1827 	thread0.td_pcb = get_pcb_td(&thread0);
1828 	thread0.td_critnest = 1;
1829 
1830 	/*
1831 	 * The console and kdb should be initialized even earlier than here,
1832 	 * but some console drivers don't work until after getmemsize().
1833 	 * Default to late console initialization to support these drivers.
1834 	 * This mainly loses printf()s in getmemsize() and early debugging.
1835 	 */
1836 	late_console = 1;
1837 	TUNABLE_INT_FETCH("debug.late_console", &late_console);
1838 	if (!late_console) {
1839 		cninit();
1840 		amd64_kdb_init();
1841 	}
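	/*
	 * To get the early-console behaviour back for debugging, one can
	 * set the tunable from loader.conf(5), e.g.:
	 *
	 *	debug.late_console="0"
	 *
	 * with the caveat noted above that only console drivers which work
	 * before getmemsize() are usable then.
	 */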
1842 
1843 	getmemsize(kmdp, physfree);
1844 	init_param2(physmem);
1845 
1846 	/* Now running on new page tables, configured, and u/iom is accessible. */
1847 
1848 #ifdef DEV_PCI
1849 	/* This call might adjust phys_avail[]. */
1850 	pci_early_quirks();
1851 #endif
1852 
1853 	if (late_console)
1854 		cninit();
1855 
1856 #ifdef DEV_ISA
1857 #ifdef DEV_ATPIC
1858 	elcr_probe();
1859 	atpic_startup();
1860 #else
1861 	/* Reset and mask the atpics and leave them shut down. */
1862 	atpic_reset();
1863 
1864 	/*
1865 	 * Point the ICU spurious interrupt vectors at the APIC spurious
1866 	 * interrupt handler.
1867 	 */
1868 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1869 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1870 #endif
1871 #else
1872 #error "have you forgotten the isa device?"
1873 #endif
1874 
1875 	if (late_console)
1876 		amd64_kdb_init();
1877 
1878 	msgbufinit(msgbufp, msgbufsize);
1879 	fpuinit();
1880 
1881 	/*
1882 	 * Reinitialize thread0's stack base now that the xsave area size is
1883 	 * known.  Set up thread0's pcb save area after fpuinit calculated fpu
1884 	 * save area size.  Zero out the extended state header in fpu save area.
1885 	 */
1886 	set_top_of_stack_td(&thread0);
1887 	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
1888 	bzero(thread0.td_pcb->pcb_save, cpu_max_ext_state_size);
1889 	if (use_xsave) {
1890 		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
1891 		    1);
1892 		xhdr->xstate_bv = xsave_mask;
1893 	}
1894 	/* Make an initial TSS so the CPU can get an interrupt stack on syscall. */
1895 	rsp0 = thread0.td_md.md_stack_base;
1896 	/* Ensure the stack is aligned to 16 bytes */
1897 	rsp0 &= ~0xFul;
1898 	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
1899 	amd64_bsp_pcpu_init2(rsp0);
1900 
1901 	/* transfer to user mode */
1902 
1903 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1904 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1905 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1906 	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1907 	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1908 
1909 	load_ds(_udatasel);
1910 	load_es(_udatasel);
1911 	load_fs(_ufssel);
1912 
1913 	/* setup proc 0's pcb */
1914 	thread0.td_pcb->pcb_flags = 0;
1915 	thread0.td_frame = &proc0_tf;
1916 
1917 	env = kern_getenv("kernelname");
1918 	if (env != NULL)
1919 		strlcpy(kernelname, env, sizeof(kernelname));
1920 
1921 	cpu_probe_amdc1e();
1922 
1923 	kcsan_cpu_init(0);
1924 
1925 #ifdef FDT
1926 	x86_init_fdt();
1927 #endif
1928 	thread0.td_critnest = 0;
1929 
1930 	TSEXIT();
1931 
1932 	/* Location of kernel stack for locore */
1933 	return (thread0.td_md.md_stack_base);
1934 }
1935 
1936 void
1937 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1938 {
1939 
1940 	pcpu->pc_acpi_id = 0xffffffff;
1941 }
1942 
1943 static int
1944 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1945 {
1946 	struct bios_smap *smapbase;
1947 	struct bios_smap_xattr smap;
1948 	caddr_t kmdp;
1949 	uint32_t *smapattr;
1950 	int count, error, i;
1951 
1952 	/* Retrieve the system memory map from the loader. */
1953 	kmdp = preload_search_by_type("elf kernel");
1954 	if (kmdp == NULL)
1955 		kmdp = preload_search_by_type("elf64 kernel");
1956 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
1957 	    MODINFO_METADATA | MODINFOMD_SMAP);
1958 	if (smapbase == NULL)
1959 		return (0);
1960 	smapattr = (uint32_t *)preload_search_info(kmdp,
1961 	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1962 	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1963 	error = 0;
1964 	for (i = 0; i < count; i++) {
1965 		smap.base = smapbase[i].base;
1966 		smap.length = smapbase[i].length;
1967 		smap.type = smapbase[i].type;
1968 		if (smapattr != NULL)
1969 			smap.xattr = smapattr[i];
1970 		else
1971 			smap.xattr = 0;
1972 		error = SYSCTL_OUT(req, &smap, sizeof(smap));
1973 	}
1974 	return (error);
1975 }
1976 SYSCTL_PROC(_machdep, OID_AUTO, smap,
1977     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1978     smap_sysctl_handler, "S,bios_smap_xattr",
1979     "Raw BIOS SMAP data");
1980 
1981 static int
1982 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1983 {
1984 	struct efi_map_header *efihdr;
1985 	caddr_t kmdp;
1986 	uint32_t efisize;
1987 
1988 	kmdp = preload_search_by_type("elf kernel");
1989 	if (kmdp == NULL)
1990 		kmdp = preload_search_by_type("elf64 kernel");
1991 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1992 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1993 	if (efihdr == NULL)
1994 		return (0);
1995 	efisize = *((uint32_t *)efihdr - 1);
1996 	return (SYSCTL_OUT(req, efihdr, efisize));
1997 }
1998 SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
1999     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
2000     efi_map_sysctl_handler, "S,efi_map_header",
2001     "Raw EFI Memory Map");
2002 
2003 void
2004 spinlock_enter(void)
2005 {
2006 	struct thread *td;
2007 	register_t flags;
2008 
2009 	td = curthread;
2010 	if (td->td_md.md_spinlock_count == 0) {
2011 		flags = intr_disable();
2012 		td->td_md.md_spinlock_count = 1;
2013 		td->td_md.md_saved_flags = flags;
2014 		critical_enter();
2015 	} else
2016 		td->td_md.md_spinlock_count++;
2017 }
2018 
2019 void
2020 spinlock_exit(void)
2021 {
2022 	struct thread *td;
2023 	register_t flags;
2024 
2025 	td = curthread;
2026 	flags = td->td_md.md_saved_flags;
2027 	td->td_md.md_spinlock_count--;
2028 	if (td->td_md.md_spinlock_count == 0) {
2029 		critical_exit();
2030 		intr_restore(flags);
2031 	}
2032 }
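/*
 * Note the recursion counting in the pair above: only the outermost
 * spinlock_enter() disables interrupts and enters a critical section,
 * and only the matching outermost spinlock_exit() restores the saved
 * interrupt state.  Nested pairs merely adjust md_spinlock_count:
 *
 *	spinlock_enter();	<- interrupts off, critical_enter()
 *	spinlock_enter();	<- md_spinlock_count++ only
 *	spinlock_exit();	<- md_spinlock_count-- only
 *	spinlock_exit();	<- critical_exit(), interrupts restored
 */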
2033 
2034 /*
2035  * Construct a PCB from a trapframe. This is called from kdb_trap() where
2036  * we want to start a backtrace from the function that caused us to enter
2037  * the debugger. We have the context in the trapframe, but base the trace
2038  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
2039  * enough for a backtrace.
2040  */
2041 void
2042 makectx(struct trapframe *tf, struct pcb *pcb)
2043 {
2044 
2045 	pcb->pcb_r12 = tf->tf_r12;
2046 	pcb->pcb_r13 = tf->tf_r13;
2047 	pcb->pcb_r14 = tf->tf_r14;
2048 	pcb->pcb_r15 = tf->tf_r15;
2049 	pcb->pcb_rbp = tf->tf_rbp;
2050 	pcb->pcb_rbx = tf->tf_rbx;
2051 	pcb->pcb_rip = tf->tf_rip;
2052 	pcb->pcb_rsp = tf->tf_rsp;
2053 }
2054 
2055 int
2056 ptrace_set_pc(struct thread *td, unsigned long addr)
2057 {
2058 
2059 	td->td_frame->tf_rip = addr;
2060 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
2061 	return (0);
2062 }
2063 
2064 int
2065 ptrace_single_step(struct thread *td)
2066 {
2067 
2068 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2069 	if ((td->td_frame->tf_rflags & PSL_T) == 0) {
2070 		td->td_frame->tf_rflags |= PSL_T;
2071 		td->td_dbgflags |= TDB_STEP;
2072 	}
2073 	return (0);
2074 }
2075 
2076 int
2077 ptrace_clear_single_step(struct thread *td)
2078 {
2079 
2080 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2081 	td->td_frame->tf_rflags &= ~PSL_T;
2082 	td->td_dbgflags &= ~TDB_STEP;
2083 	return (0);
2084 }
2085 
2086 int
2087 fill_regs(struct thread *td, struct reg *regs)
2088 {
2089 	struct trapframe *tp;
2090 
2091 	tp = td->td_frame;
2092 	return (fill_frame_regs(tp, regs));
2093 }
2094 
2095 int
2096 fill_frame_regs(struct trapframe *tp, struct reg *regs)
2097 {
2098 
2099 	regs->r_r15 = tp->tf_r15;
2100 	regs->r_r14 = tp->tf_r14;
2101 	regs->r_r13 = tp->tf_r13;
2102 	regs->r_r12 = tp->tf_r12;
2103 	regs->r_r11 = tp->tf_r11;
2104 	regs->r_r10 = tp->tf_r10;
2105 	regs->r_r9  = tp->tf_r9;
2106 	regs->r_r8  = tp->tf_r8;
2107 	regs->r_rdi = tp->tf_rdi;
2108 	regs->r_rsi = tp->tf_rsi;
2109 	regs->r_rbp = tp->tf_rbp;
2110 	regs->r_rbx = tp->tf_rbx;
2111 	regs->r_rdx = tp->tf_rdx;
2112 	regs->r_rcx = tp->tf_rcx;
2113 	regs->r_rax = tp->tf_rax;
2114 	regs->r_rip = tp->tf_rip;
2115 	regs->r_cs = tp->tf_cs;
2116 	regs->r_rflags = tp->tf_rflags;
2117 	regs->r_rsp = tp->tf_rsp;
2118 	regs->r_ss = tp->tf_ss;
2119 	if (tp->tf_flags & TF_HASSEGS) {
2120 		regs->r_ds = tp->tf_ds;
2121 		regs->r_es = tp->tf_es;
2122 		regs->r_fs = tp->tf_fs;
2123 		regs->r_gs = tp->tf_gs;
2124 	} else {
2125 		regs->r_ds = 0;
2126 		regs->r_es = 0;
2127 		regs->r_fs = 0;
2128 		regs->r_gs = 0;
2129 	}
2130 	regs->r_err = 0;
2131 	regs->r_trapno = 0;
2132 	return (0);
2133 }
2134 
2135 int
2136 set_regs(struct thread *td, struct reg *regs)
2137 {
2138 	struct trapframe *tp;
2139 	register_t rflags;
2140 
2141 	tp = td->td_frame;
2142 	rflags = regs->r_rflags & 0xffffffff;
2143 	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
2144 		return (EINVAL);
2145 	tp->tf_r15 = regs->r_r15;
2146 	tp->tf_r14 = regs->r_r14;
2147 	tp->tf_r13 = regs->r_r13;
2148 	tp->tf_r12 = regs->r_r12;
2149 	tp->tf_r11 = regs->r_r11;
2150 	tp->tf_r10 = regs->r_r10;
2151 	tp->tf_r9  = regs->r_r9;
2152 	tp->tf_r8  = regs->r_r8;
2153 	tp->tf_rdi = regs->r_rdi;
2154 	tp->tf_rsi = regs->r_rsi;
2155 	tp->tf_rbp = regs->r_rbp;
2156 	tp->tf_rbx = regs->r_rbx;
2157 	tp->tf_rdx = regs->r_rdx;
2158 	tp->tf_rcx = regs->r_rcx;
2159 	tp->tf_rax = regs->r_rax;
2160 	tp->tf_rip = regs->r_rip;
2161 	tp->tf_cs = regs->r_cs;
2162 	tp->tf_rflags = rflags;
2163 	tp->tf_rsp = regs->r_rsp;
2164 	tp->tf_ss = regs->r_ss;
2165 	if (0) {	/* XXXKIB */
2166 		tp->tf_ds = regs->r_ds;
2167 		tp->tf_es = regs->r_es;
2168 		tp->tf_fs = regs->r_fs;
2169 		tp->tf_gs = regs->r_gs;
2170 		tp->tf_flags = TF_HASSEGS;
2171 	}
2172 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
2173 	return (0);
2174 }
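/*
 * fill_regs()/set_regs() back ptrace(2)'s PT_GETREGS and PT_SETREGS.
 * A minimal sketch of a tracer using them:
 *
 *	struct reg r;
 *	ptrace(PT_GETREGS, pid, (caddr_t)&r, 0);
 *	r.r_rip = new_pc;
 *	ptrace(PT_SETREGS, pid, (caddr_t)&r, 0);
 *
 * The EFL_SECURE()/CS_SECURE() checks in set_regs() are what keep such
 * a tracer from loading privileged rflags bits or an arbitrary %cs
 * selector into the traced thread.
 */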
2175 
2176 /* XXX check all this stuff! */
2177 /* externalize from sv_xmm */
2178 static void
2179 fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
2180 {
2181 	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2182 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
2183 	int i;
2184 
2185 	/* pcb -> fpregs */
2186 	bzero(fpregs, sizeof(*fpregs));
2187 
2188 	/* FPU control/status */
2189 	penv_fpreg->en_cw = penv_xmm->en_cw;
2190 	penv_fpreg->en_sw = penv_xmm->en_sw;
2191 	penv_fpreg->en_tw = penv_xmm->en_tw;
2192 	penv_fpreg->en_opcode = penv_xmm->en_opcode;
2193 	penv_fpreg->en_rip = penv_xmm->en_rip;
2194 	penv_fpreg->en_rdp = penv_xmm->en_rdp;
2195 	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
2196 	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
2197 
2198 	/* FPU registers */
2199 	for (i = 0; i < 8; ++i)
2200 		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
2201 
2202 	/* SSE registers */
2203 	for (i = 0; i < 16; ++i)
2204 		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
2205 }
2206 
2207 /* internalize from fpregs into sv_xmm */
2208 static void
2209 set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
2210 {
2211 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
2212 	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2213 	int i;
2214 
2215 	/* fpregs -> pcb */
2216 	/* FPU control/status */
2217 	penv_xmm->en_cw = penv_fpreg->en_cw;
2218 	penv_xmm->en_sw = penv_fpreg->en_sw;
2219 	penv_xmm->en_tw = penv_fpreg->en_tw;
2220 	penv_xmm->en_opcode = penv_fpreg->en_opcode;
2221 	penv_xmm->en_rip = penv_fpreg->en_rip;
2222 	penv_xmm->en_rdp = penv_fpreg->en_rdp;
2223 	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
2224 	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
2225 
2226 	/* FPU registers */
2227 	for (i = 0; i < 8; ++i)
2228 		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
2229 
2230 	/* SSE registers */
2231 	for (i = 0; i < 16; ++i)
2232 		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
2233 }
2234 
2235 /* externalize from td->pcb */
2236 int
2237 fill_fpregs(struct thread *td, struct fpreg *fpregs)
2238 {
2239 
2240 	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
2241 	    P_SHOULDSTOP(td->td_proc),
2242 	    ("not suspended thread %p", td));
2243 	fpugetregs(td);
2244 	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
2245 	return (0);
2246 }
2247 
2248 /* internalize to td->pcb */
2249 int
2250 set_fpregs(struct thread *td, struct fpreg *fpregs)
2251 {
2252 
2253 	critical_enter();
2254 	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
2255 	fpuuserinited(td);
2256 	critical_exit();
2257 	return (0);
2258 }
2259 
2260 /*
2261  * Get machine context.
2262  */
2263 int
2264 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
2265 {
2266 	struct pcb *pcb;
2267 	struct trapframe *tp;
2268 
2269 	pcb = td->td_pcb;
2270 	tp = td->td_frame;
2271 	PROC_LOCK(curthread->td_proc);
2272 	mcp->mc_onstack = sigonstack(tp->tf_rsp);
2273 	PROC_UNLOCK(curthread->td_proc);
2274 	mcp->mc_r15 = tp->tf_r15;
2275 	mcp->mc_r14 = tp->tf_r14;
2276 	mcp->mc_r13 = tp->tf_r13;
2277 	mcp->mc_r12 = tp->tf_r12;
2278 	mcp->mc_r11 = tp->tf_r11;
2279 	mcp->mc_r10 = tp->tf_r10;
2280 	mcp->mc_r9  = tp->tf_r9;
2281 	mcp->mc_r8  = tp->tf_r8;
2282 	mcp->mc_rdi = tp->tf_rdi;
2283 	mcp->mc_rsi = tp->tf_rsi;
2284 	mcp->mc_rbp = tp->tf_rbp;
2285 	mcp->mc_rbx = tp->tf_rbx;
2286 	mcp->mc_rcx = tp->tf_rcx;
2287 	mcp->mc_rflags = tp->tf_rflags;
2288 	if (flags & GET_MC_CLEAR_RET) {
2289 		mcp->mc_rax = 0;
2290 		mcp->mc_rdx = 0;
2291 		mcp->mc_rflags &= ~PSL_C;
2292 	} else {
2293 		mcp->mc_rax = tp->tf_rax;
2294 		mcp->mc_rdx = tp->tf_rdx;
2295 	}
2296 	mcp->mc_rip = tp->tf_rip;
2297 	mcp->mc_cs = tp->tf_cs;
2298 	mcp->mc_rsp = tp->tf_rsp;
2299 	mcp->mc_ss = tp->tf_ss;
2300 	mcp->mc_ds = tp->tf_ds;
2301 	mcp->mc_es = tp->tf_es;
2302 	mcp->mc_fs = tp->tf_fs;
2303 	mcp->mc_gs = tp->tf_gs;
2304 	mcp->mc_flags = tp->tf_flags;
2305 	mcp->mc_len = sizeof(*mcp);
2306 	get_fpcontext(td, mcp, NULL, 0);
2307 	update_pcb_bases(pcb);
2308 	mcp->mc_fsbase = pcb->pcb_fsbase;
2309 	mcp->mc_gsbase = pcb->pcb_gsbase;
2310 	mcp->mc_xfpustate = 0;
2311 	mcp->mc_xfpustate_len = 0;
2312 	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
2313 	return (0);
2314 }
2315 
2316 /*
2317  * Set machine context.
2318  *
2319  * Note that we set only the user-modifiable rflags bits, and we never
2320  * touch the %cs selector.
2321  */
2322 int
2323 set_mcontext(struct thread *td, mcontext_t *mcp)
2324 {
2325 	struct pcb *pcb;
2326 	struct trapframe *tp;
2327 	char *xfpustate;
2328 	long rflags;
2329 	int ret;
2330 
2331 	pcb = td->td_pcb;
2332 	tp = td->td_frame;
2333 	if (mcp->mc_len != sizeof(*mcp) ||
2334 	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
2335 		return (EINVAL);
2336 	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
2337 	    (tp->tf_rflags & ~PSL_USERCHANGE);
2338 	if (mcp->mc_flags & _MC_HASFPXSTATE) {
2339 		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
2340 		    sizeof(struct savefpu))
2341 			return (EINVAL);
2342 		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
2343 		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
2344 		    mcp->mc_xfpustate_len);
2345 		if (ret != 0)
2346 			return (ret);
2347 	} else
2348 		xfpustate = NULL;
2349 	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
2350 	if (ret != 0)
2351 		return (ret);
2352 	tp->tf_r15 = mcp->mc_r15;
2353 	tp->tf_r14 = mcp->mc_r14;
2354 	tp->tf_r13 = mcp->mc_r13;
2355 	tp->tf_r12 = mcp->mc_r12;
2356 	tp->tf_r11 = mcp->mc_r11;
2357 	tp->tf_r10 = mcp->mc_r10;
2358 	tp->tf_r9  = mcp->mc_r9;
2359 	tp->tf_r8  = mcp->mc_r8;
2360 	tp->tf_rdi = mcp->mc_rdi;
2361 	tp->tf_rsi = mcp->mc_rsi;
2362 	tp->tf_rbp = mcp->mc_rbp;
2363 	tp->tf_rbx = mcp->mc_rbx;
2364 	tp->tf_rdx = mcp->mc_rdx;
2365 	tp->tf_rcx = mcp->mc_rcx;
2366 	tp->tf_rax = mcp->mc_rax;
2367 	tp->tf_rip = mcp->mc_rip;
2368 	tp->tf_rflags = rflags;
2369 	tp->tf_rsp = mcp->mc_rsp;
2370 	tp->tf_ss = mcp->mc_ss;
2371 	tp->tf_flags = mcp->mc_flags;
2372 	if (tp->tf_flags & TF_HASSEGS) {
2373 		tp->tf_ds = mcp->mc_ds;
2374 		tp->tf_es = mcp->mc_es;
2375 		tp->tf_fs = mcp->mc_fs;
2376 		tp->tf_gs = mcp->mc_gs;
2377 	}
2378 	set_pcb_flags(pcb, PCB_FULL_IRET);
2379 	if (mcp->mc_flags & _MC_HASBASES) {
2380 		pcb->pcb_fsbase = mcp->mc_fsbase;
2381 		pcb->pcb_gsbase = mcp->mc_gsbase;
2382 	}
2383 	return (0);
2384 }
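/*
 * A userland view of the above, as a sketch: get_mcontext() and
 * set_mcontext() are the machine-dependent halves of getcontext(2),
 * setcontext(2) and swapcontext(2), where the mcontext_t lives inside
 * the ucontext_t:
 *
 *	ucontext_t uc;
 *	getcontext(&uc);
 *	uc.uc_mcontext.mc_rax = 0;	(tweak a saved register)
 *	setcontext(&uc);
 *
 * The PSL_USERCHANGE masking above is what keeps such a caller from
 * smuggling privileged rflags bits back into the kernel.
 */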
2385 
2386 static void
2387 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
2388     size_t xfpusave_len)
2389 {
2390 	size_t max_len, len;
2391 
2392 	mcp->mc_ownedfp = fpugetregs(td);
2393 	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
2394 	    sizeof(mcp->mc_fpstate));
2395 	mcp->mc_fpformat = fpuformat();
2396 	if (!use_xsave || xfpusave_len == 0)
2397 		return;
2398 	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
2399 	len = xfpusave_len;
2400 	if (len > max_len) {
2401 		bzero(xfpusave + max_len, len - max_len);
2402 		len = max_len;
2403 	}
2404 	mcp->mc_flags |= _MC_HASFPXSTATE;
2405 	mcp->mc_xfpustate_len = len;
2406 	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
2407 }
2408 
2409 static int
2410 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
2411     size_t xfpustate_len)
2412 {
2413 	int error;
2414 
2415 	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
2416 		return (0);
2417 	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
2418 		return (EINVAL);
2419 	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
2420 		/* We don't care what state is left in the FPU or PCB. */
2421 		fpstate_drop(td);
2422 		error = 0;
2423 	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
2424 	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
2425 		error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
2426 		    xfpustate, xfpustate_len);
2427 	} else
2428 		return (EINVAL);
2429 	return (error);
2430 }
2431 
2432 void
2433 fpstate_drop(struct thread *td)
2434 {
2435 
2436 	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
2437 	critical_enter();
2438 	if (PCPU_GET(fpcurthread) == td)
2439 		fpudrop();
2440 	/*
2441 	 * XXX force a full drop of the fpu.  The above only drops it if we
2442 	 * owned it.
2443 	 *
2444 	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
2445 	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
2446 	 * We only need to drop to !PCB_INITDONE in sendsig().  But
2447 	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
2448 	 * have too many layers.
2449 	 */
2450 	clear_pcb_flags(curthread->td_pcb,
2451 	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
2452 	critical_exit();
2453 }
2454 
2455 int
2456 fill_dbregs(struct thread *td, struct dbreg *dbregs)
2457 {
2458 	struct pcb *pcb;
2459 
2460 	if (td == NULL) {
2461 		dbregs->dr[0] = rdr0();
2462 		dbregs->dr[1] = rdr1();
2463 		dbregs->dr[2] = rdr2();
2464 		dbregs->dr[3] = rdr3();
2465 		dbregs->dr[6] = rdr6();
2466 		dbregs->dr[7] = rdr7();
2467 	} else {
2468 		pcb = td->td_pcb;
2469 		dbregs->dr[0] = pcb->pcb_dr0;
2470 		dbregs->dr[1] = pcb->pcb_dr1;
2471 		dbregs->dr[2] = pcb->pcb_dr2;
2472 		dbregs->dr[3] = pcb->pcb_dr3;
2473 		dbregs->dr[6] = pcb->pcb_dr6;
2474 		dbregs->dr[7] = pcb->pcb_dr7;
2475 	}
2476 	dbregs->dr[4] = 0;
2477 	dbregs->dr[5] = 0;
2478 	dbregs->dr[8] = 0;
2479 	dbregs->dr[9] = 0;
2480 	dbregs->dr[10] = 0;
2481 	dbregs->dr[11] = 0;
2482 	dbregs->dr[12] = 0;
2483 	dbregs->dr[13] = 0;
2484 	dbregs->dr[14] = 0;
2485 	dbregs->dr[15] = 0;
2486 	return (0);
2487 }
2488 
2489 int
2490 set_dbregs(struct thread *td, struct dbreg *dbregs)
2491 {
2492 	struct pcb *pcb;
2493 	int i;
2494 
2495 	if (td == NULL) {
2496 		load_dr0(dbregs->dr[0]);
2497 		load_dr1(dbregs->dr[1]);
2498 		load_dr2(dbregs->dr[2]);
2499 		load_dr3(dbregs->dr[3]);
2500 		load_dr6(dbregs->dr[6]);
2501 		load_dr7(dbregs->dr[7]);
2502 	} else {
2503 		/*
2504 		 * Don't let an illegal value for dr7 get set.  Specifically,
2505 		 * check for undefined settings.  Setting these bit patterns
2506 		 * results in undefined behaviour and can lead to an unexpected
2507 		 * TRCTRAP or a general protection fault right here.
2508 		 * The upper 32 bits of dr6 and dr7 must not be set.
2509 		 */
2510 		for (i = 0; i < 4; i++) {
2511 			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
2512 				return (EINVAL);
2513 			if (td->td_frame->tf_cs == _ucode32sel &&
2514 			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
2515 				return (EINVAL);
2516 		}
2517 		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
2518 		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
2519 			return (EINVAL);
2520 
2521 		pcb = td->td_pcb;
2522 
2523 		/*
2524 		 * Don't let a process set a breakpoint that is not within the
2525 		 * process's address space.  If a process could do this, it
2526 		 * could halt the system by setting a breakpoint in the kernel
2527 		 * (if ddb was enabled).  Thus, we need to check to make sure
2528 		 * that no breakpoints are being enabled for addresses outside
2529 		 * process's address space.
2530 		 *
2531 		 * XXX - what about when the watched area of the user's
2532 		 * address space is written into from within the kernel
2533 		 * ... wouldn't that still cause a breakpoint to be generated
2534 		 * from within kernel mode?
2535 		 */
2536 
2537 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
2538 			/* dr0 is enabled */
2539 			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
2540 				return (EINVAL);
2541 		}
2542 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
2543 			/* dr1 is enabled */
2544 			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
2545 				return (EINVAL);
2546 		}
2547 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
2548 			/* dr2 is enabled */
2549 			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
2550 				return (EINVAL);
2551 		}
2552 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
2553 			/* dr3 is enabled */
2554 			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
2555 				return (EINVAL);
2556 		}
2557 
2558 		pcb->pcb_dr0 = dbregs->dr[0];
2559 		pcb->pcb_dr1 = dbregs->dr[1];
2560 		pcb->pcb_dr2 = dbregs->dr[2];
2561 		pcb->pcb_dr3 = dbregs->dr[3];
2562 		pcb->pcb_dr6 = dbregs->dr[6];
2563 		pcb->pcb_dr7 = dbregs->dr[7];
2564 
2565 		set_pcb_flags(pcb, PCB_DBREGS);
2566 	}
2567 
2568 	return (0);
2569 }
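/*
 * fill_dbregs()/set_dbregs() back ptrace(2)'s PT_GETDBREGS and
 * PT_SETDBREGS.  As a sketch (assuming the DBREG_DR7_SET() helper from
 * machine/reg.h), a debugger arms a 4-byte write watchpoint in slot 0
 * roughly like this:
 *
 *	struct dbreg d;
 *	ptrace(PT_GETDBREGS, pid, (caddr_t)&d, 0);
 *	d.dr[0] = (unsigned long)watch_addr;
 *	d.dr[7] |= DBREG_DR7_SET(0, DBREG_DR7_LEN_4, DBREG_DR7_WRONLY,
 *	    DBREG_DR7_LOCAL_ENABLE);
 *	ptrace(PT_SETDBREGS, pid, (caddr_t)&d, 0);
 *
 * The checks above then reject reserved access types, 8-byte lengths
 * for 32-bit processes, and breakpoint addresses outside the target's
 * address space.
 */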
2570 
2571 void
2572 reset_dbregs(void)
2573 {
2574 
2575 	load_dr7(0);	/* Turn off the control bits first */
2576 	load_dr0(0);
2577 	load_dr1(0);
2578 	load_dr2(0);
2579 	load_dr3(0);
2580 	load_dr6(0);
2581 }
2582 
2583 /*
2584  * Return > 0 if a hardware breakpoint has been hit, and the
2585  * breakpoint was in user space.  Return 0, otherwise.
2586  */
2587 int
2588 user_dbreg_trap(register_t dr6)
2589 {
2590 	uint64_t dr7;
2591 	uint64_t bp;		/* breakpoint bits extracted from dr6 */
2592 	int nbp;		/* number of breakpoints that triggered */
2593 	caddr_t addr[4];	/* breakpoint addresses */
2594 	int i;
2595 
2596 	bp = dr6 & DBREG_DR6_BMASK;
2597 	if (bp == 0) {
2598 		/*
2599 		 * None of the breakpoint bits are set, meaning this
2600 		 * trap was not caused by any of the debug registers.
2601 		 */
2602 		return (0);
2603 	}
2604 
2605 	dr7 = rdr7();
2606 	if ((dr7 & 0x000000ff) == 0) {
2607 		/*
2608 		 * None of the breakpoint enable bits (L0..L3, G0..G3)
2609 		 * in the low byte of %dr7 are set, so the trap cannot
2610 		 * have been caused by the hardware debug registers.
2611 		 */
2612 		return (0);
2613 	}
2614 
2615 	nbp = 0;
2616 
2617 	/*
2618 	 * At least one of the breakpoints was hit; check which
2619 	 * ones, and whether any of them are user space addresses.
2620 	 */
2621 
2622 	if (bp & 0x01) {
2623 		addr[nbp++] = (caddr_t)rdr0();
2624 	}
2625 	if (bp & 0x02) {
2626 		addr[nbp++] = (caddr_t)rdr1();
2627 	}
2628 	if (bp & 0x04) {
2629 		addr[nbp++] = (caddr_t)rdr2();
2630 	}
2631 	if (bp & 0x08) {
2632 		addr[nbp++] = (caddr_t)rdr3();
2633 	}
2634 
2635 	for (i = 0; i < nbp; i++) {
2636 		if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
2637 			/*
2638 			 * addr[i] is in user space.
2639 			 */
2640 			return (nbp);
2641 		}
2642 	}
2643 
2644 	/*
2645 	 * None of the breakpoints are in user space.
2646 	 */
2647 	return (0);
2648 }
2649 
2650 /*
2651  * The pcb_flags field is only modified by the current thread, or by other
2652  * threads when the current thread is stopped.  However, the current thread
2653  * may change it from interrupt context in cpu_switch(), or in the trap
2654  * handler.  When we read-modify-write pcb_flags from C sources, the
2655  * compiler may generate code that is not atomic with respect to the
2656  * interrupt handler.  If a trap or interrupt happens and a flag is modified
2657  * from the handler, that update can later be clobbered by the cached
2658  * value.  Therefore we set and clear the flags with single-instruction
2659  * functions, which cannot race with the trap and interrupt handlers
2660  * because traps and interrupts are taken only on instruction boundaries.
2661  */
2662 void
2663 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
2664 {
2665 
2666 	__asm __volatile("orl %1,%0"
2667 	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
2668 	    : "cc", "memory");
2669 
2670 }
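/*
 * For contrast, the plain C spelling of the operation above,
 *
 *	pcb->pcb_flags |= flags;
 *
 * may be compiled into separate load, or, and store instructions.  An
 * interrupt arriving between the load and the store could modify
 * pcb_flags, only to have its update clobbered by the store of the
 * stale value; the single "orl" read-modify-write instruction cannot
 * be split that way.
 */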
2671 
2672 /*
2673  * Support for RDFSBASE, WRFSBASE and similar instructions for the %fs
2674  * and %gs bases requires that the kernel save MSR_FSBASE and
2675  * MSR_{K,}GSBASE into the pcb if user space modified the bases.  We must
2676  * save them on the context switch or when returning to usermode via doreti.
2677  *
2678  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
2679  * which has the consequence that the base MSRs must be saved each time
2680  * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
2681  * context switches.
2682  */
2683 static void
2684 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
2685 {
2686 	register_t r;
2687 
2688 	if (curpcb == pcb &&
2689 	    (flags & PCB_FULL_IRET) != 0 &&
2690 	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
2691 		r = intr_disable();
2692 		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
2693 			if (rfs() == _ufssel)
2694 				pcb->pcb_fsbase = rdfsbase();
2695 			if (rgs() == _ugssel)
2696 				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
2697 		}
2698 		set_pcb_flags_raw(pcb, flags);
2699 		intr_restore(r);
2700 	} else {
2701 		set_pcb_flags_raw(pcb, flags);
2702 	}
2703 }
2704 
2705 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
2706 {
2707 
2708 	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
2709 	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
2710 }
2711 
2712 void
2713 clear_pcb_flags(struct pcb *pcb, const u_int flags)
2714 {
2715 
2716 	__asm __volatile("andl %1,%0"
2717 	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
2718 	    : "cc", "memory");
2719 }
2720 
2721 #ifdef KDB
2722 
2723 /*
2724  * Provide inb() and outb() as functions.  They are normally only available as
2725  * inline functions, thus cannot be called from the debugger.
2726  */
2727 
2728 /* silence compiler warnings */
2729 u_char inb_(u_short);
2730 void outb_(u_short, u_char);
2731 
2732 u_char
2733 inb_(u_short port)
2734 {
2735 	return inb(port);
2736 }
2737 
2738 void
2739 outb_(u_short port, u_char data)
2740 {
2741 	outb(port, data);
2742 }
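/*
 * For example, from the ddb(4) prompt one can inspect or prod an I/O
 * port with the debugger's call command:
 *
 *	db> call inb_(0x21)
 *	db> call outb_(0x21, 0xff)
 *
 * (0x21, the legacy master 8259 interrupt mask register, is used here
 * purely for illustration).
 */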
2743 
2744 #endif /* KDB */
2745 
2746 #undef memset
2747 #undef memmove
2748 #undef memcpy
2749 
2750 void	*memset_std(void *buf, int c, size_t len);
2751 void	*memset_erms(void *buf, int c, size_t len);
2752 void    *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
2753 	    size_t len);
2754 void    *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
2755 	    size_t len);
2756 void    *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
2757 	    size_t len);
2758 void    *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
2759 	    size_t len);
2760 
2761 #ifdef KCSAN
2762 /*
2763  * These fail to build as ifuncs when used with KCSAN.
2764  */
2765 void *
2766 memset(void *buf, int c, size_t len)
2767 {
2768 
2769 	return (memset_std(buf, c, len));
2770 }
2771 
2772 void *
2773 memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
2774 {
2775 
2776 	return (memmove_std(dst, src, len));
2777 }
2778 
2779 void *
2780 memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
2781 {
2782 
2783 	return (memcpy_std(dst, src, len));
2784 }
2785 #else
2786 DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
2787 {
2788 
2789 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2790 	    memset_erms : memset_std);
2791 }
2792 
2793 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
2794     size_t))
2795 {
2796 
2797 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2798 	    memmove_erms : memmove_std);
2799 }
2800 
2801 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
2802 {
2803 
2804 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2805 	    memcpy_erms : memcpy_std);
2806 }
2807 #endif
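/*
 * A sketch of what DEFINE_IFUNC() amounts to (names here are
 * illustrative, not the real expansion):
 *
 *	static void *memset_resolver(void)
 *	{
 *		return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
 *		    memset_erms : memset_std);
 *	}
 *	void *memset(void *, int, size_t)
 *	    __attribute__((ifunc("memset_resolver")));
 *
 * The resolver runs once, when the kernel linker processes its
 * IRELATIVE relocations during early boot, so later calls to memset()
 * jump straight to the chosen implementation with no per-call test of
 * CPUID_STDEXT_ERMS (the Enhanced REP MOVSB/STOSB feature bit).
 */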
2808 
2809 void	pagezero_std(void *addr);
2810 void	pagezero_erms(void *addr);
2811 DEFINE_IFUNC(, void , pagezero, (void *))
2812 {
2813 
2814 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2815 	    pagezero_erms : pagezero_std);
2816 }
2817