xref: /freebsd/sys/amd64/amd64/machdep.c (revision 190cef3d)
/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_atpic.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_pci.h"
#include "opt_platform.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>
#include <vm/vm_phys.h>

#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif

#include <net/netisr.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/frame.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#include <machine/trap.h>
#include <machine/tss.h>
#include <x86/ucode.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef FDT
#include <x86/fdt.h>
#endif

#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <x86/apicvar.h>
#endif

#include <isa/isareg.h>
#include <isa/rtc.h>
#include <x86/init.h>

/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);

/*
 * The PTI trampoline stack needs enough space for a hardware trapframe and a
 * couple of scratch registers, as well as the trapframe left behind after an
 * iret fault.
 */
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
    offsetof(struct pti_frame, pti_rip));

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

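/*
 * Validation applied to user-supplied context in sys_sigreturn() below:
 * CS_SECURE() accepts only user-privilege %cs selectors, and EFL_SECURE()
 * rejects changes to any %rflags bits outside of PSL_USERCHANGE.
 */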
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)

static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpusave, size_t xfpusave_len);
static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpustate, size_t xfpustate_len);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/* Default init_ops implementation. */
struct init_ops init_ops = {
	.parse_preload_data =	native_parse_preload_data,
	.early_clock_source_init =	i8254_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
#ifdef SMP
	.mp_bootaddress =		mp_bootaddress,
	.start_all_aps =		native_start_all_aps,
#endif
#ifdef DEV_PCI
	.msi_init =			msi_init,
#endif
};

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;

/* Intel ICH registers */
#define ICH_PMBASE	0x400
#define ICH_SMI_EN	(ICH_PMBASE + 0x30)

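/* User-mode segment selector values, initialized in hammer_time(). */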
int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

int cold = 1;

long Maxmem = 0;
long realmem = 0;

/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))

vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* must be 2 less so 0 0 can signal end of chunks */
#define	PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
#define	DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)

struct kva_md_info kmi;

static struct trapframe proc0_tf;
struct region_descriptor r_gdt, r_idt;

struct pcpu __pcpu[MAXCPU];

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

void (*vmm_resume_p)(void);

static void
cpu_startup(void *dummy)
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to disallow the legacy USB circuit to
	 * generate an SMI# because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();

	/*
	 * Display physical memory if SMBIOS reports a reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	if (memsize < ptoa((uintmax_t)vm_free_count()))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_free_count()),
	    ptoa((uintmax_t)vm_free_count()) / 1048576);

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

/*
 * Send an interrupt to a process.
 *
 * The stack is set up so that the sigcode stored at its top can call
 * the handler, followed by a call to the sigreturn routine below.
 * After sigreturn resets the signal mask, the stack, and the frame
 * pointer, it returns to the user-specified pc and psl.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	update_pcb_bases(pcb);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
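		/*
		 * Skip the 128-byte red zone that the amd64 ABI reserves
		 * below %rsp for the interrupted user code.
		 */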
		sp = (char *)regs->tf_rsp - 128;
	if (xfpusave != NULL) {
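		/*
		 * Reserve room for the extended FPU state above the
		 * sigframe and align it to 64 bytes, as the XSAVE area
		 * requires.
		 */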
		sp -= xfpusave_len;
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}

/*
 * System call to clean up state after a signal
 * has been taken.  Reset the signal mask and
 * stack state from the context left by sendsig (above).
 * Return to the previous pc and psl as specified by
 * the context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
int
sys_sigreturn(struct thread *td, struct sigreturn_args /* {
	const struct __ucontext *sigcntxp;
} */ *uap)
{
	ucontext_t uc;
	struct pcb *pcb;
	struct proc *p;
	struct trapframe *regs;
	ucontext_t *ucp;
	char *xfpustate;
	size_t xfpustate_len;
	long rflags;
	int cs, error, ret;
	ksiginfo_t ksi;

	pcb = td->td_pcb;
	p = td->td_proc;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0) {
		uprintf("pid %d (%s): sigreturn copyin failed\n",
		    p->p_pid, td->td_name);
		return (error);
	}
	ucp = &uc;
	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
		    td->td_name, ucp->uc_mcontext.mc_flags);
		return (EINVAL);
	}
	regs = td->td_frame;
	rflags = ucp->uc_mcontext.mc_rflags;
	/*
	 * Don't allow users to change privileged or reserved flags.
	 */
	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
		    td->td_name, rflags);
		return (EINVAL);
	}

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
	cs = ucp->uc_mcontext.mc_cs;
	if (!CS_SECURE(cs)) {
		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
		    td->td_name, cs);
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_rip;
		trapsignal(td, &ksi);
		return (EINVAL);
	}
	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
		if (xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu)) {
			uprintf("pid %d (%s): sigreturn xfpustate_len = 0x%zx\n",
			    p->p_pid, td->td_name, xfpustate_len);
			return (EINVAL);
		}
		xfpustate = __builtin_alloca(xfpustate_len);
		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
		    xfpustate, xfpustate_len);
		if (error != 0) {
			uprintf(
	"pid %d (%s): sigreturn copying xfpustate failed\n",
			    p->p_pid, td->td_name);
			return (error);
		}
	} else {
		xfpustate = NULL;
		xfpustate_len = 0;
	}
	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
	if (ret != 0) {
		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
		    p->p_pid, td->td_name, ret);
		return (ret);
	}
	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
	update_pcb_bases(pcb);
	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;

#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
	return (EJUSTRETURN);
}

#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{

	return sys_sigreturn(td, (struct sigreturn_args *)uap);
}
#endif

/*
 * Reset registers to default values on exec.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
	struct trapframe *regs;
	struct pcb *pcb;
	register_t saved_rflags;

	regs = td->td_frame;
	pcb = td->td_pcb;

	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);

	update_pcb_bases(pcb);
	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	clear_pcb_flags(pcb, PCB_32BIT);
	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;

	saved_rflags = regs->tf_rflags & PSL_T;
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = imgp->entry_addr;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | saved_rflags;
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == curpcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		clear_pcb_flags(pcb, PCB_DBREGS);
	}

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP.  See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

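/*
 * Dedicated stacks for the double fault, MC#, NMI and DB# handlers;
 * hammer_time() points the TSS IST entries at their tops.
 */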
static char dblfault_stack[PAGE_SIZE] __aligned(16);
static char mce0_stack[PAGE_SIZE] __aligned(16);
static char nmi0_stack[PAGE_SIZE] __aligned(16);
static char dbg0_stack[PAGE_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

struct amd64tss common_tss[MAXCPU];

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Actually, the TSS is a system descriptor which is double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};

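/*
 * Install an IDT gate.  'idx' is the vector, 'func' the handler entry
 * point, 'typ' the gate type (e.g. SDT_SYSIGT), 'dpl' the privilege
 * level required to raise the vector from software, and 'ist' the
 * Interrupt Stack Table slot to switch to (0 means no IST switch).
 */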
void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func) >> 16;
}

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t0x%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif

void
sdtossd(struct user_segment_descriptor *sd,
    struct soft_segment_descriptor *ssd)
{

	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_long  = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

void
ssdtosd(struct soft_segment_descriptor *ssd,
    struct user_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;
}

void
ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
}

#if !defined(DEV_ATPIC) && defined(DEV_ISA)
#include <isa/isavar.h>
#include <isa/isareg.h>
/*
 * Return a bitmap of the current interrupt requests.  This is 8259-specific
 * and is only suitable for use at probe time.
 * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
 * It shouldn't be here.  There should probably be an APIC centric
 * implementation in the apic driver code, if at all.
 */
intrmask_t
isa_irq_pending(void)
{
	u_char irr1;
	u_char irr2;

	irr1 = inb(IO_ICU1);
	irr2 = inb(IO_ICU2);
	return ((irr2 << 8) | irr1);
}
#endif

u_int basemem;

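/*
 * Insert a region into the physmap array of sorted, non-overlapping
 * [base, end) pairs, merging it with an adjacent entry when possible.
 * Returns 0 only when the array is full; an overlapping region is
 * dropped (with a warning when booting verbose) but still returns 1.
 */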
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYSMAP_SIZE) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
                      vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
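	/* The descriptor array follows the header, padded to 16 bytes. */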
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}

static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");

static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
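	/*
	 * parse_memmap() leaves physmap_idx at the next free slot; step
	 * back so it indexes the base of the last entry pair.
	 */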
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it\n");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * Make hole for "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and the APs for the system start
	 * in real mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress)
		init_ops.mp_bootaddress(physmap, &physmap_idx);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write, non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one page beyond the last
			 * valid page, making the range >= start and < end.
			 * If we're also doing a speculative memory test
			 * and we're at or past the end, bump up Maxmem so
			 * that we keep going.  The first bad page will
			 * terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}

static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/* Set up the fast syscall stuff */
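/*
 * SYSCALL takes its 64-bit entry point from MSR_LSTAR (MSR_CSTAR for
 * 32-bit callers), clears the %rflags bits named in MSR_SF_MASK, and
 * loads the kernel selectors from bits 47:32 of MSR_STAR; SYSRET
 * derives the user selectors from bits 63:48.
 */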
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}

u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	struct nmi_pcpu *np;
	struct xstate_hdr *xhdr;
	u_int64_t rsp0;
	char *env;
	size_t kstack0_sz;
	int late_console;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	kmdp = init_ops.parse_preload_data(modulep);

	physfree += ucode_load_bsp(physfree + KERNBASE);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	/*
	 * hw.cpu_stdext_disable is ignored by this call; it will be
	 * re-evaluated by the call to finishidentcpu() below.
	 */
	identify_cpu2();

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high-level
	 * components in it.  If so, just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc. */
	init_param1();

	thread0.td_kstack = physfree + KERNBASE;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long) gdt;
	lgdt(&r_gdt);
	pc = &__pcpu[0];

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	pcpu_init(pc, 0, sizeof(struct pcpu));
	dpcpu_init((void *)(physfree + KERNBASE), 0);
	physfree += DPCPU_SIZE;
	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);
	/* Non-late cninit() and printf() can be moved up to here. */
	PCPU_SET(tssp, &common_tss[0]);
	PCPU_SET(commontssp, &common_tss[0]);
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);

	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).  Once the bootblocks have been updated, we can
	 * test directly for efi_systbl != NULL here...
	 */
	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
	    != NULL)
		vty_set_preferred(VTY_VT);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */
	initializecpucache();

	/* doublefault stack space, runs on ist1 */
	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist2 = (long) np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist3 = (long) np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist4 = (long) np;

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();


	/*
	 * Temporarily forge a valid pointer to the PCB for the exception
	 * handlers.  It is reinitialized properly below, after the FPU is
	 * set up.  Also set td_critnest to short-cut the page fault
	 * handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* Now running on new page tables, configured, and u/iom is accessible. */

	if (late_console)
		cninit();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?"
#endif
1802 
1803 	if (late_console)
1804 		amd64_kdb_init();
1805 
1806 	msgbufinit(msgbufp, msgbufsize);
1807 	fpuinit();
1808 
	/*
	 * Set up the thread0 PCB after fpuinit() has calculated the PCB +
	 * FPU save area size.  Zero out the extended state header in the
	 * FPU save area.
	 */
1814 	thread0.td_pcb = get_pcb_td(&thread0);
1815 	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
1816 	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
1817 	if (use_xsave) {
1818 		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
1819 		    1);
1820 		xhdr->xstate_bv = xsave_mask;
1821 	}
	/* Make an initial TSS so the CPU can get an interrupt stack on syscall. */
1823 	rsp0 = (vm_offset_t)thread0.td_pcb;
1824 	/* Ensure the stack is aligned to 16 bytes */
1825 	rsp0 &= ~0xFul;
1826 	common_tss[0].tss_rsp0 = rsp0;
1827 	PCPU_SET(rsp0, rsp0);
1828 	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1829 	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1830 	PCPU_SET(curpcb, thread0.td_pcb);
1831 
1832 	/* transfer to user mode */
1833 
1834 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1835 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1836 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1837 	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1838 	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1839 
1840 	load_ds(_udatasel);
1841 	load_es(_udatasel);
1842 	load_fs(_ufssel);
1843 
	/* Set up proc 0's PCB. */
1845 	thread0.td_pcb->pcb_flags = 0;
1846 	thread0.td_frame = &proc0_tf;
1847 
	env = kern_getenv("kernelname");
1849 	if (env != NULL)
1850 		strlcpy(kernelname, env, sizeof(kernelname));
1851 
1852 	cpu_probe_amdc1e();
1853 
1854 #ifdef FDT
1855 	x86_init_fdt();
1856 #endif
1857 	thread0.td_critnest = 0;
1858 
1859 	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1860 	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1861 
1862 	TSEXIT();
1863 
1864 	/* Location of kernel stack for locore */
1865 	return ((u_int64_t)thread0.td_pcb);
1866 }
1867 
1868 void
1869 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1870 {
1871 
1872 	pcpu->pc_acpi_id = 0xffffffff;
1873 }
1874 
1875 static int
1876 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1877 {
1878 	struct bios_smap *smapbase;
1879 	struct bios_smap_xattr smap;
1880 	caddr_t kmdp;
1881 	uint32_t *smapattr;
1882 	int count, error, i;
1883 
1884 	/* Retrieve the system memory map from the loader. */
1885 	kmdp = preload_search_by_type("elf kernel");
1886 	if (kmdp == NULL)
1887 		kmdp = preload_search_by_type("elf64 kernel");
1888 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
1889 	    MODINFO_METADATA | MODINFOMD_SMAP);
1890 	if (smapbase == NULL)
1891 		return (0);
1892 	smapattr = (uint32_t *)preload_search_info(kmdp,
1893 	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
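	/*
	 * Editor's note: the preload metadata stores each record's 32-bit
	 * size immediately before the data pointer returned by
	 * preload_search_info(), so the word just below smapbase is the
	 * table size in bytes.
	 */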
1894 	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1895 	error = 0;
1896 	for (i = 0; i < count; i++) {
1897 		smap.base = smapbase[i].base;
1898 		smap.length = smapbase[i].length;
1899 		smap.type = smapbase[i].type;
1900 		if (smapattr != NULL)
1901 			smap.xattr = smapattr[i];
1902 		else
1903 			smap.xattr = 0;
1904 		error = SYSCTL_OUT(req, &smap, sizeof(smap));
1905 	}
1906 	return (error);
1907 }
1908 SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1909     smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
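
/*
 * Illustrative usage (editor's sketch, assuming the stock sysctl(8)
 * utility): the raw opaque table can be dumped from userland with
 * something like "sysctl -x machdep.smap".
 */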
1910 
1911 static int
1912 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1913 {
1914 	struct efi_map_header *efihdr;
1915 	caddr_t kmdp;
1916 	uint32_t efisize;
1917 
1918 	kmdp = preload_search_by_type("elf kernel");
1919 	if (kmdp == NULL)
1920 		kmdp = preload_search_by_type("elf64 kernel");
1921 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1922 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1923 	if (efihdr == NULL)
1924 		return (0);
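	/*
	 * Editor's note: as with the SMAP table above, the 32-bit word
	 * preceding the returned metadata pointer holds its size in bytes.
	 */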
1925 	efisize = *((uint32_t *)efihdr - 1);
1926 	return (SYSCTL_OUT(req, efihdr, efisize));
1927 }
1928 SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1929     efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
1930 
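/*
 * Per-thread spinlock nesting: interrupts are disabled and a critical
 * section is entered only on the outermost spinlock_enter(), and the
 * saved interrupt state is restored only when the nesting count drops
 * back to zero.
 *
 * Illustrative usage (editor's sketch):
 *
 *	spinlock_enter();
 *	... touch state shared with interrupt handlers on this CPU ...
 *	spinlock_exit();
 */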
1931 void
1932 spinlock_enter(void)
1933 {
1934 	struct thread *td;
1935 	register_t flags;
1936 
1937 	td = curthread;
1938 	if (td->td_md.md_spinlock_count == 0) {
1939 		flags = intr_disable();
1940 		td->td_md.md_spinlock_count = 1;
1941 		td->td_md.md_saved_flags = flags;
1942 		critical_enter();
1943 	} else
1944 		td->td_md.md_spinlock_count++;
1945 }
1946 
1947 void
1948 spinlock_exit(void)
1949 {
1950 	struct thread *td;
1951 	register_t flags;
1952 
1953 	td = curthread;
1954 	flags = td->td_md.md_saved_flags;
1955 	td->td_md.md_spinlock_count--;
1956 	if (td->td_md.md_spinlock_count == 0) {
1957 		critical_exit();
1958 		intr_restore(flags);
1959 	}
1960 }
1961 
1962 /*
1963  * Construct a PCB from a trapframe. This is called from kdb_trap() where
1964  * we want to start a backtrace from the function that caused us to enter
1965  * the debugger. We have the context in the trapframe, but base the trace
1966  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1967  * enough for a backtrace.
1968  */
1969 void
1970 makectx(struct trapframe *tf, struct pcb *pcb)
1971 {
1972 
1973 	pcb->pcb_r12 = tf->tf_r12;
1974 	pcb->pcb_r13 = tf->tf_r13;
1975 	pcb->pcb_r14 = tf->tf_r14;
1976 	pcb->pcb_r15 = tf->tf_r15;
1977 	pcb->pcb_rbp = tf->tf_rbp;
1978 	pcb->pcb_rbx = tf->tf_rbx;
1979 	pcb->pcb_rip = tf->tf_rip;
1980 	pcb->pcb_rsp = tf->tf_rsp;
1981 }
1982 
1983 int
1984 ptrace_set_pc(struct thread *td, unsigned long addr)
1985 {
1986 
1987 	td->td_frame->tf_rip = addr;
1988 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
1989 	return (0);
1990 }
1991 
1992 int
1993 ptrace_single_step(struct thread *td)
1994 {
1995 
1996 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
1997 	if ((td->td_frame->tf_rflags & PSL_T) == 0) {
1998 		td->td_frame->tf_rflags |= PSL_T;
1999 		td->td_dbgflags |= TDB_STEP;
2000 	}
2001 	return (0);
2002 }
2003 
2004 int
2005 ptrace_clear_single_step(struct thread *td)
2006 {
2007 
2008 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2009 	td->td_frame->tf_rflags &= ~PSL_T;
2010 	td->td_dbgflags &= ~TDB_STEP;
2011 	return (0);
2012 }
2013 
2014 int
2015 fill_regs(struct thread *td, struct reg *regs)
2016 {
2017 	struct trapframe *tp;
2018 
2019 	tp = td->td_frame;
2020 	return (fill_frame_regs(tp, regs));
2021 }
2022 
2023 int
2024 fill_frame_regs(struct trapframe *tp, struct reg *regs)
2025 {
2026 	regs->r_r15 = tp->tf_r15;
2027 	regs->r_r14 = tp->tf_r14;
2028 	regs->r_r13 = tp->tf_r13;
2029 	regs->r_r12 = tp->tf_r12;
2030 	regs->r_r11 = tp->tf_r11;
2031 	regs->r_r10 = tp->tf_r10;
2032 	regs->r_r9  = tp->tf_r9;
2033 	regs->r_r8  = tp->tf_r8;
2034 	regs->r_rdi = tp->tf_rdi;
2035 	regs->r_rsi = tp->tf_rsi;
2036 	regs->r_rbp = tp->tf_rbp;
2037 	regs->r_rbx = tp->tf_rbx;
2038 	regs->r_rdx = tp->tf_rdx;
2039 	regs->r_rcx = tp->tf_rcx;
2040 	regs->r_rax = tp->tf_rax;
2041 	regs->r_rip = tp->tf_rip;
2042 	regs->r_cs = tp->tf_cs;
2043 	regs->r_rflags = tp->tf_rflags;
2044 	regs->r_rsp = tp->tf_rsp;
2045 	regs->r_ss = tp->tf_ss;
2046 	if (tp->tf_flags & TF_HASSEGS) {
2047 		regs->r_ds = tp->tf_ds;
2048 		regs->r_es = tp->tf_es;
2049 		regs->r_fs = tp->tf_fs;
2050 		regs->r_gs = tp->tf_gs;
2051 	} else {
2052 		regs->r_ds = 0;
2053 		regs->r_es = 0;
2054 		regs->r_fs = 0;
2055 		regs->r_gs = 0;
2056 	}
2057 	return (0);
2058 }
2059 
2060 int
2061 set_regs(struct thread *td, struct reg *regs)
2062 {
2063 	struct trapframe *tp;
2064 	register_t rflags;
2065 
2066 	tp = td->td_frame;
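	/*
	 * Editor's note: EFL_SECURE() and CS_SECURE() below reject any
	 * attempt to change privileged rflags bits or to load a non-user
	 * %cs selector via ptrace(2)-style register writes.
	 */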
2067 	rflags = regs->r_rflags & 0xffffffff;
2068 	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
2069 		return (EINVAL);
2070 	tp->tf_r15 = regs->r_r15;
2071 	tp->tf_r14 = regs->r_r14;
2072 	tp->tf_r13 = regs->r_r13;
2073 	tp->tf_r12 = regs->r_r12;
2074 	tp->tf_r11 = regs->r_r11;
2075 	tp->tf_r10 = regs->r_r10;
2076 	tp->tf_r9  = regs->r_r9;
2077 	tp->tf_r8  = regs->r_r8;
2078 	tp->tf_rdi = regs->r_rdi;
2079 	tp->tf_rsi = regs->r_rsi;
2080 	tp->tf_rbp = regs->r_rbp;
2081 	tp->tf_rbx = regs->r_rbx;
2082 	tp->tf_rdx = regs->r_rdx;
2083 	tp->tf_rcx = regs->r_rcx;
2084 	tp->tf_rax = regs->r_rax;
2085 	tp->tf_rip = regs->r_rip;
2086 	tp->tf_cs = regs->r_cs;
2087 	tp->tf_rflags = rflags;
2088 	tp->tf_rsp = regs->r_rsp;
2089 	tp->tf_ss = regs->r_ss;
2090 	if (0) {	/* XXXKIB */
2091 		tp->tf_ds = regs->r_ds;
2092 		tp->tf_es = regs->r_es;
2093 		tp->tf_fs = regs->r_fs;
2094 		tp->tf_gs = regs->r_gs;
2095 		tp->tf_flags = TF_HASSEGS;
2096 	}
2097 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
2098 	return (0);
2099 }
2100 
2101 /* XXX check all this stuff! */
2102 /* externalize from sv_xmm */
2103 static void
2104 fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
2105 {
2106 	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2107 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
2108 	int i;
2109 
2110 	/* pcb -> fpregs */
2111 	bzero(fpregs, sizeof(*fpregs));
2112 
2113 	/* FPU control/status */
2114 	penv_fpreg->en_cw = penv_xmm->en_cw;
2115 	penv_fpreg->en_sw = penv_xmm->en_sw;
2116 	penv_fpreg->en_tw = penv_xmm->en_tw;
2117 	penv_fpreg->en_opcode = penv_xmm->en_opcode;
2118 	penv_fpreg->en_rip = penv_xmm->en_rip;
2119 	penv_fpreg->en_rdp = penv_xmm->en_rdp;
2120 	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
2121 	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
2122 
	/* FPU registers: eight 80-bit (10-byte) x87 registers. */
2124 	for (i = 0; i < 8; ++i)
2125 		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
2126 
	/* SSE registers: sixteen 128-bit (16-byte) XMM registers. */
2128 	for (i = 0; i < 16; ++i)
2129 		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
2130 }
2131 
2132 /* internalize from fpregs into sv_xmm */
2133 static void
2134 set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
2135 {
2136 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
2137 	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2138 	int i;
2139 
2140 	/* fpregs -> pcb */
2141 	/* FPU control/status */
2142 	penv_xmm->en_cw = penv_fpreg->en_cw;
2143 	penv_xmm->en_sw = penv_fpreg->en_sw;
2144 	penv_xmm->en_tw = penv_fpreg->en_tw;
2145 	penv_xmm->en_opcode = penv_fpreg->en_opcode;
2146 	penv_xmm->en_rip = penv_fpreg->en_rip;
2147 	penv_xmm->en_rdp = penv_fpreg->en_rdp;
2148 	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
2149 	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
2150 
	/* FPU registers: eight 80-bit (10-byte) x87 registers. */
2152 	for (i = 0; i < 8; ++i)
2153 		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
2154 
	/* SSE registers: sixteen 128-bit (16-byte) XMM registers. */
2156 	for (i = 0; i < 16; ++i)
2157 		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
2158 }
2159 
2160 /* externalize from td->pcb */
2161 int
2162 fill_fpregs(struct thread *td, struct fpreg *fpregs)
2163 {
2164 
2165 	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
2166 	    P_SHOULDSTOP(td->td_proc),
2167 	    ("not suspended thread %p", td));
2168 	fpugetregs(td);
2169 	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
2170 	return (0);
2171 }
2172 
2173 /* internalize to td->pcb */
2174 int
2175 set_fpregs(struct thread *td, struct fpreg *fpregs)
2176 {
2177 
2178 	critical_enter();
2179 	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
2180 	fpuuserinited(td);
2181 	critical_exit();
2182 	return (0);
2183 }
2184 
2185 /*
2186  * Get machine context.
2187  */
2188 int
2189 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
2190 {
2191 	struct pcb *pcb;
2192 	struct trapframe *tp;
2193 
2194 	pcb = td->td_pcb;
2195 	tp = td->td_frame;
2196 	PROC_LOCK(curthread->td_proc);
2197 	mcp->mc_onstack = sigonstack(tp->tf_rsp);
2198 	PROC_UNLOCK(curthread->td_proc);
2199 	mcp->mc_r15 = tp->tf_r15;
2200 	mcp->mc_r14 = tp->tf_r14;
2201 	mcp->mc_r13 = tp->tf_r13;
2202 	mcp->mc_r12 = tp->tf_r12;
2203 	mcp->mc_r11 = tp->tf_r11;
2204 	mcp->mc_r10 = tp->tf_r10;
2205 	mcp->mc_r9  = tp->tf_r9;
2206 	mcp->mc_r8  = tp->tf_r8;
2207 	mcp->mc_rdi = tp->tf_rdi;
2208 	mcp->mc_rsi = tp->tf_rsi;
2209 	mcp->mc_rbp = tp->tf_rbp;
2210 	mcp->mc_rbx = tp->tf_rbx;
2211 	mcp->mc_rcx = tp->tf_rcx;
2212 	mcp->mc_rflags = tp->tf_rflags;
2213 	if (flags & GET_MC_CLEAR_RET) {
2214 		mcp->mc_rax = 0;
2215 		mcp->mc_rdx = 0;
2216 		mcp->mc_rflags &= ~PSL_C;
2217 	} else {
2218 		mcp->mc_rax = tp->tf_rax;
2219 		mcp->mc_rdx = tp->tf_rdx;
2220 	}
2221 	mcp->mc_rip = tp->tf_rip;
2222 	mcp->mc_cs = tp->tf_cs;
2223 	mcp->mc_rsp = tp->tf_rsp;
2224 	mcp->mc_ss = tp->tf_ss;
2225 	mcp->mc_ds = tp->tf_ds;
2226 	mcp->mc_es = tp->tf_es;
2227 	mcp->mc_fs = tp->tf_fs;
2228 	mcp->mc_gs = tp->tf_gs;
2229 	mcp->mc_flags = tp->tf_flags;
2230 	mcp->mc_len = sizeof(*mcp);
2231 	get_fpcontext(td, mcp, NULL, 0);
2232 	update_pcb_bases(pcb);
2233 	mcp->mc_fsbase = pcb->pcb_fsbase;
2234 	mcp->mc_gsbase = pcb->pcb_gsbase;
2235 	mcp->mc_xfpustate = 0;
2236 	mcp->mc_xfpustate_len = 0;
2237 	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
2238 	return (0);
2239 }
2240 
2241 /*
2242  * Set machine context.
2243  *
 * Note that we only set the user-modifiable flags, and we don't
 * touch the %cs selector.
2246  */
2247 int
2248 set_mcontext(struct thread *td, mcontext_t *mcp)
2249 {
2250 	struct pcb *pcb;
2251 	struct trapframe *tp;
2252 	char *xfpustate;
2253 	long rflags;
2254 	int ret;
2255 
2256 	pcb = td->td_pcb;
2257 	tp = td->td_frame;
2258 	if (mcp->mc_len != sizeof(*mcp) ||
2259 	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
2260 		return (EINVAL);
2261 	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
2262 	    (tp->tf_rflags & ~PSL_USERCHANGE);
2263 	if (mcp->mc_flags & _MC_HASFPXSTATE) {
2264 		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
2265 		    sizeof(struct savefpu))
2266 			return (EINVAL);
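		/*
		 * Editor's note: the length was bounded above, so this
		 * on-stack allocation is of limited, validated size.
		 */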
2267 		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
2268 		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
2269 		    mcp->mc_xfpustate_len);
2270 		if (ret != 0)
2271 			return (ret);
2272 	} else
2273 		xfpustate = NULL;
2274 	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
2275 	if (ret != 0)
2276 		return (ret);
2277 	tp->tf_r15 = mcp->mc_r15;
2278 	tp->tf_r14 = mcp->mc_r14;
2279 	tp->tf_r13 = mcp->mc_r13;
2280 	tp->tf_r12 = mcp->mc_r12;
2281 	tp->tf_r11 = mcp->mc_r11;
2282 	tp->tf_r10 = mcp->mc_r10;
2283 	tp->tf_r9  = mcp->mc_r9;
2284 	tp->tf_r8  = mcp->mc_r8;
2285 	tp->tf_rdi = mcp->mc_rdi;
2286 	tp->tf_rsi = mcp->mc_rsi;
2287 	tp->tf_rbp = mcp->mc_rbp;
2288 	tp->tf_rbx = mcp->mc_rbx;
2289 	tp->tf_rdx = mcp->mc_rdx;
2290 	tp->tf_rcx = mcp->mc_rcx;
2291 	tp->tf_rax = mcp->mc_rax;
2292 	tp->tf_rip = mcp->mc_rip;
2293 	tp->tf_rflags = rflags;
2294 	tp->tf_rsp = mcp->mc_rsp;
2295 	tp->tf_ss = mcp->mc_ss;
2296 	tp->tf_flags = mcp->mc_flags;
2297 	if (tp->tf_flags & TF_HASSEGS) {
2298 		tp->tf_ds = mcp->mc_ds;
2299 		tp->tf_es = mcp->mc_es;
2300 		tp->tf_fs = mcp->mc_fs;
2301 		tp->tf_gs = mcp->mc_gs;
2302 	}
2303 	set_pcb_flags(pcb, PCB_FULL_IRET);
2304 	if (mcp->mc_flags & _MC_HASBASES) {
2305 		pcb->pcb_fsbase = mcp->mc_fsbase;
2306 		pcb->pcb_gsbase = mcp->mc_gsbase;
2307 	}
2308 	return (0);
2309 }
2310 
2311 static void
2312 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
2313     size_t xfpusave_len)
2314 {
2315 	size_t max_len, len;
2316 
2317 	mcp->mc_ownedfp = fpugetregs(td);
2318 	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
2319 	    sizeof(mcp->mc_fpstate));
2320 	mcp->mc_fpformat = fpuformat();
2321 	if (!use_xsave || xfpusave_len == 0)
2322 		return;
2323 	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
2324 	len = xfpusave_len;
	if (len > max_len) {
		/* Zero the tail we cannot fill, then truncate the length. */
		bzero(xfpusave + max_len, len - max_len);
		len = max_len;
	}
2329 	mcp->mc_flags |= _MC_HASFPXSTATE;
2330 	mcp->mc_xfpustate_len = len;
2331 	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
2332 }
2333 
2334 static int
2335 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
2336     size_t xfpustate_len)
2337 {
2338 	int error;
2339 
2340 	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
2341 		return (0);
2342 	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
2343 		return (EINVAL);
2344 	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
2345 		/* We don't care what state is left in the FPU or PCB. */
2346 		fpstate_drop(td);
2347 		error = 0;
2348 	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
2349 	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
2350 		error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
2351 		    xfpustate, xfpustate_len);
2352 	} else
2353 		return (EINVAL);
2354 	return (error);
2355 }
2356 
2357 void
2358 fpstate_drop(struct thread *td)
2359 {
2360 
2361 	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
2362 	critical_enter();
2363 	if (PCPU_GET(fpcurthread) == td)
2364 		fpudrop();
2365 	/*
2366 	 * XXX force a full drop of the fpu.  The above only drops it if we
2367 	 * owned it.
2368 	 *
2369 	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
2370 	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
2371 	 * We only need to drop to !PCB_INITDONE in sendsig().  But
2372 	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
2373 	 * have too many layers.
2374 	 */
2375 	clear_pcb_flags(curthread->td_pcb,
2376 	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
2377 	critical_exit();
2378 }
2379 
2380 int
2381 fill_dbregs(struct thread *td, struct dbreg *dbregs)
2382 {
2383 	struct pcb *pcb;
2384 
2385 	if (td == NULL) {
2386 		dbregs->dr[0] = rdr0();
2387 		dbregs->dr[1] = rdr1();
2388 		dbregs->dr[2] = rdr2();
2389 		dbregs->dr[3] = rdr3();
2390 		dbregs->dr[6] = rdr6();
2391 		dbregs->dr[7] = rdr7();
2392 	} else {
2393 		pcb = td->td_pcb;
2394 		dbregs->dr[0] = pcb->pcb_dr0;
2395 		dbregs->dr[1] = pcb->pcb_dr1;
2396 		dbregs->dr[2] = pcb->pcb_dr2;
2397 		dbregs->dr[3] = pcb->pcb_dr3;
2398 		dbregs->dr[6] = pcb->pcb_dr6;
2399 		dbregs->dr[7] = pcb->pcb_dr7;
2400 	}
2401 	dbregs->dr[4] = 0;
2402 	dbregs->dr[5] = 0;
2403 	dbregs->dr[8] = 0;
2404 	dbregs->dr[9] = 0;
2405 	dbregs->dr[10] = 0;
2406 	dbregs->dr[11] = 0;
2407 	dbregs->dr[12] = 0;
2408 	dbregs->dr[13] = 0;
2409 	dbregs->dr[14] = 0;
2410 	dbregs->dr[15] = 0;
2411 	return (0);
2412 }
2413 
2414 int
2415 set_dbregs(struct thread *td, struct dbreg *dbregs)
2416 {
2417 	struct pcb *pcb;
2418 	int i;
2419 
2420 	if (td == NULL) {
2421 		load_dr0(dbregs->dr[0]);
2422 		load_dr1(dbregs->dr[1]);
2423 		load_dr2(dbregs->dr[2]);
2424 		load_dr3(dbregs->dr[3]);
2425 		load_dr6(dbregs->dr[6]);
2426 		load_dr7(dbregs->dr[7]);
2427 	} else {
		/*
		 * Don't let an illegal value for %dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an unexpected
		 * TRCTRAP or a general protection fault right here.
		 * The upper 32 bits of %dr6 and %dr7 must not be set.
		 */
2435 		for (i = 0; i < 4; i++) {
2436 			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
2437 				return (EINVAL);
2438 			if (td->td_frame->tf_cs == _ucode32sel &&
2439 			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
2440 				return (EINVAL);
2441 		}
2442 		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
2443 		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
2444 			return (EINVAL);
2445 
2446 		pcb = td->td_pcb;
2447 
2448 		/*
2449 		 * Don't let a process set a breakpoint that is not within the
2450 		 * process's address space.  If a process could do this, it
2451 		 * could halt the system by setting a breakpoint in the kernel
2452 		 * (if ddb was enabled).  Thus, we need to check to make sure
2453 		 * that no breakpoints are being enabled for addresses outside
		 * the process's address space.
2455 		 *
2456 		 * XXX - what about when the watched area of the user's
2457 		 * address space is written into from within the kernel
2458 		 * ... wouldn't that still cause a breakpoint to be generated
2459 		 * from within kernel mode?
2460 		 */
2461 
2462 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
2463 			/* dr0 is enabled */
2464 			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
2465 				return (EINVAL);
2466 		}
2467 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
2468 			/* dr1 is enabled */
2469 			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
2470 				return (EINVAL);
2471 		}
2472 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
2473 			/* dr2 is enabled */
2474 			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
2475 				return (EINVAL);
2476 		}
2477 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
2478 			/* dr3 is enabled */
2479 			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
2480 				return (EINVAL);
2481 		}
2482 
2483 		pcb->pcb_dr0 = dbregs->dr[0];
2484 		pcb->pcb_dr1 = dbregs->dr[1];
2485 		pcb->pcb_dr2 = dbregs->dr[2];
2486 		pcb->pcb_dr3 = dbregs->dr[3];
2487 		pcb->pcb_dr6 = dbregs->dr[6];
2488 		pcb->pcb_dr7 = dbregs->dr[7];
2489 
2490 		set_pcb_flags(pcb, PCB_DBREGS);
2491 	}
2492 
2493 	return (0);
2494 }
2495 
2496 void
2497 reset_dbregs(void)
2498 {
2499 
2500 	load_dr7(0);	/* Turn off the control bits first */
2501 	load_dr0(0);
2502 	load_dr1(0);
2503 	load_dr2(0);
2504 	load_dr3(0);
2505 	load_dr6(0);
2506 }
2507 
2508 /*
2509  * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0 otherwise.
2511  */
2512 int
2513 user_dbreg_trap(register_t dr6)
2514 {
	u_int64_t dr7;
	u_int64_t bp;	/* breakpoint bits extracted from dr6 */
	int nbp;	/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	bp = dr6 & DBREG_DR6_BMASK;
	if (bp == 0) {
		/*
		 * None of the breakpoint bits are set, meaning this
		 * trap was not caused by any of the debug registers.
		 */
		return (0);
	}

	dr7 = rdr7();
	if ((dr7 & 0x000000ff) == 0) {
		/*
		 * None of the breakpoint-enable bits (L0-L3, G0-G3) in
		 * the dr7 register are set, thus the trap couldn't have
		 * been caused by the hardware debug registers.
		 */
		return (0);
	}

	nbp = 0;

	/*
	 * At least one of the breakpoints was hit; check to see
	 * which ones and whether any of them are user space addresses.
	 */

	if (bp & 0x01) {
		addr[nbp++] = (caddr_t)rdr0();
	}
	if (bp & 0x02) {
		addr[nbp++] = (caddr_t)rdr1();
	}
	if (bp & 0x04) {
		addr[nbp++] = (caddr_t)rdr2();
	}
	if (bp & 0x08) {
		addr[nbp++] = (caddr_t)rdr3();
	}

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
			/*
			 * addr[i] is in user space.
			 */
			return (nbp);
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return (0);
2573 }
2574 
/*
 * pcb_flags is only modified by the current thread, or by other threads
 * when the current thread is stopped.  However, the current thread may
 * change it from the interrupt context in cpu_switch(), or in the trap
 * handler.  When we read-modify-write pcb_flags from C sources, the
 * compiler may generate code that is not atomic with respect to the
 * interrupt handler.  If a trap or interrupt happens and any flag is
 * modified from the handler, it can be clobbered with the cached value
 * later.  Therefore, we implement setting and clearing flags with
 * single-instruction functions, which do not race with possible
 * modification of the flags from the trap or interrupt context, because
 * traps and interrupts are executed only on instruction boundaries.
 */
2587 void
2588 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
2589 {
2590 
2591 	__asm __volatile("orl %1,%0"
2592 	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
2593 	    : "cc", "memory");
}
2596 
2597 /*
 * Support for the RDFSBASE, WRFSBASE and similar instructions for the
 * %fs and %gs bases requires that the kernel save MSR_FSBASE and
 * MSR_{K,}GSBASE into the pcb if user space modified the bases.  We must
 * save on a context switch or if the return to usermode happens through
 * doreti.
 *
 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 * which has the consequence that the base MSRs must be saved each time
2605  * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
2606  * context switches.
2607  */
2608 void
2609 set_pcb_flags(struct pcb *pcb, const u_int flags)
2610 {
2611 	register_t r;
2612 
2613 	if (curpcb == pcb &&
2614 	    (flags & PCB_FULL_IRET) != 0 &&
2615 	    (pcb->pcb_flags & PCB_FULL_IRET) == 0 &&
2616 	    (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) {
2617 		r = intr_disable();
2618 		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
2619 			if (rfs() == _ufssel)
2620 				pcb->pcb_fsbase = rdfsbase();
2621 			if (rgs() == _ugssel)
2622 				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
2623 		}
2624 		set_pcb_flags_raw(pcb, flags);
2625 		intr_restore(r);
2626 	} else {
2627 		set_pcb_flags_raw(pcb, flags);
2628 	}
2629 }
2630 
2631 void
2632 clear_pcb_flags(struct pcb *pcb, const u_int flags)
2633 {
2634 
2635 	__asm __volatile("andl %1,%0"
2636 	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
2637 	    : "cc", "memory");
2638 }
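
/*
 * Editor's note: for contrast, a plain C read-modify-write such as
 *
 *	pcb->pcb_flags |= flags;
 *
 * may compile into separate load, modify and store instructions; an
 * interrupt taken between the load and the store could update the flags
 * and then have its change overwritten by the stale store.  The single
 * memory-operand "orl"/"andl" instructions above cannot be split by an
 * interrupt, since interrupts are taken only on instruction boundaries.
 */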
2639 
2640 #ifdef KDB
2641 
2642 /*
 * Provide inb() and outb() as functions.  They are normally only available
 * as inline functions, and thus cannot be called from the debugger.
2645  */
2646 
2647 /* silence compiler warnings */
2648 u_char inb_(u_short);
2649 void outb_(u_short, u_char);
2650 
2651 u_char
2652 inb_(u_short port)
2653 {
2654 	return inb(port);
2655 }
2656 
2657 void
2658 outb_(u_short port, u_char data)
2659 {
2660 	outb(port, data);
2661 }
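
/*
 * Illustrative DDB usage (editor's sketch): the wrappers can be invoked
 * from the debugger prompt with the "call" command, e.g.
 *
 *	db> call inb_(0x61)
 */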
2662 
2663 #endif /* KDB */
2664