xref: /freebsd/sys/amd64/amd64/machdep.c (revision 0957b409)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 2003 Peter Wemm.
5  * Copyright (c) 1992 Terrence R. Lambert.
6  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
7  * All rights reserved.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * William Jolitz.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. All advertising materials mentioning features or use of this software
21  *    must display the following acknowledgement:
22  *	This product includes software developed by the University of
23  *	California, Berkeley and its contributors.
24  * 4. Neither the name of the University nor the names of its contributors
25  *    may be used to endorse or promote products derived from this software
26  *    without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  *
40  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
41  */
42 
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD$");
45 
46 #include "opt_atpic.h"
47 #include "opt_cpu.h"
48 #include "opt_ddb.h"
49 #include "opt_inet.h"
50 #include "opt_isa.h"
51 #include "opt_kstack_pages.h"
52 #include "opt_maxmem.h"
53 #include "opt_mp_watchdog.h"
54 #include "opt_pci.h"
55 #include "opt_platform.h"
56 #include "opt_sched.h"
57 
58 #include <sys/param.h>
59 #include <sys/proc.h>
60 #include <sys/systm.h>
61 #include <sys/bio.h>
62 #include <sys/buf.h>
63 #include <sys/bus.h>
64 #include <sys/callout.h>
65 #include <sys/cons.h>
66 #include <sys/cpu.h>
67 #include <sys/efi.h>
68 #include <sys/eventhandler.h>
69 #include <sys/exec.h>
70 #include <sys/imgact.h>
71 #include <sys/kdb.h>
72 #include <sys/kernel.h>
73 #include <sys/ktr.h>
74 #include <sys/linker.h>
75 #include <sys/lock.h>
76 #include <sys/malloc.h>
77 #include <sys/memrange.h>
78 #include <sys/msgbuf.h>
79 #include <sys/mutex.h>
80 #include <sys/pcpu.h>
81 #include <sys/ptrace.h>
82 #include <sys/reboot.h>
83 #include <sys/rwlock.h>
84 #include <sys/sched.h>
85 #include <sys/signalvar.h>
86 #ifdef SMP
87 #include <sys/smp.h>
88 #endif
89 #include <sys/syscallsubr.h>
90 #include <sys/sysctl.h>
91 #include <sys/sysent.h>
92 #include <sys/sysproto.h>
93 #include <sys/ucontext.h>
94 #include <sys/vmmeter.h>
95 
96 #include <vm/vm.h>
97 #include <vm/vm_extern.h>
98 #include <vm/vm_kern.h>
99 #include <vm/vm_page.h>
100 #include <vm/vm_map.h>
101 #include <vm/vm_object.h>
102 #include <vm/vm_pager.h>
103 #include <vm/vm_param.h>
104 #include <vm/vm_phys.h>
105 
106 #ifdef DDB
107 #ifndef KDB
108 #error KDB must be enabled in order for DDB to work!
109 #endif
110 #include <ddb/ddb.h>
111 #include <ddb/db_sym.h>
112 #endif
113 
114 #include <net/netisr.h>
115 
116 #include <machine/clock.h>
117 #include <machine/cpu.h>
118 #include <machine/cputypes.h>
119 #include <machine/frame.h>
120 #include <machine/intr_machdep.h>
121 #include <x86/mca.h>
122 #include <machine/md_var.h>
123 #include <machine/metadata.h>
124 #include <machine/mp_watchdog.h>
125 #include <machine/pc/bios.h>
126 #include <machine/pcb.h>
127 #include <machine/proc.h>
128 #include <machine/reg.h>
129 #include <machine/sigframe.h>
130 #include <machine/specialreg.h>
131 #include <machine/trap.h>
132 #include <machine/tss.h>
133 #include <x86/ucode.h>
134 #include <x86/ifunc.h>
135 #ifdef SMP
136 #include <machine/smp.h>
137 #endif
138 #ifdef FDT
139 #include <x86/fdt.h>
140 #endif
141 
142 #ifdef DEV_ATPIC
143 #include <x86/isa/icu.h>
144 #else
145 #include <x86/apicvar.h>
146 #endif
147 
148 #include <isa/isareg.h>
149 #include <isa/rtc.h>
150 #include <x86/init.h>
151 
152 /* Sanity check for __curthread() */
153 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
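/*
 * A hedged sketch of why the assertion matters: with pc_curthread at
 * offset 0, __curthread() (defined in machine/pcpu.h) can fetch the
 * current thread with a single %gs-relative load.  Roughly:
 */
#if 0	/* illustrative sketch, not part of the build */
static __inline struct thread *
__curthread(void)
{
	struct thread *td;

	__asm("movq %%gs:0, %0" : "=r" (td));
	return (td);
}
#endif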
154 
155 /*
156  * The PTI trampoline stack needs enough space for a hardware trapframe and a
157  * couple of scratch registers, as well as the trapframe left behind after an
158  * iret fault.
159  */
160 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
161     offsetof(struct pti_frame, pti_rip));
162 
163 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
164 
165 #define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
166 #define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
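/*
 * EFL_SECURE() accepts a new %rflags value only when every bit that
 * differs from the old value lies in PSL_USERCHANGE: toggling, e.g.,
 * PSL_C passes, while toggling IOPL or PSL_I makes the XOR intersect
 * ~PSL_USERCHANGE and fails.  CS_SECURE() simply requires the
 * selector's privilege level to be SEL_UPL (ring 3).
 */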
167 
168 static void cpu_startup(void *);
169 static void get_fpcontext(struct thread *td, mcontext_t *mcp,
170     char *xfpusave, size_t xfpusave_len);
171 static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
172     char *xfpustate, size_t xfpustate_len);
173 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
174 
175 /* Preload data parse function */
176 static caddr_t native_parse_preload_data(u_int64_t);
177 
178 /* Native function to fetch and parse the e820 map */
179 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
180 
181 /* Default init_ops implementation. */
182 struct init_ops init_ops = {
183 	.parse_preload_data =	native_parse_preload_data,
184 	.early_clock_source_init =	i8254_init,
185 	.early_delay =			i8254_delay,
186 	.parse_memmap =			native_parse_memmap,
187 #ifdef SMP
188 	.mp_bootaddress =		mp_bootaddress,
189 	.start_all_aps =		native_start_all_aps,
190 #endif
191 #ifdef DEV_PCI
192 	.msi_init =			msi_init,
193 #endif
194 };
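/*
 * Alternative platforms may override these hooks before hammer_time()
 * consumes them.  A hedged sketch of what a paravirtualized port could
 * install (all pv_* names below are hypothetical):
 */
#if 0	/* illustrative sketch, not part of the build */
struct init_ops pv_init_ops = {
	.parse_preload_data =		pv_parse_preload_data,
	.early_clock_source_init =	pv_clock_init,
	.early_delay =			pv_delay,
	.parse_memmap =			pv_parse_memmap,
};
#endif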
195 
196 /*
197  * Physical address of the EFI System Table. Stashed from the metadata hints
198  * passed into the kernel and used by the EFI code to call runtime services.
199  */
200 vm_paddr_t efi_systbl_phys;
201 
202 /* Intel ICH registers */
203 #define ICH_PMBASE	0x400
204 #define ICH_SMI_EN	(ICH_PMBASE + 0x30)
205 
206 int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
207 
208 int cold = 1;
209 
210 long Maxmem = 0;
211 long realmem = 0;
212 
213 /*
214  * The number of PHYSMAP entries must be one less than the number of
215  * PHYSSEG entries because the PHYSMAP entry that spans the largest
216  * physical address that is accessible by ISA DMA is split into two
217  * PHYSSEG entries.
218  */
219 #define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
220 
221 vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
222 vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
223 
224 /* Must be 2 less so a 0/0 pair can signal the end of chunks. */
225 #define	PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
226 #define	DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)
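/*
 * Both arrays hold base/end pairs terminated by a 0/0 pair, so a
 * consumer walks them as in this sketch (mirroring the bootverbose
 * loop in cpu_startup() below):
 */
#if 0	/* illustrative sketch, not part of the build */
	for (i = 0; phys_avail[i + 1] != 0; i += 2)
		printf("chunk %#jx - %#jx\n", (uintmax_t)phys_avail[i],
		    (uintmax_t)phys_avail[i + 1] - 1);
#endif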
227 
228 struct kva_md_info kmi;
229 
230 static struct trapframe proc0_tf;
231 struct region_descriptor r_gdt, r_idt;
232 
233 struct pcpu __pcpu[MAXCPU];
234 
235 struct mtx icu_lock;
236 
237 struct mem_range_softc mem_range_softc;
238 
239 struct mtx dt_lock;	/* lock for GDT and LDT */
240 
241 void (*vmm_resume_p)(void);
242 
243 static void
244 cpu_startup(void *dummy)
245 {
247 	uintmax_t memsize;
248 	char *sysenv;
249 
250 	/*
251 	 * On MacBooks, we must prevent the legacy USB circuit from
252 	 * generating an SMI#, because SMIs there cause several problems,
253 	 * namely: incorrect CPU frequency detection and failure to
254 	 * start the APs.
255 	 * We do this by clearing a bit in the SMI_EN (SMI Control and
256 	 * Enable) register of the Intel ICH LPC Interface Bridge.
257 	 */
258 	sysenv = kern_getenv("smbios.system.product");
259 	if (sysenv != NULL) {
260 		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
261 		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
262 		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
263 		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
264 		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
265 		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
266 		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
267 		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
268 			if (bootverbose)
269 				printf("Disabling LEGACY_USB_EN bit on "
270 				    "Intel ICH.\n");
271 			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
272 		}
273 		freeenv(sysenv);
274 	}
275 
276 	/*
277 	 * Good {morning,afternoon,evening,night}.
278 	 */
279 	startrtclock();
280 	printcpuinfo();
281 
282 	/*
283 	 * Display physical memory if SMBIOS reports a reasonable amount.
284 	 */
285 	memsize = 0;
286 	sysenv = kern_getenv("smbios.memory.enabled");
287 	if (sysenv != NULL) {
288 		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
289 		freeenv(sysenv);
290 	}
291 	if (memsize < ptoa((uintmax_t)vm_free_count()))
292 		memsize = ptoa((uintmax_t)Maxmem);
293 	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
294 	realmem = atop(memsize);
295 
296 	/*
297 	 * Display any holes after the first chunk of extended memory.
298 	 */
299 	if (bootverbose) {
300 		int indx;
301 
302 		printf("Physical memory chunk(s):\n");
303 		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
304 			vm_paddr_t size;
305 
306 			size = phys_avail[indx + 1] - phys_avail[indx];
307 			printf(
308 			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
309 			    (uintmax_t)phys_avail[indx],
310 			    (uintmax_t)phys_avail[indx + 1] - 1,
311 			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
312 		}
313 	}
314 
315 	vm_ksubmap_init(&kmi);
316 
317 	printf("avail memory = %ju (%ju MB)\n",
318 	    ptoa((uintmax_t)vm_free_count()),
319 	    ptoa((uintmax_t)vm_free_count()) / 1048576);
320 #ifdef DEV_PCI
321 	if (bootverbose && intel_graphics_stolen_base != 0)
322 		printf("intel stolen mem: base %#jx size %ju MB\n",
323 		    (uintmax_t)intel_graphics_stolen_base,
324 		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
325 #endif
326 
327 	/*
328 	 * Set up buffers, so they can be used to read disk labels.
329 	 */
330 	bufinit();
331 	vm_pager_bufferinit();
332 
333 	cpu_setregs();
334 }
335 
336 /*
337  * Send an interrupt to a process.
338  *
339  * The stack is set up so that the sigcode stored at its top calls the
340  * handler routine, followed by a call to the sigreturn routine below.
341  * After sigreturn resets the signal mask, the stack, and the frame
342  * pointer, it returns to the user-specified pc and psl.
345  */
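/*
 * Resulting user stack layout, highest addresses first (a sketch; the
 * xfpustate area exists only when XSAVE extended state is in use):
 *
 *	| interrupted frame's %rsp    |
 *	| 128-byte red zone           |
 *	| xfpustate (64-byte aligned) |  <- mc_xfpustate
 *	| struct sigframe             |  <- sfp, handler's %rsp (16-byte aligned)
 */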
346 void
347 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
348 {
349 	struct sigframe sf, *sfp;
350 	struct pcb *pcb;
351 	struct proc *p;
352 	struct thread *td;
353 	struct sigacts *psp;
354 	char *sp;
355 	struct trapframe *regs;
356 	char *xfpusave;
357 	size_t xfpusave_len;
358 	int sig;
359 	int oonstack;
360 
361 	td = curthread;
362 	pcb = td->td_pcb;
363 	p = td->td_proc;
364 	PROC_LOCK_ASSERT(p, MA_OWNED);
365 	sig = ksi->ksi_signo;
366 	psp = p->p_sigacts;
367 	mtx_assert(&psp->ps_mtx, MA_OWNED);
368 	regs = td->td_frame;
369 	oonstack = sigonstack(regs->tf_rsp);
370 
371 	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
372 		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
373 		xfpusave = __builtin_alloca(xfpusave_len);
374 	} else {
375 		xfpusave_len = 0;
376 		xfpusave = NULL;
377 	}
378 
379 	/* Save user context. */
380 	bzero(&sf, sizeof(sf));
381 	sf.sf_uc.uc_sigmask = *mask;
382 	sf.sf_uc.uc_stack = td->td_sigstk;
383 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
384 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
385 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
386 	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
387 	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
388 	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
389 	fpstate_drop(td);
390 	update_pcb_bases(pcb);
391 	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
392 	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
393 	bzero(sf.sf_uc.uc_mcontext.mc_spare,
394 	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
395 
396 	/* Allocate space for the signal handler context. */
397 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
398 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
399 		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
400 #if defined(COMPAT_43)
401 		td->td_sigstk.ss_flags |= SS_ONSTACK;
402 #endif
403 	} else
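		/* Skip the 128-byte amd64 ABI red zone below the interrupted %rsp. */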
404 		sp = (char *)regs->tf_rsp - 128;
405 	if (xfpusave != NULL) {
406 		sp -= xfpusave_len;
407 		sp = (char *)((unsigned long)sp & ~0x3Ful);
408 		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
409 	}
410 	sp -= sizeof(struct sigframe);
411 	/* Align to 16 bytes. */
412 	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
413 
414 	/* Build the argument list for the signal handler. */
415 	regs->tf_rdi = sig;			/* arg 1 in %rdi */
416 	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
417 	bzero(&sf.sf_si, sizeof(sf.sf_si));
418 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
419 		/* Signal handler installed with SA_SIGINFO. */
420 		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
421 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
422 
423 		/* Fill in POSIX parts */
424 		sf.sf_si = ksi->ksi_info;
425 		sf.sf_si.si_signo = sig; /* maybe a translated signal */
426 		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
427 	} else {
428 		/* Old FreeBSD-style arguments. */
429 		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
430 		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
431 		sf.sf_ahu.sf_handler = catcher;
432 	}
433 	mtx_unlock(&psp->ps_mtx);
434 	PROC_UNLOCK(p);
435 
436 	/*
437 	 * Copy the sigframe out to the user's stack.
438 	 */
439 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
440 	    (xfpusave != NULL && copyout(xfpusave,
441 	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
442 	    != 0)) {
443 #ifdef DEBUG
444 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
445 #endif
446 		PROC_LOCK(p);
447 		sigexit(td, SIGILL);
448 	}
449 
450 	regs->tf_rsp = (long)sfp;
451 	regs->tf_rip = p->p_sysent->sv_sigcode_base;
452 	regs->tf_rflags &= ~(PSL_T | PSL_D);
453 	regs->tf_cs = _ucodesel;
454 	regs->tf_ds = _udatasel;
455 	regs->tf_ss = _udatasel;
456 	regs->tf_es = _udatasel;
457 	regs->tf_fs = _ufssel;
458 	regs->tf_gs = _ugssel;
459 	regs->tf_flags = TF_HASSEGS;
460 	PROC_LOCK(p);
461 	mtx_lock(&psp->ps_mtx);
462 }
463 
464 /*
465  * System call to clean up state after a signal
466  * has been taken.  Reset signal mask and
467  * stack state from context left by sendsig (above).
468  * Return to previous pc and psl as specified by
469  * context left by sendsig. Check carefully to
470  * make sure that the user has not modified the
471  * state to gain improper privileges.
472  *
473  * MPSAFE
474  */
475 int
476 sys_sigreturn(struct thread *td,
477     struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap)
478 {
482 	ucontext_t uc;
483 	struct pcb *pcb;
484 	struct proc *p;
485 	struct trapframe *regs;
486 	ucontext_t *ucp;
487 	char *xfpustate;
488 	size_t xfpustate_len;
489 	long rflags;
490 	int cs, error, ret;
491 	ksiginfo_t ksi;
492 
493 	pcb = td->td_pcb;
494 	p = td->td_proc;
495 
496 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
497 	if (error != 0) {
498 		uprintf("pid %d (%s): sigreturn copyin failed\n",
499 		    p->p_pid, td->td_name);
500 		return (error);
501 	}
502 	ucp = &uc;
503 	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
504 		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
505 		    td->td_name, ucp->uc_mcontext.mc_flags);
506 		return (EINVAL);
507 	}
508 	regs = td->td_frame;
509 	rflags = ucp->uc_mcontext.mc_rflags;
510 	/*
511 	 * Don't allow users to change privileged or reserved flags.
512 	 */
513 	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
514 		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
515 		    td->td_name, rflags);
516 		return (EINVAL);
517 	}
518 
519 	/*
520 	 * Don't allow users to load a valid privileged %cs.  Let the
521 	 * hardware check for invalid selectors, excess privilege in
522 	 * other selectors, and invalid %rip and %rsp values.
523 	 */
524 	cs = ucp->uc_mcontext.mc_cs;
525 	if (!CS_SECURE(cs)) {
526 		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
527 		    td->td_name, cs);
528 		ksiginfo_init_trap(&ksi);
529 		ksi.ksi_signo = SIGBUS;
530 		ksi.ksi_code = BUS_OBJERR;
531 		ksi.ksi_trapno = T_PROTFLT;
532 		ksi.ksi_addr = (void *)regs->tf_rip;
533 		trapsignal(td, &ksi);
534 		return (EINVAL);
535 	}
536 
537 	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
538 		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
539 		if (xfpustate_len > cpu_max_ext_state_size -
540 		    sizeof(struct savefpu)) {
541 			uprintf("pid %d (%s): sigreturn xfpustate_len = 0x%zx\n",
542 			    p->p_pid, td->td_name, xfpustate_len);
543 			return (EINVAL);
544 		}
545 		xfpustate = __builtin_alloca(xfpustate_len);
546 		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
547 		    xfpustate, xfpustate_len);
548 		if (error != 0) {
549 			uprintf(
550 	"pid %d (%s): sigreturn copying xfpustate failed\n",
551 			    p->p_pid, td->td_name);
552 			return (error);
553 		}
554 	} else {
555 		xfpustate = NULL;
556 		xfpustate_len = 0;
557 	}
558 	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
559 	if (ret != 0) {
560 		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
561 		    p->p_pid, td->td_name, ret);
562 		return (ret);
563 	}
564 	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
565 	update_pcb_bases(pcb);
566 	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
567 	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
568 
569 #if defined(COMPAT_43)
570 	if (ucp->uc_mcontext.mc_onstack & 1)
571 		td->td_sigstk.ss_flags |= SS_ONSTACK;
572 	else
573 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
574 #endif
575 
576 	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
577 	return (EJUSTRETURN);
578 }
579 
580 #ifdef COMPAT_FREEBSD4
581 int
582 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
583 {
584 
585 	return (sys_sigreturn(td, (struct sigreturn_args *)uap));
586 }
587 #endif
588 
589 /*
590  * Reset registers to default values on exec.
591  */
592 void
593 exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
594 {
595 	struct trapframe *regs;
596 	struct pcb *pcb;
597 	register_t saved_rflags;
598 
599 	regs = td->td_frame;
600 	pcb = td->td_pcb;
601 
602 	if (td->td_proc->p_md.md_ldt != NULL)
603 		user_ldt_free(td);
604 
605 	update_pcb_bases(pcb);
606 	pcb->pcb_fsbase = 0;
607 	pcb->pcb_gsbase = 0;
608 	clear_pcb_flags(pcb, PCB_32BIT);
609 	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
610 
611 	saved_rflags = regs->tf_rflags & PSL_T;
612 	bzero((char *)regs, sizeof(struct trapframe));
613 	regs->tf_rip = imgp->entry_addr;
614 	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
615 	regs->tf_rdi = stack;		/* argv */
616 	regs->tf_rflags = PSL_USER | saved_rflags;
617 	regs->tf_ss = _udatasel;
618 	regs->tf_cs = _ucodesel;
619 	regs->tf_ds = _udatasel;
620 	regs->tf_es = _udatasel;
621 	regs->tf_fs = _ufssel;
622 	regs->tf_gs = _ugssel;
623 	regs->tf_flags = TF_HASSEGS;
624 
625 	/*
626 	 * Reset the hardware debug registers if they were in use.
627 	 * They won't have any meaning for the newly exec'd process.
628 	 */
629 	if (pcb->pcb_flags & PCB_DBREGS) {
630 		pcb->pcb_dr0 = 0;
631 		pcb->pcb_dr1 = 0;
632 		pcb->pcb_dr2 = 0;
633 		pcb->pcb_dr3 = 0;
634 		pcb->pcb_dr6 = 0;
635 		pcb->pcb_dr7 = 0;
636 		if (pcb == curpcb) {
637 			/*
638 			 * Clear the debug registers on the running
639 			 * CPU, otherwise they will end up affecting
640 			 * the next process we switch to.
641 			 */
642 			reset_dbregs();
643 		}
644 		clear_pcb_flags(pcb, PCB_DBREGS);
645 	}
646 
647 	/*
648 	 * Drop the FP state if we hold it, so that the process gets a
649 	 * clean FP state if it uses the FPU again.
650 	 */
651 	fpstate_drop(td);
652 }
653 
654 void
655 cpu_setregs(void)
656 {
657 	register_t cr0;
658 
659 	cr0 = rcr0();
660 	/*
661 	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
662 	 * BSP.  See the comments there about why we set them.
663 	 */
664 	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
665 	load_cr0(cr0);
666 }
667 
668 /*
669  * Initialize amd64 and configure to run kernel
670  */
671 
672 /*
673  * Initialize segments & interrupt table
674  */
675 
676 struct user_segment_descriptor gdt[NGDT * MAXCPU];	/* global descriptor tables */
677 static struct gate_descriptor idt0[NIDT];
678 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
679 
680 static char dblfault_stack[PAGE_SIZE] __aligned(16);
681 static char mce0_stack[PAGE_SIZE] __aligned(16);
682 static char nmi0_stack[PAGE_SIZE] __aligned(16);
683 static char dbg0_stack[PAGE_SIZE] __aligned(16);
684 CTASSERT(sizeof(struct nmi_pcpu) == 16);
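/*
 * A struct nmi_pcpu sits at the top of the NMI, MC# and DB# IST stacks
 * so their handlers can recover the pcpu pointer; keeping it exactly
 * 16 bytes preserves the stack's 16-byte alignment below it.
 */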
685 
686 struct amd64tss common_tss[MAXCPU];
687 
688 /*
689  * Software prototypes -- in more palatable form.
690  *
691  * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
692  * slots as the corresponding segments in the i386 kernel.
693  */
694 struct soft_segment_descriptor gdt_segs[] = {
695 /* GNULL_SEL	0 Null Descriptor */
696 {	.ssd_base = 0x0,
697 	.ssd_limit = 0x0,
698 	.ssd_type = 0,
699 	.ssd_dpl = 0,
700 	.ssd_p = 0,
701 	.ssd_long = 0,
702 	.ssd_def32 = 0,
703 	.ssd_gran = 0		},
704 /* GNULL2_SEL	1 Null Descriptor */
705 {	.ssd_base = 0x0,
706 	.ssd_limit = 0x0,
707 	.ssd_type = 0,
708 	.ssd_dpl = 0,
709 	.ssd_p = 0,
710 	.ssd_long = 0,
711 	.ssd_def32 = 0,
712 	.ssd_gran = 0		},
713 /* GUFS32_SEL	2 32 bit %fs Descriptor for user */
714 {	.ssd_base = 0x0,
715 	.ssd_limit = 0xfffff,
716 	.ssd_type = SDT_MEMRWA,
717 	.ssd_dpl = SEL_UPL,
718 	.ssd_p = 1,
719 	.ssd_long = 0,
720 	.ssd_def32 = 1,
721 	.ssd_gran = 1		},
722 /* GUGS32_SEL	3 32 bit %gs Descriptor for user */
723 {	.ssd_base = 0x0,
724 	.ssd_limit = 0xfffff,
725 	.ssd_type = SDT_MEMRWA,
726 	.ssd_dpl = SEL_UPL,
727 	.ssd_p = 1,
728 	.ssd_long = 0,
729 	.ssd_def32 = 1,
730 	.ssd_gran = 1		},
731 /* GCODE_SEL	4 Code Descriptor for kernel */
732 {	.ssd_base = 0x0,
733 	.ssd_limit = 0xfffff,
734 	.ssd_type = SDT_MEMERA,
735 	.ssd_dpl = SEL_KPL,
736 	.ssd_p = 1,
737 	.ssd_long = 1,
738 	.ssd_def32 = 0,
739 	.ssd_gran = 1		},
740 /* GDATA_SEL	5 Data Descriptor for kernel */
741 {	.ssd_base = 0x0,
742 	.ssd_limit = 0xfffff,
743 	.ssd_type = SDT_MEMRWA,
744 	.ssd_dpl = SEL_KPL,
745 	.ssd_p = 1,
746 	.ssd_long = 1,
747 	.ssd_def32 = 0,
748 	.ssd_gran = 1		},
749 /* GUCODE32_SEL	6 32 bit Code Descriptor for user */
750 {	.ssd_base = 0x0,
751 	.ssd_limit = 0xfffff,
752 	.ssd_type = SDT_MEMERA,
753 	.ssd_dpl = SEL_UPL,
754 	.ssd_p = 1,
755 	.ssd_long = 0,
756 	.ssd_def32 = 1,
757 	.ssd_gran = 1		},
758 /* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
759 {	.ssd_base = 0x0,
760 	.ssd_limit = 0xfffff,
761 	.ssd_type = SDT_MEMRWA,
762 	.ssd_dpl = SEL_UPL,
763 	.ssd_p = 1,
764 	.ssd_long = 0,
765 	.ssd_def32 = 1,
766 	.ssd_gran = 1		},
767 /* GUCODE_SEL	8 64 bit Code Descriptor for user */
768 {	.ssd_base = 0x0,
769 	.ssd_limit = 0xfffff,
770 	.ssd_type = SDT_MEMERA,
771 	.ssd_dpl = SEL_UPL,
772 	.ssd_p = 1,
773 	.ssd_long = 1,
774 	.ssd_def32 = 0,
775 	.ssd_gran = 1		},
776 /* GPROC0_SEL	9 Proc 0 Tss Descriptor */
777 {	.ssd_base = 0x0,
778 	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
779 	.ssd_type = SDT_SYSTSS,
780 	.ssd_dpl = SEL_KPL,
781 	.ssd_p = 1,
782 	.ssd_long = 0,
783 	.ssd_def32 = 0,
784 	.ssd_gran = 0		},
785 /* GPROC0_SEL+1	10 Upper half: the TSS is a double-size system descriptor */
786 {	.ssd_base = 0x0,
787 	.ssd_limit = 0x0,
788 	.ssd_type = 0,
789 	.ssd_dpl = 0,
790 	.ssd_p = 0,
791 	.ssd_long = 0,
792 	.ssd_def32 = 0,
793 	.ssd_gran = 0		},
794 /* GUSERLDT_SEL	11 LDT Descriptor */
795 {	.ssd_base = 0x0,
796 	.ssd_limit = 0x0,
797 	.ssd_type = 0,
798 	.ssd_dpl = 0,
799 	.ssd_p = 0,
800 	.ssd_long = 0,
801 	.ssd_def32 = 0,
802 	.ssd_gran = 0		},
803 /* GUSERLDT_SEL+1	12 Upper half of the double-size LDT Descriptor */
804 {	.ssd_base = 0x0,
805 	.ssd_limit = 0x0,
806 	.ssd_type = 0,
807 	.ssd_dpl = 0,
808 	.ssd_p = 0,
809 	.ssd_long = 0,
810 	.ssd_def32 = 0,
811 	.ssd_gran = 0		},
812 };
813 
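/*
 * Install one IDT entry: idx is the vector, func the handler, typ the
 * gate type (e.g. SDT_SYSIGT), dpl the highest privilege level allowed
 * to raise the vector from software, and ist an Interrupt Stack Table
 * slot (0 means stay on the current stack).
 */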
814 void
815 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
816 {
817 	struct gate_descriptor *ip;
818 
819 	ip = idt + idx;
820 	ip->gd_looffset = (uintptr_t)func;
821 	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
822 	ip->gd_ist = ist;
823 	ip->gd_xx = 0;
824 	ip->gd_type = typ;
825 	ip->gd_dpl = dpl;
826 	ip->gd_p = 1;
827 	ip->gd_hioffset = ((uintptr_t)func) >> 16;
828 }
829 
830 extern inthand_t
831 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
832 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
833 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
834 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
835 	IDTVEC(xmm), IDTVEC(dblfault),
836 	IDTVEC(div_pti), IDTVEC(bpt_pti),
837 	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
838 	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
839 	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
840 	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
841 	IDTVEC(xmm_pti),
842 #ifdef KDTRACE_HOOKS
843 	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
844 #endif
845 #ifdef XENHVM
846 	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
847 #endif
848 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
849 	IDTVEC(fast_syscall_pti);
850 
851 #ifdef DDB
852 /*
853  * Display the index and function name of any IDT entries that don't use
854  * the default 'rsvd' entry point.
855  */
856 DB_SHOW_COMMAND(idt, db_show_idt)
857 {
858 	struct gate_descriptor *ip;
859 	int idx;
860 	uintptr_t func;
861 
862 	ip = idt;
863 	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
864 		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
865 		if (func != (uintptr_t)&IDTVEC(rsvd)) {
866 			db_printf("%3d\t", idx);
867 			db_printsym(func, DB_STGY_PROC);
868 			db_printf("\n");
869 		}
870 		ip++;
871 	}
872 }
873 
874 /* Show privileged registers. */
875 DB_SHOW_COMMAND(sysregs, db_show_sysregs)
876 {
877 	struct {
878 		uint16_t limit;
879 		uint64_t base;
880 	} __packed idtr, gdtr;
881 	uint16_t ldt, tr;
882 
883 	__asm __volatile("sidt %0" : "=m" (idtr));
884 	db_printf("idtr\t0x%016lx/%04x\n",
885 	    (u_long)idtr.base, (u_int)idtr.limit);
886 	__asm __volatile("sgdt %0" : "=m" (gdtr));
887 	db_printf("gdtr\t0x%016lx/%04x\n",
888 	    (u_long)gdtr.base, (u_int)gdtr.limit);
889 	__asm __volatile("sldt %0" : "=r" (ldt));
890 	db_printf("ldtr\t0x%04x\n", ldt);
891 	__asm __volatile("str %0" : "=r" (tr));
892 	db_printf("tr\t0x%04x\n", tr);
893 	db_printf("cr0\t0x%016lx\n", rcr0());
894 	db_printf("cr2\t0x%016lx\n", rcr2());
895 	db_printf("cr3\t0x%016lx\n", rcr3());
896 	db_printf("cr4\t0x%016lx\n", rcr4());
897 	if (rcr4() & CR4_XSAVE)
898 		db_printf("xcr0\t0x%016lx\n", rxcr(0));
899 	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
900 	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
901 		db_printf("FEATURES_CTL\t%016lx\n",
902 		    rdmsr(MSR_IA32_FEATURE_CONTROL));
903 	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
904 	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
905 	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
906 }
907 
908 DB_SHOW_COMMAND(dbregs, db_show_dbregs)
909 {
910 
911 	db_printf("dr0\t0x%016lx\n", rdr0());
912 	db_printf("dr1\t0x%016lx\n", rdr1());
913 	db_printf("dr2\t0x%016lx\n", rdr2());
914 	db_printf("dr3\t0x%016lx\n", rdr3());
915 	db_printf("dr6\t0x%016lx\n", rdr6());
916 	db_printf("dr7\t0x%016lx\n", rdr7());
917 }
918 #endif
919 
920 void
921 sdtossd(struct user_segment_descriptor *sd,
922     struct soft_segment_descriptor *ssd)
923 {
925 
926 	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
927 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
928 	ssd->ssd_type  = sd->sd_type;
929 	ssd->ssd_dpl   = sd->sd_dpl;
930 	ssd->ssd_p     = sd->sd_p;
931 	ssd->ssd_long  = sd->sd_long;
932 	ssd->ssd_def32 = sd->sd_def32;
933 	ssd->ssd_gran  = sd->sd_gran;
934 }
935 
936 void
937 ssdtosd(struct soft_segment_descriptor *ssd,
938     struct user_segment_descriptor *sd)
939 {
941 
942 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
943 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
944 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
945 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
946 	sd->sd_type  = ssd->ssd_type;
947 	sd->sd_dpl   = ssd->ssd_dpl;
948 	sd->sd_p     = ssd->ssd_p;
949 	sd->sd_long  = ssd->ssd_long;
950 	sd->sd_def32 = ssd->ssd_def32;
951 	sd->sd_gran  = ssd->ssd_gran;
952 }
953 
954 void
955 ssdtosyssd(struct soft_segment_descriptor *ssd,
956     struct system_segment_descriptor *sd)
957 {
959 
960 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
961 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
962 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
963 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
964 	sd->sd_type  = ssd->ssd_type;
965 	sd->sd_dpl   = ssd->ssd_dpl;
966 	sd->sd_p     = ssd->ssd_p;
967 	sd->sd_gran  = ssd->ssd_gran;
968 }
969 
970 #if !defined(DEV_ATPIC) && defined(DEV_ISA)
971 #include <isa/isavar.h>
972 #include <isa/isareg.h>
973 /*
974  * Return a bitmap of the current interrupt requests.  This is 8259-specific
975  * and is only suitable for use at probe time.
976  * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
977  * It shouldn't be here.  There should probably be an APIC-centric
978  * implementation in the apic driver code, if at all.
979  */
980 intrmask_t
981 isa_irq_pending(void)
982 {
983 	u_char irr1;
984 	u_char irr2;
985 
986 	irr1 = inb(IO_ICU1);
987 	irr2 = inb(IO_ICU2);
988 	return ((irr2 << 8) | irr1);
989 }
990 #endif
991 
992 u_int basemem;
993 
994 static int
995 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
996     int *physmap_idxp)
997 {
998 	int i, insert_idx, physmap_idx;
999 
1000 	physmap_idx = *physmap_idxp;
1001 
1002 	if (length == 0)
1003 		return (1);
1004 
1005 	/*
1006 	 * Find insertion point while checking for overlap.  Start off by
1007 	 * assuming the new entry will be added to the end.
1008 	 *
1009 	 * NB: physmap_idx points to the next free slot.
1010 	 */
1011 	insert_idx = physmap_idx;
1012 	for (i = 0; i <= physmap_idx; i += 2) {
1013 		if (base < physmap[i + 1]) {
1014 			if (base + length <= physmap[i]) {
1015 				insert_idx = i;
1016 				break;
1017 			}
1018 			if (boothowto & RB_VERBOSE)
1019 				printf(
1020 		    "Overlapping memory regions, ignoring second region\n");
1021 			return (1);
1022 		}
1023 	}
1024 
1025 	/* See if we can prepend to the next entry. */
1026 	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
1027 		physmap[insert_idx] = base;
1028 		return (1);
1029 	}
1030 
1031 	/* See if we can append to the previous entry. */
1032 	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
1033 		physmap[insert_idx - 1] += length;
1034 		return (1);
1035 	}
1036 
1037 	physmap_idx += 2;
1038 	*physmap_idxp = physmap_idx;
1039 	if (physmap_idx == PHYSMAP_SIZE) {
1040 		printf(
1041 		"Too many segments in the physical address map, giving up\n");
1042 		return (0);
1043 	}
1044 
1045 	/*
1046 	 * Move the last 'N' entries down to make room for the new
1047 	 * entry if needed.
1048 	 */
1049 	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
1050 		physmap[i] = physmap[i - 2];
1051 		physmap[i + 1] = physmap[i - 1];
1052 	}
1053 
1054 	/* Insert the new entry. */
1055 	physmap[insert_idx] = base;
1056 	physmap[insert_idx + 1] = base + length;
1057 	return (1);
1058 }
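/*
 * Worked example (addresses invented for illustration): with physmap[]
 * holding { 0x1000, 0x9f000 }, inserting base=0x100000/length=0x100000
 * overlaps nothing and merges with nothing, so { 0x100000, 0x200000 }
 * is appended; a later insertion with base == 0x200000 would then grow
 * that entry via the "append to the previous entry" case above.
 */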
1059 
1060 void
1061 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
1062                       vm_paddr_t *physmap, int *physmap_idx)
1063 {
1064 	struct bios_smap *smap, *smapend;
1065 
1066 	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
1067 
1068 	for (smap = smapbase; smap < smapend; smap++) {
1069 		if (boothowto & RB_VERBOSE)
1070 			printf("SMAP type=%02x base=%016lx len=%016lx\n",
1071 			    smap->type, smap->base, smap->length);
1072 
1073 		if (smap->type != SMAP_TYPE_MEMORY)
1074 			continue;
1075 
1076 		if (!add_physmap_entry(smap->base, smap->length, physmap,
1077 		    physmap_idx))
1078 			break;
1079 	}
1080 }
1081 
1082 static void
1083 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
1084     int *physmap_idx)
1085 {
1086 	struct efi_md *map, *p;
1087 	const char *type;
1088 	size_t efisz;
1089 	int ndesc, i;
1090 
1091 	static const char *types[] = {
1092 		"Reserved",
1093 		"LoaderCode",
1094 		"LoaderData",
1095 		"BootServicesCode",
1096 		"BootServicesData",
1097 		"RuntimeServicesCode",
1098 		"RuntimeServicesData",
1099 		"ConventionalMemory",
1100 		"UnusableMemory",
1101 		"ACPIReclaimMemory",
1102 		"ACPIMemoryNVS",
1103 		"MemoryMappedIO",
1104 		"MemoryMappedIOPortSpace",
1105 		"PalCode",
1106 		"PersistentMemory"
1107 	};
1108 
1109 	/*
1110 	 * Memory map data provided by UEFI via the GetMemoryMap
1111 	 * Boot Services API.
1112 	 */
1113 	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
1114 	map = (struct efi_md *)((uint8_t *)efihdr + efisz);
1115 
1116 	if (efihdr->descriptor_size == 0)
1117 		return;
1118 	ndesc = efihdr->memory_size / efihdr->descriptor_size;
1119 
1120 	if (boothowto & RB_VERBOSE)
1121 		printf("%23s %12s %12s %8s %4s\n",
1122 		    "Type", "Physical", "Virtual", "#Pages", "Attr");
1123 
1124 	for (i = 0, p = map; i < ndesc; i++,
1125 	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
1126 		if (boothowto & RB_VERBOSE) {
1127 			if (p->md_type < nitems(types))
1128 				type = types[p->md_type];
1129 			else
1130 				type = "<INVALID>";
1131 			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
1132 			    p->md_virt, p->md_pages);
1133 			if (p->md_attr & EFI_MD_ATTR_UC)
1134 				printf("UC ");
1135 			if (p->md_attr & EFI_MD_ATTR_WC)
1136 				printf("WC ");
1137 			if (p->md_attr & EFI_MD_ATTR_WT)
1138 				printf("WT ");
1139 			if (p->md_attr & EFI_MD_ATTR_WB)
1140 				printf("WB ");
1141 			if (p->md_attr & EFI_MD_ATTR_UCE)
1142 				printf("UCE ");
1143 			if (p->md_attr & EFI_MD_ATTR_WP)
1144 				printf("WP ");
1145 			if (p->md_attr & EFI_MD_ATTR_RP)
1146 				printf("RP ");
1147 			if (p->md_attr & EFI_MD_ATTR_XP)
1148 				printf("XP ");
1149 			if (p->md_attr & EFI_MD_ATTR_NV)
1150 				printf("NV ");
1151 			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
1152 				printf("MORE_RELIABLE ");
1153 			if (p->md_attr & EFI_MD_ATTR_RO)
1154 				printf("RO ");
1155 			if (p->md_attr & EFI_MD_ATTR_RT)
1156 				printf("RUNTIME");
1157 			printf("\n");
1158 		}
1159 
1160 		switch (p->md_type) {
1161 		case EFI_MD_TYPE_CODE:
1162 		case EFI_MD_TYPE_DATA:
1163 		case EFI_MD_TYPE_BS_CODE:
1164 		case EFI_MD_TYPE_BS_DATA:
1165 		case EFI_MD_TYPE_FREE:
1166 			/*
1167 			 * We're allowed to use any entry with these types.
1168 			 */
1169 			break;
1170 		default:
1171 			continue;
1172 		}
1173 
1174 		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
1175 		    physmap, physmap_idx))
1176 			break;
1177 	}
1178 }
1179 
1180 static char bootmethod[16] = "";
1181 SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
1182     "System firmware boot method");
1183 
1184 static void
1185 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
1186 {
1187 	struct bios_smap *smap;
1188 	struct efi_map_header *efihdr;
1189 	u_int32_t size;
1190 
1191 	/*
1192 	 * Memory map from INT 15:E820.
1193 	 *
1194 	 * subr_module.c says:
1195 	 * "Consumer may safely assume that size value precedes data."
1196 	 * i.e.: a u_int32_t immediately precedes smap.
1197 	 */
1198 
1199 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1200 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1201 	smap = (struct bios_smap *)preload_search_info(kmdp,
1202 	    MODINFO_METADATA | MODINFOMD_SMAP);
1203 	if (efihdr == NULL && smap == NULL)
1204 		panic("No BIOS smap or EFI map info from loader!");
1205 
1206 	if (efihdr != NULL) {
1207 		add_efi_map_entries(efihdr, physmap, physmap_idx);
1208 		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
1209 	} else {
1210 		size = *((u_int32_t *)smap - 1);
1211 		bios_add_smap_entries(smap, size, physmap, physmap_idx);
1212 		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
1213 	}
1214 }
1215 
1216 #define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)
1217 
1218 /*
1219  * Populate the (physmap) array with base/bound pairs describing the
1220  * available physical memory in the system, then test this memory and
1221  * build the phys_avail array describing the actually-available memory.
1222  *
1223  * Total memory size may be set by the kernel environment variable
1224  * hw.physmem or the compile-time define MAXMEM.
1225  *
1226  * XXX first should be vm_paddr_t.
1227  */
1228 static void
1229 getmemsize(caddr_t kmdp, u_int64_t first)
1230 {
1231 	int i, physmap_idx, pa_indx, da_indx;
1232 	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
1233 	u_long physmem_start, physmem_tunable, memtest;
1234 	pt_entry_t *pte;
1235 	quad_t dcons_addr, dcons_size;
1236 	int page_counter;
1237 
1238 	/*
1239 	 * Tell the physical memory allocator about pages used to store
1240 	 * the kernel and preloaded data.  See kmem_bootstrap_free().
1241 	 */
1242 	vm_phys_add_seg((vm_paddr_t)kernphys, trunc_page(first));
1243 
1244 	bzero(physmap, sizeof(physmap));
1245 	physmap_idx = 0;
1246 
1247 	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
1248 	physmap_idx -= 2;
1249 
1250 	/*
1251 	 * Find the 'base memory' segment for SMP
1252 	 */
1253 	basemem = 0;
1254 	for (i = 0; i <= physmap_idx; i += 2) {
1255 		if (physmap[i] <= 0xA0000) {
1256 			basemem = physmap[i + 1] / 1024;
1257 			break;
1258 		}
1259 	}
1260 	if (basemem == 0 || basemem > 640) {
1261 		if (bootverbose)
1262 			printf(
1263 		"Memory map doesn't contain a basemem segment, faking it");
1264 		basemem = 640;
1265 	}
1266 
1267 	/*
1268 	 * Maxmem isn't the "maximum memory", it's one larger than the
1269 	 * highest page of the physical address space.  It should be
1270 	 * called something like "Maxphyspage".  We may adjust this
1271 	 * based on ``hw.physmem'' and the results of the memory test.
1272 	 */
1273 	Maxmem = atop(physmap[physmap_idx + 1]);
1274 
1275 #ifdef MAXMEM
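	/* MAXMEM is specified in kilobytes; dividing by 4 yields 4 KB pages. */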
1276 	Maxmem = MAXMEM / 4;
1277 #endif
1278 
1279 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
1280 		Maxmem = atop(physmem_tunable);
1281 
1282 	/*
1283 	 * The boot memory test is disabled by default, as it takes a
1284 	 * significant amount of time on large-memory systems, and is
1285 	 * unfriendly to virtual machines as it unnecessarily touches all
1286 	 * pages.
1287 	 *
1288 	 * A general name is used as the code may be extended to support
1289 	 * additional tests beyond the current "page present" test.
1290 	 */
1291 	memtest = 0;
1292 	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
1293 
1294 	/*
1295 	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
1296 	 * in the system.
1297 	 */
1298 	if (Maxmem > atop(physmap[physmap_idx + 1]))
1299 		Maxmem = atop(physmap[physmap_idx + 1]);
1300 
1301 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
1302 	    (boothowto & RB_VERBOSE))
1303 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
1304 
1305 	/*
1306 	 * Make hole for "AP -> long mode" bootstrap code.  The
1307 	 * mp_bootaddress vector is only available when the kernel
1308 	 * is configured to support APs, and the APs start in real
1309 	 * mode (e.g. SMP bare metal).
1310 	 */
1311 	if (init_ops.mp_bootaddress)
1312 		init_ops.mp_bootaddress(physmap, &physmap_idx);
1313 
1314 	/* call pmap initialization to make new kernel address space */
1315 	pmap_bootstrap(&first);
1316 
1317 	/*
1318 	 * Size up each available chunk of physical memory.
1319 	 *
1320 	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
1321 	 * By default, mask off the first 16 pages unless we appear to be
1322 	 * running in a VM.
1323 	 */
1324 	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
1325 	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
1326 	if (physmap[0] < physmem_start) {
1327 		if (physmem_start < PAGE_SIZE)
1328 			physmap[0] = PAGE_SIZE;
1329 		else if (physmem_start >= physmap[1])
1330 			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
1331 		else
1332 			physmap[0] = round_page(physmem_start);
1333 	}
1334 	pa_indx = 0;
1335 	da_indx = 1;
1336 	phys_avail[pa_indx++] = physmap[0];
1337 	phys_avail[pa_indx] = physmap[0];
1338 	dump_avail[da_indx] = physmap[0];
1339 	pte = CMAP1;
1340 
1341 	/*
1342 	 * Get dcons buffer address
1343 	 */
1344 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
1345 	    getenv_quad("dcons.size", &dcons_size) == 0)
1346 		dcons_addr = 0;
1347 
1348 	/*
1349 	 * physmap is in bytes, so when converting to page boundaries,
1350 	 * round up the start address and round down the end address.
1351 	 */
1352 	page_counter = 0;
1353 	if (memtest != 0)
1354 		printf("Testing system memory");
1355 	for (i = 0; i <= physmap_idx; i += 2) {
1356 		vm_paddr_t end;
1357 
1358 		end = ptoa((vm_paddr_t)Maxmem);
1359 		if (physmap[i + 1] < end)
1360 			end = trunc_page(physmap[i + 1]);
1361 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
1362 			int tmp, page_bad, full;
1363 			int *ptr = (int *)CADDR1;
1364 
1365 			full = FALSE;
1366 			/*
1367 			 * block out kernel memory as not available.
1368 			 */
1369 			if (pa >= (vm_paddr_t)kernphys && pa < first)
1370 				goto do_dump_avail;
1371 
1372 			/*
1373 			 * block out dcons buffer
1374 			 */
1375 			if (dcons_addr > 0
1376 			    && pa >= trunc_page(dcons_addr)
1377 			    && pa < dcons_addr + dcons_size)
1378 				goto do_dump_avail;
1379 
1380 			page_bad = FALSE;
1381 			if (memtest == 0)
1382 				goto skip_memtest;
1383 
1384 			/*
1385 			 * Print a "." every GB to show we're making
1386 			 * progress.
1387 			 */
1388 			page_counter++;
1389 			if ((page_counter % PAGES_PER_GB) == 0)
1390 				printf(".");
1391 
1392 			/*
1393 			 * map page into kernel: valid, read/write, non-cacheable
1394 			 */
1395 			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
1396 			invltlb();
1397 
1398 			tmp = *(int *)ptr;
1399 			/*
1400 			 * Test for alternating 1's and 0's
1401 			 */
1402 			*(volatile int *)ptr = 0xaaaaaaaa;
1403 			if (*(volatile int *)ptr != 0xaaaaaaaa)
1404 				page_bad = TRUE;
1405 			/*
1406 			 * Test for alternating 0's and 1's
1407 			 */
1408 			*(volatile int *)ptr = 0x55555555;
1409 			if (*(volatile int *)ptr != 0x55555555)
1410 				page_bad = TRUE;
1411 			/*
1412 			 * Test for all 1's
1413 			 */
1414 			*(volatile int *)ptr = 0xffffffff;
1415 			if (*(volatile int *)ptr != 0xffffffff)
1416 				page_bad = TRUE;
1417 			/*
1418 			 * Test for all 0's
1419 			 */
1420 			*(volatile int *)ptr = 0x0;
1421 			if (*(volatile int *)ptr != 0x0)
1422 				page_bad = TRUE;
1423 			/*
1424 			 * Restore original value.
1425 			 */
1426 			*(int *)ptr = tmp;
1427 
1428 skip_memtest:
1429 			/*
1430 			 * Adjust array of valid/good pages.
1431 			 */
1432 			if (page_bad == TRUE)
1433 				continue;
1434 			/*
1435 			 * If this good page is a continuation of the
1436 			 * previous set of good pages, then just increase
1437 			 * the end pointer. Otherwise start a new chunk.
1438 			 * Note that "end" points one page beyond the last
1439 			 * valid page, making the range >= start and < end.
1440 			 * If we're also doing a speculative memory
1441 			 * test and we're at or past the end, bump up Maxmem
1442 			 * so that we keep going. The first bad page
1443 			 * will terminate the loop.
1444 			 */
1445 			if (phys_avail[pa_indx] == pa) {
1446 				phys_avail[pa_indx] += PAGE_SIZE;
1447 			} else {
1448 				pa_indx++;
1449 				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
1450 					printf(
1451 		"Too many holes in the physical address space, giving up\n");
1452 					pa_indx--;
1453 					full = TRUE;
1454 					goto do_dump_avail;
1455 				}
1456 				phys_avail[pa_indx++] = pa;	/* start */
1457 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
1458 			}
1459 			physmem++;
1460 do_dump_avail:
1461 			if (dump_avail[da_indx] == pa) {
1462 				dump_avail[da_indx] += PAGE_SIZE;
1463 			} else {
1464 				da_indx++;
1465 				if (da_indx == DUMP_AVAIL_ARRAY_END) {
1466 					da_indx--;
1467 					goto do_next;
1468 				}
1469 				dump_avail[da_indx++] = pa; /* start */
1470 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
1471 			}
1472 do_next:
1473 			if (full)
1474 				break;
1475 		}
1476 	}
1477 	*pte = 0;
1478 	invltlb();
1479 	if (memtest != 0)
1480 		printf("\n");
1481 
1482 	/*
1483 	 * XXX
1484 	 * The last chunk must contain at least one page plus the message
1485 	 * buffer to avoid complicating other code (message buffer address
1486 	 * calculation, etc.).
1487 	 */
1488 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
1489 	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
1490 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1491 		phys_avail[pa_indx--] = 0;
1492 		phys_avail[pa_indx--] = 0;
1493 	}
1494 
1495 	Maxmem = atop(phys_avail[pa_indx]);
1496 
1497 	/* Trim off space for the message buffer. */
1498 	phys_avail[pa_indx] -= round_page(msgbufsize);
1499 
1500 	/* Map the message buffer. */
1501 	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
1502 }
1503 
1504 static caddr_t
1505 native_parse_preload_data(u_int64_t modulep)
1506 {
1507 	caddr_t kmdp;
1508 	char *envp;
1509 #ifdef DDB
1510 	vm_offset_t ksym_start;
1511 	vm_offset_t ksym_end;
1512 #endif
1513 
1514 	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
1515 	preload_bootstrap_relocate(KERNBASE);
1516 	kmdp = preload_search_by_type("elf kernel");
1517 	if (kmdp == NULL)
1518 		kmdp = preload_search_by_type("elf64 kernel");
1519 	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
1520 	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
1521 	if (envp != NULL)
1522 		envp += KERNBASE;
1523 	init_static_kenv(envp, 0);
1524 #ifdef DDB
1525 	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
1526 	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
1527 	db_fetch_ksymtab(ksym_start, ksym_end);
1528 #endif
1529 	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
1530 
1531 	return (kmdp);
1532 }
1533 
1534 static void
1535 amd64_kdb_init(void)
1536 {
1537 	kdb_init();
1538 #ifdef KDB
1539 	if (boothowto & RB_KDB)
1540 		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
1541 #endif
1542 }
1543 
1544 /* Set up the fast syscall stuff */
1545 void
1546 amd64_conf_fast_syscall(void)
1547 {
1548 	uint64_t msr;
1549 
1550 	msr = rdmsr(MSR_EFER) | EFER_SCE;
1551 	wrmsr(MSR_EFER, msr);
1552 	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1553 	    (u_int64_t)IDTVEC(fast_syscall));
1554 	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
1555 	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1556 	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1557 	wrmsr(MSR_STAR, msr);
1558 	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
1559 }
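/*
 * MSR_STAR layout used above (architectural): bits 47:32 supply the
 * SYSCALL selectors (kernel %cs, with %ss loaded as that value + 8),
 * and bits 63:48 supply the SYSRET base (32-bit user %cs; %ss = base
 * + 8; 64-bit %cs = base + 16), which is why GUCODE32_SEL with user
 * RPL is installed there.  MSR_SF_MASK lists the %rflags bits cleared
 * on every SYSCALL entry.
 */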
1560 
1561 u_int64_t
1562 hammer_time(u_int64_t modulep, u_int64_t physfree)
1563 {
1564 	caddr_t kmdp;
1565 	int gsel_tss, x;
1566 	struct pcpu *pc;
1567 	struct nmi_pcpu *np;
1568 	struct xstate_hdr *xhdr;
1569 	u_int64_t rsp0;
1570 	char *env;
1571 	size_t kstack0_sz;
1572 	int late_console;
1573 
1574 	TSRAW(&thread0, TS_ENTER, __func__, NULL);
1575 
1576 	kmdp = init_ops.parse_preload_data(modulep);
1577 
1578 	physfree += ucode_load_bsp(physfree + KERNBASE);
1579 	physfree = roundup2(physfree, PAGE_SIZE);
1580 
1581 	identify_cpu1();
1582 	identify_hypervisor();
1583 	identify_cpu_fixup_bsp();
1584 	identify_cpu2();
1585 	initializecpucache();
1586 
1587 	/*
1588 	 * Check for pti, pcid, and invpcid before ifuncs are
1589 	 * resolved, to correctly select the implementation for
1590 	 * pmap_activate_sw_mode().
1591 	 */
1592 	pti = pti_get_default();
1593 	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1594 	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
1595 	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
1596 		invpcid_works = (cpu_stdext_feature &
1597 		    CPUID_STDEXT_INVPCID) != 0;
1598 	} else {
1599 		pmap_pcid_enabled = 0;
1600 	}
1601 
1602 	link_elf_ireloc(kmdp);
1603 
1604 	/*
1605 	 * This may be done better later if it gets more high level
1606 	 * components in it. If so just link td->td_proc here.
1607 	 */
1608 	proc_linkup0(&proc0, &thread0);
1609 
1610 	/* Init basic tunables, hz etc */
1611 	init_param1();
1612 
1613 	thread0.td_kstack = physfree + KERNBASE;
1614 	thread0.td_kstack_pages = kstack_pages;
1615 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1616 	bzero((void *)thread0.td_kstack, kstack0_sz);
1617 	physfree += kstack0_sz;
1618 
1619 	/*
1620 	 * make gdt memory segments
1621 	 */
1622 	for (x = 0; x < NGDT; x++) {
1623 		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1624 		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
1625 			ssdtosd(&gdt_segs[x], &gdt[x]);
1626 	}
1627 	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
1628 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
1629 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1630 
1631 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1632 	r_gdt.rd_base =  (long) gdt;
1633 	lgdt(&r_gdt);
1634 	pc = &__pcpu[0];
1635 
1636 	wrmsr(MSR_FSBASE, 0);		/* User value */
1637 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
1638 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
1639 
1640 	pcpu_init(pc, 0, sizeof(struct pcpu));
1641 	dpcpu_init((void *)(physfree + KERNBASE), 0);
1642 	physfree += DPCPU_SIZE;
1643 	PCPU_SET(prvspace, pc);
1644 	PCPU_SET(curthread, &thread0);
1645 	/* Non-late cninit() and printf() can be moved up to here. */
1646 	PCPU_SET(tssp, &common_tss[0]);
1647 	PCPU_SET(commontssp, &common_tss[0]);
1648 	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1649 	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
1650 	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
1651 	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
1652 
1653 	/*
1654 	 * Initialize mutexes.
1655 	 *
1656 	 * icu_lock: in order to allow an interrupt to occur in a critical
1657 	 * 	     section, to set pcpu->ipending (etc...) properly, we
1658 	 *	     must be able to get the icu lock, so it can't be
1659 	 *	     under witness.
1660 	 */
1661 	mutex_init();
1662 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1663 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1664 
1665 	/* exceptions */
1666 	for (x = 0; x < NIDT; x++)
1667 		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1668 		    SEL_KPL, 0);
1669 	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1670 	    SEL_KPL, 0);
1671 	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1672 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
1673 	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1674 	    SEL_UPL, 0);
1675 	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1676 	    SEL_UPL, 0);
1677 	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1678 	    SEL_KPL, 0);
1679 	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1680 	    SEL_KPL, 0);
1681 	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1682 	    SEL_KPL, 0);
1683 	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1684 	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1685 	    SDT_SYSIGT, SEL_KPL, 0);
1686 	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1687 	    SEL_KPL, 0);
1688 	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1689 	    SDT_SYSIGT, SEL_KPL, 0);
1690 	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1691 	    SEL_KPL, 0);
1692 	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1693 	    SEL_KPL, 0);
1694 	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1695 	    SEL_KPL, 0);
1696 	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1697 	    SEL_KPL, 0);
1698 	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1699 	    SEL_KPL, 0);
1700 	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1701 	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1702 	    SEL_KPL, 0);
1703 #ifdef KDTRACE_HOOKS
1704 	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1705 	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1706 #endif
1707 #ifdef XENHVM
1708 	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1709 	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1710 #endif
1711 	r_idt.rd_limit = sizeof(idt0) - 1;
1712 	r_idt.rd_base = (long) idt;
1713 	lidt(&r_idt);
1714 
1715 	/*
1716 	 * Initialize the clock before the console so that console
1717 	 * initialization can use DELAY().
1718 	 */
1719 	clock_init();
1720 
1721 	/*
1722 	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
1723 	 * transition).
1724 	 * Once bootblocks have updated, we can test directly for
1725 	 * efi_systbl != NULL here...
1726 	 */
1727 	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
1728 	    != NULL)
1729 		vty_set_preferred(VTY_VT);
1730 
1731 	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1732 	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1733 	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
1734 	    &syscall_ret_l1d_flush_mode);
1735 
1736 	finishidentcpu();	/* Final stage of CPU initialization */
1737 	initializecpu();	/* Initialize CPU registers */
1738 
1739 	/* doublefault stack space, runs on ist1 */
1740 	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];
1741 
1742 	/*
1743 	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
1744 	 * above the start of the ist2 stack.
1745 	 */
1746 	np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
1747 	np->np_pcpu = (register_t) pc;
1748 	common_tss[0].tss_ist2 = (long) np;
1749 
1750 	/*
1751 	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
1752 	 * above the start of the ist3 stack.
1753 	 */
1754 	np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
1755 	np->np_pcpu = (register_t) pc;
1756 	common_tss[0].tss_ist3 = (long) np;
1757 
1758 	/*
1759 	 * DB# stack, runs on ist4.
1760 	 */
1761 	np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
1762 	np->np_pcpu = (register_t) pc;
1763 	common_tss[0].tss_ist4 = (long) np;
1764 
1765 	/* Set the IO permission bitmap (empty due to tss seg limit) */
1766 	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;
1767 
1768 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1769 	ltr(gsel_tss);
1770 
1771 	amd64_conf_fast_syscall();
1772 
1773 	/*
1774 	 * Temporarily forge a valid pointer to the PCB for the exception
1775 	 * handlers.  It is reinitialized properly below after FPU is
1776 	 * set up.  Also set up td_critnest to short-cut the page
1777 	 * fault handler.
1778 	 */
1779 	cpu_max_ext_state_size = sizeof(struct savefpu);
1780 	thread0.td_pcb = get_pcb_td(&thread0);
1781 	thread0.td_critnest = 1;
1782 
1783 	/*
1784 	 * The console and kdb should be initialized even earlier than here,
1785 	 * but some console drivers don't work until after getmemsize().
1786 	 * Default to late console initialization to support these drivers.
1787 	 * This loses mainly printf()s in getmemsize() and early debugging.
1788 	 */
1789 	late_console = 1;
1790 	TUNABLE_INT_FETCH("debug.late_console", &late_console);
1791 	if (!late_console) {
1792 		cninit();
1793 		amd64_kdb_init();
1794 	}
1795 
1796 	getmemsize(kmdp, physfree);
1797 	init_param2(physmem);
1798 
1799 	/* Now running on new page tables, configured, and u/iom is accessible. */
1800 
1801 #ifdef DEV_PCI
1802 	/* This call might adjust phys_avail[]. */
1803 	pci_early_quirks();
1804 #endif
1805 
1806 	if (late_console)
1807 		cninit();
1808 
1809 #ifdef DEV_ISA
1810 #ifdef DEV_ATPIC
1811 	elcr_probe();
1812 	atpic_startup();
1813 #else
1814 	/* Reset and mask the atpics and leave them shut down. */
1815 	atpic_reset();
1816 
1817 	/*
1818 	 * Point the ICU spurious interrupt vectors at the APIC spurious
1819 	 * interrupt handler.
1820 	 */
1821 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1822 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1823 #endif
1824 #else
1825 #error "have you forgotten the isa device?"
1826 #endif
1827 
1828 	if (late_console)
1829 		amd64_kdb_init();
1830 
1831 	msgbufinit(msgbufp, msgbufsize);
1832 	fpuinit();
1833 
1834 	/*
1835 	 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
1836 	 * area size.  Zero out the extended state header in fpu save
1837 	 * area.
1838 	 */
1839 	thread0.td_pcb = get_pcb_td(&thread0);
1840 	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
1841 	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
1842 	if (use_xsave) {
1843 		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
1844 		    1);
1845 		xhdr->xstate_bv = xsave_mask;
1846 	}
1847 	/* Make an initial tss so the cpu can get an interrupt stack on syscall. */
1848 	rsp0 = (vm_offset_t)thread0.td_pcb;
1849 	/* Ensure the stack is aligned to 16 bytes */
1850 	rsp0 &= ~0xFul;
1851 	common_tss[0].tss_rsp0 = rsp0;
1852 	PCPU_SET(rsp0, rsp0);
1853 	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1854 	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1855 	PCPU_SET(curpcb, thread0.td_pcb);
1856 
1857 	/* transfer to user mode */
1858 
1859 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1860 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1861 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1862 	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1863 	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1864 
1865 	load_ds(_udatasel);
1866 	load_es(_udatasel);
1867 	load_fs(_ufssel);
1868 
1869 	/* setup proc 0's pcb */
1870 	thread0.td_pcb->pcb_flags = 0;
1871 	thread0.td_frame = &proc0_tf;
1872 
1873 	env = kern_getenv("kernelname");
1874 	if (env != NULL)
1875 		strlcpy(kernelname, env, sizeof(kernelname));
1876 
1877 	cpu_probe_amdc1e();
1878 
1879 #ifdef FDT
1880 	x86_init_fdt();
1881 #endif
1882 	thread0.td_critnest = 0;
1883 
1884 	TSEXIT();
1885 
1886 	/* Location of kernel stack for locore */
1887 	return ((u_int64_t)thread0.td_pcb);
1888 }
1889 
1890 void
1891 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1892 {
1893 
1894 	pcpu->pc_acpi_id = 0xffffffff;
1895 }
1896 
1897 static int
1898 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1899 {
1900 	struct bios_smap *smapbase;
1901 	struct bios_smap_xattr smap;
1902 	caddr_t kmdp;
1903 	uint32_t *smapattr;
1904 	int count, error, i;
1905 
1906 	/* Retrieve the system memory map from the loader. */
1907 	kmdp = preload_search_by_type("elf kernel");
1908 	if (kmdp == NULL)
1909 		kmdp = preload_search_by_type("elf64 kernel");
1910 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
1911 	    MODINFO_METADATA | MODINFOMD_SMAP);
1912 	if (smapbase == NULL)
1913 		return (0);
1914 	smapattr = (uint32_t *)preload_search_info(kmdp,
1915 	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1916 	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1917 	error = 0;
1918 	for (i = 0; i < count; i++) {
1919 		smap.base = smapbase[i].base;
1920 		smap.length = smapbase[i].length;
1921 		smap.type = smapbase[i].type;
1922 		if (smapattr != NULL)
1923 			smap.xattr = smapattr[i];
1924 		else
1925 			smap.xattr = 0;
1926 		error = SYSCTL_OUT(req, &smap, sizeof(smap));
1927 	}
1928 	return (error);
1929 }
1930 SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1931     smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
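
/*
 * A minimal userland sketch (illustrative only, not part of this file)
 * of consuming this node through sysctlbyname(3); the buffer handling
 * below is an assumption of typical use:
 *
 *	struct bios_smap_xattr *buf;
 *	size_t i, len;
 *
 *	if (sysctlbyname("machdep.smap", NULL, &len, NULL, 0) == 0 &&
 *	    (buf = malloc(len)) != NULL &&
 *	    sysctlbyname("machdep.smap", buf, &len, NULL, 0) == 0)
 *		for (i = 0; i < len / sizeof(*buf); i++)
 *			... buf[i].base, buf[i].length, buf[i].type ...
 */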
1932 
1933 static int
1934 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1935 {
1936 	struct efi_map_header *efihdr;
1937 	caddr_t kmdp;
1938 	uint32_t efisize;
1939 
1940 	kmdp = preload_search_by_type("elf kernel");
1941 	if (kmdp == NULL)
1942 		kmdp = preload_search_by_type("elf64 kernel");
1943 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1944 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1945 	if (efihdr == NULL)
1946 		return (0);
1947 	efisize = *((uint32_t *)efihdr - 1);
1948 	return (SYSCTL_OUT(req, efihdr, efisize));
1949 }
1950 SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1951     efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
1952 
1953 void
1954 spinlock_enter(void)
1955 {
1956 	struct thread *td;
1957 	register_t flags;
1958 
1959 	td = curthread;
1960 	if (td->td_md.md_spinlock_count == 0) {
1961 		flags = intr_disable();
1962 		td->td_md.md_spinlock_count = 1;
1963 		td->td_md.md_saved_flags = flags;
1964 		critical_enter();
1965 	} else
1966 		td->td_md.md_spinlock_count++;
1967 }
1968 
1969 void
1970 spinlock_exit(void)
1971 {
1972 	struct thread *td;
1973 	register_t flags;
1974 
1975 	td = curthread;
1976 	flags = td->td_md.md_saved_flags;
1977 	td->td_md.md_spinlock_count--;
1978 	if (td->td_md.md_spinlock_count == 0) {
1979 		critical_exit();
1980 		intr_restore(flags);
1981 	}
1982 }
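
/*
 * The saved-flags/count pair makes these routines recursion-safe:
 * interrupts are disabled by the outermost spinlock_enter() and
 * restored only by the matching outermost spinlock_exit().  For
 * example:
 *
 *	spinlock_enter();	count 0 -> 1, disables interrupts, saves rflags
 *	spinlock_enter();	count 1 -> 2, no hardware state change
 *	spinlock_exit();	count 2 -> 1, interrupts remain disabled
 *	spinlock_exit();	count 1 -> 0, restores the saved rflags
 */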
1983 
1984 /*
1985  * Construct a PCB from a trapframe. This is called from kdb_trap() where
1986  * we want to start a backtrace from the function that caused us to enter
1987  * the debugger. We have the context in the trapframe, but base the trace
1988  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1989  * enough for a backtrace.
1990  */
1991 void
1992 makectx(struct trapframe *tf, struct pcb *pcb)
1993 {
1994 
1995 	pcb->pcb_r12 = tf->tf_r12;
1996 	pcb->pcb_r13 = tf->tf_r13;
1997 	pcb->pcb_r14 = tf->tf_r14;
1998 	pcb->pcb_r15 = tf->tf_r15;
1999 	pcb->pcb_rbp = tf->tf_rbp;
2000 	pcb->pcb_rbx = tf->tf_rbx;
2001 	pcb->pcb_rip = tf->tf_rip;
2002 	pcb->pcb_rsp = tf->tf_rsp;
2003 }
2004 
2005 int
2006 ptrace_set_pc(struct thread *td, unsigned long addr)
2007 {
2008 
2009 	td->td_frame->tf_rip = addr;
2010 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
2011 	return (0);
2012 }
2013 
2014 int
2015 ptrace_single_step(struct thread *td)
2016 {
2017 
2018 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2019 	if ((td->td_frame->tf_rflags & PSL_T) == 0) {
2020 		td->td_frame->tf_rflags |= PSL_T;
2021 		td->td_dbgflags |= TDB_STEP;
2022 	}
2023 	return (0);
2024 }
2025 
2026 int
2027 ptrace_clear_single_step(struct thread *td)
2028 {
2029 
2030 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2031 	td->td_frame->tf_rflags &= ~PSL_T;
2032 	td->td_dbgflags &= ~TDB_STEP;
2033 	return (0);
2034 }
2035 
2036 int
2037 fill_regs(struct thread *td, struct reg *regs)
2038 {
2039 	struct trapframe *tp;
2040 
2041 	tp = td->td_frame;
2042 	return (fill_frame_regs(tp, regs));
2043 }
2044 
2045 int
2046 fill_frame_regs(struct trapframe *tp, struct reg *regs)
2047 {
2048 
2049 	regs->r_r15 = tp->tf_r15;
2050 	regs->r_r14 = tp->tf_r14;
2051 	regs->r_r13 = tp->tf_r13;
2052 	regs->r_r12 = tp->tf_r12;
2053 	regs->r_r11 = tp->tf_r11;
2054 	regs->r_r10 = tp->tf_r10;
2055 	regs->r_r9  = tp->tf_r9;
2056 	regs->r_r8  = tp->tf_r8;
2057 	regs->r_rdi = tp->tf_rdi;
2058 	regs->r_rsi = tp->tf_rsi;
2059 	regs->r_rbp = tp->tf_rbp;
2060 	regs->r_rbx = tp->tf_rbx;
2061 	regs->r_rdx = tp->tf_rdx;
2062 	regs->r_rcx = tp->tf_rcx;
2063 	regs->r_rax = tp->tf_rax;
2064 	regs->r_rip = tp->tf_rip;
2065 	regs->r_cs = tp->tf_cs;
2066 	regs->r_rflags = tp->tf_rflags;
2067 	regs->r_rsp = tp->tf_rsp;
2068 	regs->r_ss = tp->tf_ss;
2069 	if (tp->tf_flags & TF_HASSEGS) {
2070 		regs->r_ds = tp->tf_ds;
2071 		regs->r_es = tp->tf_es;
2072 		regs->r_fs = tp->tf_fs;
2073 		regs->r_gs = tp->tf_gs;
2074 	} else {
2075 		regs->r_ds = 0;
2076 		regs->r_es = 0;
2077 		regs->r_fs = 0;
2078 		regs->r_gs = 0;
2079 	}
2080 	regs->r_err = 0;
2081 	regs->r_trapno = 0;
2082 	return (0);
2083 }
2084 
2085 int
2086 set_regs(struct thread *td, struct reg *regs)
2087 {
2088 	struct trapframe *tp;
2089 	register_t rflags;
2090 
2091 	tp = td->td_frame;
2092 	rflags = regs->r_rflags & 0xffffffff;
2093 	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
2094 		return (EINVAL);
2095 	tp->tf_r15 = regs->r_r15;
2096 	tp->tf_r14 = regs->r_r14;
2097 	tp->tf_r13 = regs->r_r13;
2098 	tp->tf_r12 = regs->r_r12;
2099 	tp->tf_r11 = regs->r_r11;
2100 	tp->tf_r10 = regs->r_r10;
2101 	tp->tf_r9  = regs->r_r9;
2102 	tp->tf_r8  = regs->r_r8;
2103 	tp->tf_rdi = regs->r_rdi;
2104 	tp->tf_rsi = regs->r_rsi;
2105 	tp->tf_rbp = regs->r_rbp;
2106 	tp->tf_rbx = regs->r_rbx;
2107 	tp->tf_rdx = regs->r_rdx;
2108 	tp->tf_rcx = regs->r_rcx;
2109 	tp->tf_rax = regs->r_rax;
2110 	tp->tf_rip = regs->r_rip;
2111 	tp->tf_cs = regs->r_cs;
2112 	tp->tf_rflags = rflags;
2113 	tp->tf_rsp = regs->r_rsp;
2114 	tp->tf_ss = regs->r_ss;
2115 	if (0) {	/* XXXKIB */
2116 		tp->tf_ds = regs->r_ds;
2117 		tp->tf_es = regs->r_es;
2118 		tp->tf_fs = regs->r_fs;
2119 		tp->tf_gs = regs->r_gs;
2120 		tp->tf_flags = TF_HASSEGS;
2121 	}
2122 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
2123 	return (0);
2124 }
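
/*
 * The EFL_SECURE()/CS_SECURE() checks above mean, for example, that a
 * debugger using PT_SETREGS cannot grant the traced process PSL_IOPL
 * or a kernel code selector (a sketch):
 *
 *	regs.r_rflags |= PSL_IOPL;		outside PSL_USERCHANGE
 *	regs.r_cs = GSEL(GCODE_SEL, SEL_KPL);	RPL is not SEL_UPL
 *
 * Either write fails the checks and returns EINVAL without touching
 * the frame.
 */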
2125 
2126 /* XXX check all this stuff! */
2127 /* externalize from sv_xmm */
2128 static void
2129 fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
2130 {
2131 	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2132 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
2133 	int i;
2134 
2135 	/* pcb -> fpregs */
2136 	bzero(fpregs, sizeof(*fpregs));
2137 
2138 	/* FPU control/status */
2139 	penv_fpreg->en_cw = penv_xmm->en_cw;
2140 	penv_fpreg->en_sw = penv_xmm->en_sw;
2141 	penv_fpreg->en_tw = penv_xmm->en_tw;
2142 	penv_fpreg->en_opcode = penv_xmm->en_opcode;
2143 	penv_fpreg->en_rip = penv_xmm->en_rip;
2144 	penv_fpreg->en_rdp = penv_xmm->en_rdp;
2145 	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
2146 	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
2147 
2148 	/* FPU registers */
2149 	for (i = 0; i < 8; ++i)
2150 		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
2151 
2152 	/* SSE registers */
2153 	for (i = 0; i < 16; ++i)
2154 		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
2155 }
2156 
2157 /* internalize from fpregs into sv_xmm */
2158 static void
2159 set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
2160 {
2161 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
2162 	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2163 	int i;
2164 
2165 	/* fpregs -> pcb */
2166 	/* FPU control/status */
2167 	penv_xmm->en_cw = penv_fpreg->en_cw;
2168 	penv_xmm->en_sw = penv_fpreg->en_sw;
2169 	penv_xmm->en_tw = penv_fpreg->en_tw;
2170 	penv_xmm->en_opcode = penv_fpreg->en_opcode;
2171 	penv_xmm->en_rip = penv_fpreg->en_rip;
2172 	penv_xmm->en_rdp = penv_fpreg->en_rdp;
2173 	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
2174 	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
2175 
2176 	/* FPU registers */
2177 	for (i = 0; i < 8; ++i)
2178 		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
2179 
2180 	/* SSE registers */
2181 	for (i = 0; i < 16; ++i)
2182 		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
2183 }
2184 
2185 /* externalize from td->pcb */
2186 int
2187 fill_fpregs(struct thread *td, struct fpreg *fpregs)
2188 {
2189 
2190 	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
2191 	    P_SHOULDSTOP(td->td_proc),
2192 	    ("not suspended thread %p", td));
2193 	fpugetregs(td);
2194 	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
2195 	return (0);
2196 }
2197 
2198 /* internalize to td->pcb */
2199 int
2200 set_fpregs(struct thread *td, struct fpreg *fpregs)
2201 {
2202 
2203 	critical_enter();
2204 	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
2205 	fpuuserinited(td);
2206 	critical_exit();
2207 	return (0);
2208 }
2209 
2210 /*
2211  * Get machine context.
2212  */
2213 int
2214 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
2215 {
2216 	struct pcb *pcb;
2217 	struct trapframe *tp;
2218 
2219 	pcb = td->td_pcb;
2220 	tp = td->td_frame;
2221 	PROC_LOCK(curthread->td_proc);
2222 	mcp->mc_onstack = sigonstack(tp->tf_rsp);
2223 	PROC_UNLOCK(curthread->td_proc);
2224 	mcp->mc_r15 = tp->tf_r15;
2225 	mcp->mc_r14 = tp->tf_r14;
2226 	mcp->mc_r13 = tp->tf_r13;
2227 	mcp->mc_r12 = tp->tf_r12;
2228 	mcp->mc_r11 = tp->tf_r11;
2229 	mcp->mc_r10 = tp->tf_r10;
2230 	mcp->mc_r9  = tp->tf_r9;
2231 	mcp->mc_r8  = tp->tf_r8;
2232 	mcp->mc_rdi = tp->tf_rdi;
2233 	mcp->mc_rsi = tp->tf_rsi;
2234 	mcp->mc_rbp = tp->tf_rbp;
2235 	mcp->mc_rbx = tp->tf_rbx;
2236 	mcp->mc_rcx = tp->tf_rcx;
2237 	mcp->mc_rflags = tp->tf_rflags;
2238 	if (flags & GET_MC_CLEAR_RET) {
2239 		mcp->mc_rax = 0;
2240 		mcp->mc_rdx = 0;
2241 		mcp->mc_rflags &= ~PSL_C;
2242 	} else {
2243 		mcp->mc_rax = tp->tf_rax;
2244 		mcp->mc_rdx = tp->tf_rdx;
2245 	}
2246 	mcp->mc_rip = tp->tf_rip;
2247 	mcp->mc_cs = tp->tf_cs;
2248 	mcp->mc_rsp = tp->tf_rsp;
2249 	mcp->mc_ss = tp->tf_ss;
2250 	mcp->mc_ds = tp->tf_ds;
2251 	mcp->mc_es = tp->tf_es;
2252 	mcp->mc_fs = tp->tf_fs;
2253 	mcp->mc_gs = tp->tf_gs;
2254 	mcp->mc_flags = tp->tf_flags;
2255 	mcp->mc_len = sizeof(*mcp);
2256 	get_fpcontext(td, mcp, NULL, 0);
2257 	update_pcb_bases(pcb);
2258 	mcp->mc_fsbase = pcb->pcb_fsbase;
2259 	mcp->mc_gsbase = pcb->pcb_gsbase;
2260 	mcp->mc_xfpustate = 0;
2261 	mcp->mc_xfpustate_len = 0;
2262 	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
2263 	return (0);
2264 }
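
/*
 * GET_MC_CLEAR_RET is passed by sys_getcontext() so that a context
 * saved from inside a system call resumes as if that call had just
 * succeeded with a zero return: the snapshot carries rax = rdx = 0 and
 * PSL_C clear.  A userland sketch of the effect:
 *
 *	ucontext_t uc;
 *
 *	getcontext(&uc);	saved with rax/rdx/PSL_C cleared
 *	...
 *	setcontext(&uc);	resumes as if getcontext() returned 0
 */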
2265 
2266 /*
2267  * Set machine context.
2268  *
2269  * However, we don't set any but the user modifiable flags, and we won't
2270  * touch the cs selector.
2271  */
2272 int
2273 set_mcontext(struct thread *td, mcontext_t *mcp)
2274 {
2275 	struct pcb *pcb;
2276 	struct trapframe *tp;
2277 	char *xfpustate;
2278 	long rflags;
2279 	int ret;
2280 
2281 	pcb = td->td_pcb;
2282 	tp = td->td_frame;
2283 	if (mcp->mc_len != sizeof(*mcp) ||
2284 	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
2285 		return (EINVAL);
2286 	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
2287 	    (tp->tf_rflags & ~PSL_USERCHANGE);
2288 	if (mcp->mc_flags & _MC_HASFPXSTATE) {
2289 		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
2290 		    sizeof(struct savefpu))
2291 			return (EINVAL);
2292 		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
2293 		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
2294 		    mcp->mc_xfpustate_len);
2295 		if (ret != 0)
2296 			return (ret);
2297 	} else
2298 		xfpustate = NULL;
2299 	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
2300 	if (ret != 0)
2301 		return (ret);
2302 	tp->tf_r15 = mcp->mc_r15;
2303 	tp->tf_r14 = mcp->mc_r14;
2304 	tp->tf_r13 = mcp->mc_r13;
2305 	tp->tf_r12 = mcp->mc_r12;
2306 	tp->tf_r11 = mcp->mc_r11;
2307 	tp->tf_r10 = mcp->mc_r10;
2308 	tp->tf_r9  = mcp->mc_r9;
2309 	tp->tf_r8  = mcp->mc_r8;
2310 	tp->tf_rdi = mcp->mc_rdi;
2311 	tp->tf_rsi = mcp->mc_rsi;
2312 	tp->tf_rbp = mcp->mc_rbp;
2313 	tp->tf_rbx = mcp->mc_rbx;
2314 	tp->tf_rdx = mcp->mc_rdx;
2315 	tp->tf_rcx = mcp->mc_rcx;
2316 	tp->tf_rax = mcp->mc_rax;
2317 	tp->tf_rip = mcp->mc_rip;
2318 	tp->tf_rflags = rflags;
2319 	tp->tf_rsp = mcp->mc_rsp;
2320 	tp->tf_ss = mcp->mc_ss;
2321 	tp->tf_flags = mcp->mc_flags;
2322 	if (tp->tf_flags & TF_HASSEGS) {
2323 		tp->tf_ds = mcp->mc_ds;
2324 		tp->tf_es = mcp->mc_es;
2325 		tp->tf_fs = mcp->mc_fs;
2326 		tp->tf_gs = mcp->mc_gs;
2327 	}
2328 	set_pcb_flags(pcb, PCB_FULL_IRET);
2329 	if (mcp->mc_flags & _MC_HASBASES) {
2330 		pcb->pcb_fsbase = mcp->mc_fsbase;
2331 		pcb->pcb_gsbase = mcp->mc_gsbase;
2332 	}
2333 	return (0);
2334 }
2335 
2336 static void
2337 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
2338     size_t xfpusave_len)
2339 {
2340 	size_t max_len, len;
2341 
2342 	mcp->mc_ownedfp = fpugetregs(td);
2343 	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
2344 	    sizeof(mcp->mc_fpstate));
2345 	mcp->mc_fpformat = fpuformat();
2346 	if (!use_xsave || xfpusave_len == 0)
2347 		return;
2348 	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
2349 	len = xfpusave_len;
2350 	if (len > max_len) {
2351 		bzero(xfpusave + max_len, len - max_len);
2352 		len = max_len;
2353 	}
2354 	mcp->mc_flags |= _MC_HASFPXSTATE;
2355 	mcp->mc_xfpustate_len = len;
2356 	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
2357 }
2358 
2359 static int
2360 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
2361     size_t xfpustate_len)
2362 {
2363 	int error;
2364 
2365 	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
2366 		return (0);
2367 	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
2368 		return (EINVAL);
2369 	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
2370 		/* We don't care what state is left in the FPU or PCB. */
2371 		fpstate_drop(td);
2372 		error = 0;
2373 	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
2374 	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
2375 		error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
2376 		    xfpustate, xfpustate_len);
2377 	} else
2378 		return (EINVAL);
2379 	return (error);
2380 }
2381 
2382 void
2383 fpstate_drop(struct thread *td)
2384 {
2385 
2386 	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
2387 	critical_enter();
2388 	if (PCPU_GET(fpcurthread) == td)
2389 		fpudrop();
2390 	/*
2391 	 * XXX force a full drop of the fpu.  The above only drops it if we
2392 	 * owned it.
2393 	 *
2394 	 * XXX I don't much like fpugetregs()'s semantics of doing a full
2395 	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
2396 	 * We only need to drop to !PCB_INITDONE in sendsig().  But
2397 	 * sendsig() is the only caller of fpugetregs()... perhaps we just
2398 	 * have too many layers.
2399 	 */
2400 	clear_pcb_flags(curthread->td_pcb,
2401 	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
2402 	critical_exit();
2403 }
2404 
2405 int
2406 fill_dbregs(struct thread *td, struct dbreg *dbregs)
2407 {
2408 	struct pcb *pcb;
2409 
2410 	if (td == NULL) {
2411 		dbregs->dr[0] = rdr0();
2412 		dbregs->dr[1] = rdr1();
2413 		dbregs->dr[2] = rdr2();
2414 		dbregs->dr[3] = rdr3();
2415 		dbregs->dr[6] = rdr6();
2416 		dbregs->dr[7] = rdr7();
2417 	} else {
2418 		pcb = td->td_pcb;
2419 		dbregs->dr[0] = pcb->pcb_dr0;
2420 		dbregs->dr[1] = pcb->pcb_dr1;
2421 		dbregs->dr[2] = pcb->pcb_dr2;
2422 		dbregs->dr[3] = pcb->pcb_dr3;
2423 		dbregs->dr[6] = pcb->pcb_dr6;
2424 		dbregs->dr[7] = pcb->pcb_dr7;
2425 	}
2426 	dbregs->dr[4] = 0;
2427 	dbregs->dr[5] = 0;
2428 	dbregs->dr[8] = 0;
2429 	dbregs->dr[9] = 0;
2430 	dbregs->dr[10] = 0;
2431 	dbregs->dr[11] = 0;
2432 	dbregs->dr[12] = 0;
2433 	dbregs->dr[13] = 0;
2434 	dbregs->dr[14] = 0;
2435 	dbregs->dr[15] = 0;
2436 	return (0);
2437 }
2438 
2439 int
2440 set_dbregs(struct thread *td, struct dbreg *dbregs)
2441 {
2442 	struct pcb *pcb;
2443 	int i;
2444 
2445 	if (td == NULL) {
2446 		load_dr0(dbregs->dr[0]);
2447 		load_dr1(dbregs->dr[1]);
2448 		load_dr2(dbregs->dr[2]);
2449 		load_dr3(dbregs->dr[3]);
2450 		load_dr6(dbregs->dr[6]);
2451 		load_dr7(dbregs->dr[7]);
2452 	} else {
2453 		/*
2454 		 * Don't let an illegal value for dr7 get set.  Specifically,
2455 		 * check for undefined settings.  Setting these bit patterns
2456 		 * results in undefined behaviour and can lead to an
2457 		 * unexpected TRCTRAP or a general protection fault right
2458 		 * here.  The upper bits of dr6 and dr7 must not be set.
2459 		 */
2460 		for (i = 0; i < 4; i++) {
2461 			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
2462 				return (EINVAL);
2463 			if (td->td_frame->tf_cs == _ucode32sel &&
2464 			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
2465 				return (EINVAL);
2466 		}
2467 		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
2468 		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
2469 			return (EINVAL);
2470 
2471 		pcb = td->td_pcb;
2472 
2473 		/*
2474 		 * Don't let a process set a breakpoint that is not within the
2475 		 * process's address space.  If a process could do this, it
2476 		 * could halt the system by setting a breakpoint in the kernel
2477 		 * (if ddb was enabled).  Thus, we need to check to make sure
2478 		 * that no breakpoints are being enabled for addresses outside
2479 		 * process's address space.
2480 		 *
2481 		 * XXX - what about when the watched area of the user's
2482 		 * address space is written into from within the kernel
2483 		 * ... wouldn't that still cause a breakpoint to be generated
2484 		 * from within kernel mode?
2485 		 */
2486 
2487 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
2488 			/* dr0 is enabled */
2489 			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
2490 				return (EINVAL);
2491 		}
2492 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
2493 			/* dr1 is enabled */
2494 			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
2495 				return (EINVAL);
2496 		}
2497 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
2498 			/* dr2 is enabled */
2499 			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
2500 				return (EINVAL);
2501 		}
2502 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
2503 			/* dr3 is enabled */
2504 			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
2505 				return (EINVAL);
2506 		}
2507 
2508 		pcb->pcb_dr0 = dbregs->dr[0];
2509 		pcb->pcb_dr1 = dbregs->dr[1];
2510 		pcb->pcb_dr2 = dbregs->dr[2];
2511 		pcb->pcb_dr3 = dbregs->dr[3];
2512 		pcb->pcb_dr6 = dbregs->dr[6];
2513 		pcb->pcb_dr7 = dbregs->dr[7];
2514 
2515 		set_pcb_flags(pcb, PCB_DBREGS);
2516 	}
2517 
2518 	return (0);
2519 }
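
/*
 * For example, a debugger arming dr0 as a 4-byte read/write watchpoint
 * through ptrace(PT_SETDBREGS) would pass something like the following
 * (a sketch using the DBREG_DR7_* macros from machine/reg.h):
 *
 *	dbr.dr[0] = (unsigned long)addr;	must be < VM_MAXUSER_ADDRESS
 *	dbr.dr[7] = DBREG_DR7_SET(0, DBREG_DR7_LEN_4, DBREG_DR7_RDWR,
 *	    DBREG_DR7_LOCAL_ENABLE);
 *
 * which satisfies the access, length and address checks above.
 */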
2520 
2521 void
2522 reset_dbregs(void)
2523 {
2524 
2525 	load_dr7(0);	/* Turn off the control bits first */
2526 	load_dr0(0);
2527 	load_dr1(0);
2528 	load_dr2(0);
2529 	load_dr3(0);
2530 	load_dr6(0);
2531 }
2532 
2533 /*
2534  * Return > 0 if a hardware breakpoint has been hit, and the
2535  * breakpoint was in user space.  Return 0, otherwise.
2536  */
2537 int
2538 user_dbreg_trap(register_t dr6)
2539 {
2540 	u_int64_t dr7;
2541 	u_int64_t bp;	/* breakpoint bits extracted from dr6 */
2542 	int nbp;	/* number of breakpoints that triggered */
2543 	caddr_t addr[4];	/* breakpoint addresses */
2544 	int i;
2545 
2546 	bp = dr6 & DBREG_DR6_BMASK;
2547 	if (bp == 0) {
2548 		/*
2549 		 * None of the breakpoint bits are set, meaning this
2550 		 * trap was not caused by any of the debug registers.
2551 		 */
2552 		return (0);
2553 	}
2554 
2555 	dr7 = rdr7();
2556 	if ((dr7 & 0x000000ff) == 0) {
2557 		/*
2558 		 * All of the local and global enable bits in dr7 are
2559 		 * zero, thus the trap couldn't have been caused by the
2560 		 * hardware debug registers.
2561 		 */
2562 		return (0);
2563 	}
2564 
2565 	nbp = 0;
2566 
2567 	/*
2568 	 * At least one of the breakpoints was hit; check to see
2569 	 * which ones and whether any of them are user space addresses.
2570 	 */
2571 
2572 	if (bp & 0x01) {
2573 		addr[nbp++] = (caddr_t)rdr0();
2574 	}
2575 	if (bp & 0x02) {
2576 		addr[nbp++] = (caddr_t)rdr1();
2577 	}
2578 	if (bp & 0x04) {
2579 		addr[nbp++] = (caddr_t)rdr2();
2580 	}
2581 	if (bp & 0x08) {
2582 		addr[nbp++] = (caddr_t)rdr3();
2583 	}
2584 
2585 	for (i = 0; i < nbp; i++) {
2586 		if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
2587 			/*
2588 			 * addr[i] is in user space.
2589 			 */
2590 			return (nbp);
2591 		}
2592 	}
2593 
2594 	/*
2595 	 * None of the breakpoints are in user space.
2596 	 */
2597 	return (0);
2598 }
2599 
2600 /*
2601  * pcb_flags is only modified by the current thread, or by other threads
2602  * when the current thread is stopped.  However, the current thread may
2603  * change it from interrupt context in cpu_switch(), or in the trap
2604  * handler.  When we read-modify-write pcb_flags from C sources, the
2605  * compiler may generate code that is not atomic with respect to the
2606  * interrupt handler.  If a trap or interrupt happens and any flag is
2607  * modified from the handler, it can be clobbered with the cached value
2608  * later.  Therefore, we set and clear flags with single-instruction
2609  * functions that cannot race with the trap or interrupt context, since
2610  * traps and interrupts are taken only on instruction boundaries.
2611  */
2612 void
2613 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
2614 {
2615 
2616 	__asm __volatile("orl %1,%0"
2617 	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
2618 	    : "cc", "memory");
2620 }
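
/*
 * To illustrate the hazard described above: a plain C update such as
 *
 *	pcb->pcb_flags |= flags;
 *
 * may compile to separate load, or and store instructions.  An
 * interrupt taken between the load and the store that modifies another
 * flag would have its update overwritten by the stale stored value.
 * The single "orl" cannot be split this way, since interrupts are
 * taken only on instruction boundaries.
 */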
2621 
2622 /*
2623  * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
2624  * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
2625  * pcb if user space modified the bases.  We must save on the context
2626  * switch or if the return to usermode happens through the doreti.
2627  *
2628  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
2629  * which have a consequence that the base MSRs must be saved each time
2630  * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
2631  * context switches.
2632  */
2633 static void
2634 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
2635 {
2636 	register_t r;
2637 
2638 	if (curpcb == pcb &&
2639 	    (flags & PCB_FULL_IRET) != 0 &&
2640 	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
2641 		r = intr_disable();
2642 		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
2643 			if (rfs() == _ufssel)
2644 				pcb->pcb_fsbase = rdfsbase();
2645 			if (rgs() == _ugssel)
2646 				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
2647 		}
2648 		set_pcb_flags_raw(pcb, flags);
2649 		intr_restore(r);
2650 	} else {
2651 		set_pcb_flags_raw(pcb, flags);
2652 	}
2653 }
2654 
2655 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int), static)
2656 {
2657 
2658 	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
2659 	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
2660 }
2661 
2662 void
2663 clear_pcb_flags(struct pcb *pcb, const u_int flags)
2664 {
2665 
2666 	__asm __volatile("andl %1,%0"
2667 	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
2668 	    : "cc", "memory");
2669 }
2670 
2671 #ifdef KDB
2672 
2673 /*
2674  * Provide inb() and outb() as functions.  They are normally only available as
2675  * inline functions, thus cannot be called from the debugger.
2676  */
2677 
2678 /* silence compiler warnings */
2679 u_char inb_(u_short);
2680 void outb_(u_short, u_char);
2681 
2682 u_char
2683 inb_(u_short port)
2684 {
2685 	return inb(port);
2686 }
2687 
2688 void
2689 outb_(u_short port, u_char data)
2690 {
2691 	outb(port, data);
2692 }
2693 
2694 #endif /* KDB */
2695 
2696 #undef memset
2697 #undef memmove
2698 #undef memcpy
2699 
2700 void	*memset_std(void *buf, int c, size_t len);
2701 void	*memset_erms(void *buf, int c, size_t len);
2702 DEFINE_IFUNC(, void *, memset, (void *, int, size_t), static)
2703 {
2704 
2705 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2706 	    memset_erms : memset_std);
2707 }
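
/*
 * The resolver above runs once while the kernel is initializing, and
 * every memset() call site is then bound to the chosen implementation,
 * so there is no per-call branch.  memset_erms uses "rep stosb", which
 * is fast when CPUID reports ERMS (Enhanced REP MOVSB/STOSB);
 * memset_std is the generic fallback.  The memmove, memcpy and
 * pagezero ifuncs below are selected the same way.
 */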
2708 
2709 void    *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
2710 	    size_t len);
2711 void    *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
2712 	    size_t len);
2713 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
2714     size_t), static)
2715 {
2716 
2717 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2718 	    memmove_erms : memmove_std);
2719 }
2720 
2721 void    *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
2722 	    size_t len);
2723 void    *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
2724 	    size_t len);
2725 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,
2726     size_t), static)
2727 {
2728 
2729 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2730 	    memcpy_erms : memcpy_std);
2731 }
2732 
2733 void	pagezero_std(void *addr);
2734 void	pagezero_erms(void *addr);
2735 DEFINE_IFUNC(, void, pagezero, (void *), static)
2736 {
2737 
2738 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2739 	    pagezero_erms : pagezero_std);
2740 }
2741