/*-
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_atpic.h"
#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_platform.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>

#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif

#include <net/netisr.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#include <machine/tss.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef FDT
#include <x86/fdt.h>
#endif

#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <x86/apicvar.h>
#endif

#include <isa/isareg.h>
#include <isa/rtc.h>
#include <x86/init.h>

/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
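
/*
 * For illustration, a minimal sketch of what EFL_SECURE() rejects: a
 * context that tries to flip a bit outside PSL_USERCHANGE (here PSL_IOPL,
 * with hypothetical values) fails the check, while user-changeable bits
 * such as PSL_C may differ freely.
 */
#if 0	/* Sketch only; not compiled. */
static int
example_rflags_check(register_t cur_rflags)
{
	register_t forged;

	forged = cur_rflags | PSL_IOPL;	/* try to raise the IOPL */
	/* 0 when IOPL was clear: the privileged change is refused. */
	return (EFL_SECURE(forged, cur_rflags));
}
#endif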

static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpusave, size_t xfpusave_len);
static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpustate, size_t xfpustate_len);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/* Default init_ops implementation. */
struct init_ops init_ops = {
	.parse_preload_data =	native_parse_preload_data,
	.early_clock_source_init =	i8254_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
#ifdef SMP
	.mp_bootaddress =		mp_bootaddress,
	.start_all_aps =		native_start_all_aps,
#endif
	.msi_init =			msi_init,
};
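
/*
 * Alternative platforms can point these hooks elsewhere before
 * hammer_time() uses them.  A hypothetical paravirtualized platform
 * (the example_pv_* names below are illustrative, not an existing API)
 * might keep the native preload parser but replace the early clock and
 * memory-map callbacks:
 */
#if 0	/* Hypothetical override; sketch only. */
static struct init_ops example_pv_init_ops = {
	.parse_preload_data =	native_parse_preload_data,
	.early_clock_source_init = example_pv_clock_init,	/* hypothetical */
	.early_delay =		example_pv_delay,		/* hypothetical */
	.parse_memmap =		example_pv_parse_memmap,	/* hypothetical */
	.msi_init =		msi_init,
};
#endif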

/*
 * The file "conf/ldscript.amd64" defines the symbol "kernphys".  Its value is
 * the physical address at which the kernel is loaded.
 */
extern char kernphys[];

struct msgbuf *msgbufp;

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;

/* Intel ICH registers */
#define ICH_PMBASE	0x400
#define ICH_SMI_EN	(ICH_PMBASE + 0x30)

int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

int cold = 1;

long Maxmem = 0;
long realmem = 0;

/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))

vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* Must be 2 less so that a pair of zero entries can mark the end of chunks. */
#define	PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
#define	DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)
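
/*
 * Both arrays hold (start, end) pairs and are terminated by that pair of
 * zeroes, which is why two trailing slots are reserved.  A consumer walks
 * them as below (the chunk-printing loop in cpu_startup() follows the
 * same pattern):
 */
#if 0	/* Sketch of the pair-list convention. */
	int i;

	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
		/* phys_avail[i] is the inclusive start of the chunk; */
		/* phys_avail[i + 1] is the exclusive end. */
	}
#endif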

struct kva_md_info kmi;

static struct trapframe proc0_tf;
struct region_descriptor r_gdt, r_idt;

struct pcpu __pcpu[MAXCPU];

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

void (*vmm_resume_p)(void);

static void
cpu_startup(dummy)
	void *dummy;
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to prevent the legacy USB circuit from
	 * generating an SMI#, because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();

	/*
	 * Display physical memory if SMBIOS reports a reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_cnt.v_free_count),
	    ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

/*
 * Send an interrupt to a process.
 *
 * The stack is set up to allow the sigcode stored at the top to call the
 * handler, followed by a call to the sigreturn routine below.  After
 * sigreturn resets the signal mask, the stack, and the frame pointer, it
 * returns to the user-specified pc and psl.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		sp = (char *)regs->tf_rsp - 128;
	if (xfpusave != NULL) {
		sp -= xfpusave_len;
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	set_pcb_flags(pcb, PCB_FULL_IRET);
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
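
/*
 * From userland, the frame built above is what an SA_SIGINFO handler
 * receives: sendsig() placed arg 1 (signal number) in %rdi, arg 2 (siginfo)
 * in %rsi and arg 3 (ucontext) in %rdx, per the SysV AMD64 calling
 * convention.  A minimal user-mode consumer, for illustration:
 */
#if 0	/* Userland sketch; builds in a normal userland environment. */
#include <signal.h>
#include <ucontext.h>

static void
handler(int sig, siginfo_t *si, void *ucp)
{
	ucontext_t *uc = ucp;		/* the sf_uc copied out by sendsig() */

	(void)uc->uc_mcontext.mc_rip;	/* interrupted %rip */
}

static void
install(void)
{
	struct sigaction sa;

	sa.sa_sigaction = handler;
	sa.sa_flags = SA_SIGINFO;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGSEGV, &sa, NULL);
}
#endif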

/*
 * System call to clean up state after a signal
 * has been taken.  Reset the signal mask and
 * stack state from the context left by sendsig (above).
 * Return to the previous pc and psl as specified by
 * the context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
int
sys_sigreturn(td, uap)
	struct thread *td;
	struct sigreturn_args /* {
		const struct __ucontext *sigcntxp;
	} */ *uap;
{
	ucontext_t uc;
	struct pcb *pcb;
	struct proc *p;
	struct trapframe *regs;
	ucontext_t *ucp;
	char *xfpustate;
	size_t xfpustate_len;
	long rflags;
	int cs, error, ret;
	ksiginfo_t ksi;

	pcb = td->td_pcb;
	p = td->td_proc;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0) {
		uprintf("pid %d (%s): sigreturn copyin failed\n",
		    p->p_pid, td->td_name);
		return (error);
	}
	ucp = &uc;
	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
		    td->td_name, ucp->uc_mcontext.mc_flags);
		return (EINVAL);
	}
	regs = td->td_frame;
	rflags = ucp->uc_mcontext.mc_rflags;
	/*
	 * Don't allow users to change privileged or reserved flags.
	 */
	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
		    td->td_name, rflags);
		return (EINVAL);
	}

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %rip's and invalid %rsp's.
	 */
	cs = ucp->uc_mcontext.mc_cs;
	if (!CS_SECURE(cs)) {
		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
		    td->td_name, cs);
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_rip;
		trapsignal(td, &ksi);
		return (EINVAL);
	}

	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
		if (xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu)) {
			uprintf("pid %d (%s): sigreturn xfpustate_len = 0x%zx\n",
			    p->p_pid, td->td_name, xfpustate_len);
			return (EINVAL);
		}
		xfpustate = __builtin_alloca(xfpustate_len);
		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
		    xfpustate, xfpustate_len);
		if (error != 0) {
			uprintf(
	"pid %d (%s): sigreturn copying xfpustate failed\n",
			    p->p_pid, td->td_name);
			return (error);
		}
	} else {
		xfpustate = NULL;
		xfpustate_len = 0;
	}
	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
	if (ret != 0) {
		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
		    p->p_pid, td->td_name, ret);
		return (ret);
	}
	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;

#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
	set_pcb_flags(pcb, PCB_FULL_IRET);
	return (EJUSTRETURN);
}
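
/*
 * Userland normally reaches sys_sigreturn() only through the signal
 * trampoline, but the checks above are what stop direct misuse.  For
 * example, a forged context carrying a kernel %cs is refused with EINVAL
 * plus a SIGBUS rather than granting privilege (user-mode sketch; the
 * selector value is hypothetical):
 */
#if 0	/* Userland sketch; sigreturn(2) takes the saved ucontext. */
#include <signal.h>
#include <ucontext.h>

static void
forged_sigreturn(ucontext_t *uc)
{
	uc->uc_mcontext.mc_cs = 0x20;	/* ring-0 selector: fails CS_SECURE */
	sigreturn(uc);			/* process gets SIGBUS instead */
}
#endif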

#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{

	return sys_sigreturn(td, (struct sigreturn_args *)uap);
}
#endif

/*
 * Reset registers to default values on exec.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
	struct trapframe *regs = td->td_frame;
	struct pcb *pcb = td->td_pcb;

	mtx_lock(&dt_lock);
	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);
	else
		mtx_unlock(&dt_lock);

	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	clear_pcb_flags(pcb, PCB_32BIT);
	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
	set_pcb_flags(pcb, PCB_FULL_IRET);

	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = imgp->entry_addr;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	td->td_retval[1] = 0;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == curpcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		clear_pcb_flags(pcb, PCB_DBREGS);
	}

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by fpuinit() for the
	 * BSP.  See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

static char dblfault_stack[PAGE_SIZE] __aligned(16);

static char nmi0_stack[PAGE_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

struct amd64tss common_tss[MAXCPU];

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Actually, the TSS is a system descriptor which is double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
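
/*
 * The slot numbers in the comments above are what GSEL() packs into a
 * selector value: (index << 3) | RPL.  Two worked examples:
 *
 *	GSEL(GUCODE_SEL, SEL_UPL) == (8 << 3) | 3 == 0x43	(user %cs)
 *	GSEL(GCODE_SEL, SEL_KPL)  == (4 << 3) | 0 == 0x20	(kernel %cs)
 *
 * hammer_time() loads the former into _ucodesel below.
 */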

void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func) >> 16;
}

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif

void
sdtossd(sd, ssd)
	struct user_segment_descriptor *sd;
	struct soft_segment_descriptor *ssd;
{

	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_long  = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

void
ssdtosd(ssd, sd)
	struct soft_segment_descriptor *ssd;
	struct user_segment_descriptor *sd;
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;
}
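
/*
 * sdtossd() and ssdtosd() are inverses over the fields they share; a
 * round-trip through the hardware format recovers the soft descriptor.
 * A sketch of that property:
 */
#if 0	/* Illustrative round-trip; not compiled. */
static void
example_descriptor_roundtrip(struct soft_segment_descriptor *ssd)
{
	struct user_segment_descriptor sd;
	struct soft_segment_descriptor back;

	ssdtosd(ssd, &sd);
	sdtossd(&sd, &back);
	/*
	 * back now matches *ssd for base, limit, type, dpl, p, long,
	 * def32 and gran.
	 */
}
#endif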

void
ssdtosyssd(ssd, sd)
	struct soft_segment_descriptor *ssd;
	struct system_segment_descriptor *sd;
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
}

#if !defined(DEV_ATPIC) && defined(DEV_ISA)
#include <isa/isavar.h>
#include <isa/isareg.h>
/*
 * Return a bitmap of the current interrupt requests.  This is 8259-specific
 * and is only suitable for use at probe time.
 * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
 * It shouldn't be here.  There should probably be an APIC centric
 * implementation in the apic driver code, if at all.
 */
intrmask_t
isa_irq_pending(void)
{
	u_char irr1;
	u_char irr2;

	irr1 = inb(IO_ICU1);
	irr2 = inb(IO_ICU2);
	return ((irr2 << 8) | irr1);
}
#endif

u_int basemem;

static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYSMAP_SIZE) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}
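
/*
 * A worked example of the append case above, with hypothetical addresses:
 * two back-to-back 1 MB regions coalesce into one physmap pair instead of
 * consuming a second slot.
 */
#if 0	/* Hypothetical usage; sketch only. */
	vm_paddr_t physmap[PHYSMAP_SIZE];
	int idx = 0;

	bzero(physmap, sizeof(physmap));
	add_physmap_entry(0x100000, 0x100000, physmap, &idx);
	/* physmap[0..1] = { 0x100000, 0x200000 }, idx == 2 */
	add_physmap_entry(0x200000, 0x100000, physmap, &idx);
	/*
	 * base == physmap[idx - 1], so the previous entry is extended:
	 * physmap[0..1] = { 0x100000, 0x300000 }, idx still 2.
	 */
#endif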

void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
                      vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}
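
/*
 * Note that the loop above strides by efihdr->descriptor_size rather than
 * sizeof(struct efi_md): firmware may append vendor fields to each
 * descriptor, so the stride must come from the map header.  In effect,
 * efi_next_descriptor(p, size) advances as below:
 */
#if 0	/* Effect of efi_next_descriptor(); sketch only. */
	p = (struct efi_md *)((uint8_t *)p + efihdr->descriptor_size);
#endif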

static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");

static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * i.e., an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it\n");
		basemem = 640;
	}

	/*
	 * Make hole for "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and the APs start in 32-bit
	 * mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress) {
		if (physmap[1] >= 0x100000000)
			panic(
	"Basemem segment is not suitable for AP bootstrap code!");
		physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024);
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write, non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that "end" points one page beyond the last
			 * valid page, making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we're at or past the end, bump up Maxmem
			 * so that we keep going. The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}

static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	struct nmi_pcpu *np;
	struct xstate_hdr *xhdr;
	u_int64_t msr;
	char *env;
	size_t kstack0_sz;
	int late_console;

	/*
	 * This may be done better later if it gets more high-level
	 * components in it. If so, just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	kmdp = init_ops.parse_preload_data(modulep);

	/* Init basic tunables, hz etc */
	init_param1();

	thread0.td_kstack = physfree + KERNBASE;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long) gdt;
	lgdt(&r_gdt);
	pc = &__pcpu[0];

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	pcpu_init(pc, 0, sizeof(struct pcpu));
	dpcpu_init((void *)(physfree + KERNBASE), 0);
	physfree += DPCPU_SIZE;
	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);
	/* Non-late cninit() and printf() can be moved up to here. */
	PCPU_SET(tssp, &common_tss[0]);
	PCPU_SET(commontssp, &common_tss[0]);
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_DE, &IDTVEC(div),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, &IDTVEC(bpt),  SDT_SYSIGT, SEL_UPL, 0);
	setidt(IDT_OF, &IDTVEC(ofl),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_BR, &IDTVEC(bnd),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_UD, &IDTVEC(ill),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_NM, &IDTVEC(dna),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, &IDTVEC(fpusegm),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, &IDTVEC(tss),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_NP, &IDTVEC(missing),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, &IDTVEC(stk),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_GP, &IDTVEC(prot),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_PF, &IDTVEC(page),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_MF, &IDTVEC(fpu),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk),  SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_UPL, 0);
#endif

	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once the bootblocks have been updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
	    != NULL)
		vty_set_preferred(VTY_VT);

	identify_cpu();		/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */
	initializecpucache();

	/* doublefault stack space, runs on ist1 */
	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist2 = (long) np;

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);

	/*
	 * Temporarily forge a valid pointer to the PCB for the exception
	 * handlers.  It is reinitialized properly below after the FPU is
	 * set up.  Also set up td_critnest to short-cut the page
	 * fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* Now running on new page tables, configured, and u/iom is accessible. */

	if (late_console)
		cninit();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?"
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/*
	 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
	 * area size.  Zero out the extended state header in fpu save
	 * area.
	 */
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
	if (use_xsave) {
		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
		    1);
		xhdr->xstate_bv = xsave_mask;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	common_tss[0].tss_rsp0 = (vm_offset_t)thread0.td_pcb;
	/* Ensure the stack is aligned to 16 bytes */
	common_tss[0].tss_rsp0 &= ~0xFul;
	PCPU_SET(rsp0, common_tss[0].tss_rsp0);
	PCPU_SET(curpcb, thread0.td_pcb);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_frame = &proc0_tf;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	cpu_probe_amdc1e();

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}
1781 
1782 void
1783 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1784 {
1785 
1786 	pcpu->pc_acpi_id = 0xffffffff;
1787 }
1788 
1789 static int
1790 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1791 {
1792 	struct bios_smap *smapbase;
1793 	struct bios_smap_xattr smap;
1794 	caddr_t kmdp;
1795 	uint32_t *smapattr;
1796 	int count, error, i;
1797 
1798 	/* Retrieve the system memory map from the loader. */
1799 	kmdp = preload_search_by_type("elf kernel");
1800 	if (kmdp == NULL)
1801 		kmdp = preload_search_by_type("elf64 kernel");
1802 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
1803 	    MODINFO_METADATA | MODINFOMD_SMAP);
1804 	if (smapbase == NULL)
1805 		return (0);
1806 	smapattr = (uint32_t *)preload_search_info(kmdp,
1807 	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1808 	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1809 	error = 0;
1810 	for (i = 0; i < count; i++) {
1811 		smap.base = smapbase[i].base;
1812 		smap.length = smapbase[i].length;
1813 		smap.type = smapbase[i].type;
1814 		if (smapattr != NULL)
1815 			smap.xattr = smapattr[i];
1816 		else
1817 			smap.xattr = 0;
1818 		error = SYSCTL_OUT(req, &smap, sizeof(smap));
1819 	}
1820 	return (error);
1821 }
1822 SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1823     smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
1824 
1825 static int
1826 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1827 {
1828 	struct efi_map_header *efihdr;
1829 	caddr_t kmdp;
1830 	uint32_t efisize;
1831 
1832 	kmdp = preload_search_by_type("elf kernel");
1833 	if (kmdp == NULL)
1834 		kmdp = preload_search_by_type("elf64 kernel");
1835 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1836 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1837 	if (efihdr == NULL)
1838 		return (0);
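	/* As with the SMAP data, the payload is preceded by its 32-bit size. */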
1839 	efisize = *((uint32_t *)efihdr - 1);
1840 	return (SYSCTL_OUT(req, efihdr, efisize));
1841 }
1842 SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1843     efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
1844 
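/*
 * Spinlock sections nest via a per-thread count: only the outermost
 * spinlock_enter() disables interrupts (saving the previous flags), and
 * only the matching outermost spinlock_exit() restores them.
 */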
1845 void
1846 spinlock_enter(void)
1847 {
1848 	struct thread *td;
1849 	register_t flags;
1850 
1851 	td = curthread;
1852 	if (td->td_md.md_spinlock_count == 0) {
1853 		flags = intr_disable();
1854 		td->td_md.md_spinlock_count = 1;
1855 		td->td_md.md_saved_flags = flags;
1856 	} else
1857 		td->td_md.md_spinlock_count++;
1858 	critical_enter();
1859 }
1860 
1861 void
1862 spinlock_exit(void)
1863 {
1864 	struct thread *td;
1865 	register_t flags;
1866 
1867 	td = curthread;
1868 	critical_exit();
1869 	flags = td->td_md.md_saved_flags;
1870 	td->td_md.md_spinlock_count--;
1871 	if (td->td_md.md_spinlock_count == 0)
1872 		intr_restore(flags);
1873 }
1874 
1875 /*
1876  * Construct a PCB from a trapframe. This is called from kdb_trap() where
1877  * we want to start a backtrace from the function that caused us to enter
1878  * the debugger. We have the context in the trapframe, but base the trace
1879  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1880  * enough for a backtrace.
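 * Only the callee-saved registers of the amd64 ABI (plus %rip and %rsp)
 * matter here; the caller-saved registers are not needed to walk the
 * stack.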
1881  */
1882 void
1883 makectx(struct trapframe *tf, struct pcb *pcb)
1884 {
1885 
1886 	pcb->pcb_r12 = tf->tf_r12;
1887 	pcb->pcb_r13 = tf->tf_r13;
1888 	pcb->pcb_r14 = tf->tf_r14;
1889 	pcb->pcb_r15 = tf->tf_r15;
1890 	pcb->pcb_rbp = tf->tf_rbp;
1891 	pcb->pcb_rbx = tf->tf_rbx;
1892 	pcb->pcb_rip = tf->tf_rip;
1893 	pcb->pcb_rsp = tf->tf_rsp;
1894 }
1895 
1896 int
1897 ptrace_set_pc(struct thread *td, unsigned long addr)
1898 {
1899 
1900 	td->td_frame->tf_rip = addr;
1901 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
1902 	return (0);
1903 }
1904 
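/*
 * Single stepping relies on the x86 trap flag (PSL_T): with it set in the
 * saved rflags, the CPU delivers a debug exception after the next user
 * instruction completes.
 */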
1905 int
1906 ptrace_single_step(struct thread *td)
1907 {
1908 	td->td_frame->tf_rflags |= PSL_T;
1909 	return (0);
1910 }
1911 
1912 int
1913 ptrace_clear_single_step(struct thread *td)
1914 {
1915 	td->td_frame->tf_rflags &= ~PSL_T;
1916 	return (0);
1917 }
1918 
1919 int
1920 fill_regs(struct thread *td, struct reg *regs)
1921 {
1922 	struct trapframe *tp;
1923 
1924 	tp = td->td_frame;
1925 	return (fill_frame_regs(tp, regs));
1926 }
1927 
1928 int
1929 fill_frame_regs(struct trapframe *tp, struct reg *regs)
1930 {
1931 	regs->r_r15 = tp->tf_r15;
1932 	regs->r_r14 = tp->tf_r14;
1933 	regs->r_r13 = tp->tf_r13;
1934 	regs->r_r12 = tp->tf_r12;
1935 	regs->r_r11 = tp->tf_r11;
1936 	regs->r_r10 = tp->tf_r10;
1937 	regs->r_r9  = tp->tf_r9;
1938 	regs->r_r8  = tp->tf_r8;
1939 	regs->r_rdi = tp->tf_rdi;
1940 	regs->r_rsi = tp->tf_rsi;
1941 	regs->r_rbp = tp->tf_rbp;
1942 	regs->r_rbx = tp->tf_rbx;
1943 	regs->r_rdx = tp->tf_rdx;
1944 	regs->r_rcx = tp->tf_rcx;
1945 	regs->r_rax = tp->tf_rax;
1946 	regs->r_rip = tp->tf_rip;
1947 	regs->r_cs = tp->tf_cs;
1948 	regs->r_rflags = tp->tf_rflags;
1949 	regs->r_rsp = tp->tf_rsp;
1950 	regs->r_ss = tp->tf_ss;
1951 	if (tp->tf_flags & TF_HASSEGS) {
1952 		regs->r_ds = tp->tf_ds;
1953 		regs->r_es = tp->tf_es;
1954 		regs->r_fs = tp->tf_fs;
1955 		regs->r_gs = tp->tf_gs;
1956 	} else {
1957 		regs->r_ds = 0;
1958 		regs->r_es = 0;
1959 		regs->r_fs = 0;
1960 		regs->r_gs = 0;
1961 	}
1962 	return (0);
1963 }
1964 
1965 int
1966 set_regs(struct thread *td, struct reg *regs)
1967 {
1968 	struct trapframe *tp;
1969 	register_t rflags;
1970 
1971 	tp = td->td_frame;
1972 	rflags = regs->r_rflags & 0xffffffff;
1973 	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
1974 		return (EINVAL);
1975 	tp->tf_r15 = regs->r_r15;
1976 	tp->tf_r14 = regs->r_r14;
1977 	tp->tf_r13 = regs->r_r13;
1978 	tp->tf_r12 = regs->r_r12;
1979 	tp->tf_r11 = regs->r_r11;
1980 	tp->tf_r10 = regs->r_r10;
1981 	tp->tf_r9  = regs->r_r9;
1982 	tp->tf_r8  = regs->r_r8;
1983 	tp->tf_rdi = regs->r_rdi;
1984 	tp->tf_rsi = regs->r_rsi;
1985 	tp->tf_rbp = regs->r_rbp;
1986 	tp->tf_rbx = regs->r_rbx;
1987 	tp->tf_rdx = regs->r_rdx;
1988 	tp->tf_rcx = regs->r_rcx;
1989 	tp->tf_rax = regs->r_rax;
1990 	tp->tf_rip = regs->r_rip;
1991 	tp->tf_cs = regs->r_cs;
1992 	tp->tf_rflags = rflags;
1993 	tp->tf_rsp = regs->r_rsp;
1994 	tp->tf_ss = regs->r_ss;
1995 	if (0) {	/* XXXKIB */
1996 		tp->tf_ds = regs->r_ds;
1997 		tp->tf_es = regs->r_es;
1998 		tp->tf_fs = regs->r_fs;
1999 		tp->tf_gs = regs->r_gs;
2000 		tp->tf_flags = TF_HASSEGS;
2001 	}
2002 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
2003 	return (0);
2004 }
2005 
2006 /* XXX check all this stuff! */
2007 /* externalize from sv_xmm */
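/*
 * In the fxsave layout each x87 register occupies a 16-byte slot but only
 * holds an 80-bit (10-byte) value, hence the 10-byte copies below.
 */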
2008 static void
2009 fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
2010 {
2011 	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2012 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
2013 	int i;
2014 
2015 	/* pcb -> fpregs */
2016 	bzero(fpregs, sizeof(*fpregs));
2017 
2018 	/* FPU control/status */
2019 	penv_fpreg->en_cw = penv_xmm->en_cw;
2020 	penv_fpreg->en_sw = penv_xmm->en_sw;
2021 	penv_fpreg->en_tw = penv_xmm->en_tw;
2022 	penv_fpreg->en_opcode = penv_xmm->en_opcode;
2023 	penv_fpreg->en_rip = penv_xmm->en_rip;
2024 	penv_fpreg->en_rdp = penv_xmm->en_rdp;
2025 	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
2026 	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
2027 
2028 	/* FPU registers */
2029 	for (i = 0; i < 8; ++i)
2030 		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
2031 
2032 	/* SSE registers */
2033 	for (i = 0; i < 16; ++i)
2034 		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
2035 }
2036 
2037 /* internalize from fpregs into sv_xmm */
2038 static void
2039 set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
2040 {
2041 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
2042 	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2043 	int i;
2044 
2045 	/* fpregs -> pcb */
2046 	/* FPU control/status */
2047 	penv_xmm->en_cw = penv_fpreg->en_cw;
2048 	penv_xmm->en_sw = penv_fpreg->en_sw;
2049 	penv_xmm->en_tw = penv_fpreg->en_tw;
2050 	penv_xmm->en_opcode = penv_fpreg->en_opcode;
2051 	penv_xmm->en_rip = penv_fpreg->en_rip;
2052 	penv_xmm->en_rdp = penv_fpreg->en_rdp;
2053 	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
2054 	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
2055 
2056 	/* FPU registers */
2057 	for (i = 0; i < 8; ++i)
2058 		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
2059 
2060 	/* SSE registers */
2061 	for (i = 0; i < 16; ++i)
2062 		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
2063 }
2064 
2065 /* externalize from td->pcb */
2066 int
2067 fill_fpregs(struct thread *td, struct fpreg *fpregs)
2068 {
2069 
2070 	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
2071 	    P_SHOULDSTOP(td->td_proc),
2072 	    ("not suspended thread %p", td));
2073 	fpugetregs(td);
2074 	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
2075 	return (0);
2076 }
2077 
2078 /* internalize to td->pcb */
2079 int
2080 set_fpregs(struct thread *td, struct fpreg *fpregs)
2081 {
2082 
2083 	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
2084 	fpuuserinited(td);
2085 	return (0);
2086 }
2087 
2088 /*
2089  * Get machine context.
2090  */
2091 int
2092 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
2093 {
2094 	struct pcb *pcb;
2095 	struct trapframe *tp;
2096 
2097 	pcb = td->td_pcb;
2098 	tp = td->td_frame;
2099 	PROC_LOCK(curthread->td_proc);
2100 	mcp->mc_onstack = sigonstack(tp->tf_rsp);
2101 	PROC_UNLOCK(curthread->td_proc);
2102 	mcp->mc_r15 = tp->tf_r15;
2103 	mcp->mc_r14 = tp->tf_r14;
2104 	mcp->mc_r13 = tp->tf_r13;
2105 	mcp->mc_r12 = tp->tf_r12;
2106 	mcp->mc_r11 = tp->tf_r11;
2107 	mcp->mc_r10 = tp->tf_r10;
2108 	mcp->mc_r9  = tp->tf_r9;
2109 	mcp->mc_r8  = tp->tf_r8;
2110 	mcp->mc_rdi = tp->tf_rdi;
2111 	mcp->mc_rsi = tp->tf_rsi;
2112 	mcp->mc_rbp = tp->tf_rbp;
2113 	mcp->mc_rbx = tp->tf_rbx;
2114 	mcp->mc_rcx = tp->tf_rcx;
2115 	mcp->mc_rflags = tp->tf_rflags;
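	/*
	 * GET_MC_CLEAR_RET is used when the caller (e.g. getcontext(2))
	 * wants the resumed context to observe a successful return:
	 * rax/rdx are cleared and the carry flag, which signals a syscall
	 * error, is turned off.
	 */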
2116 	if (flags & GET_MC_CLEAR_RET) {
2117 		mcp->mc_rax = 0;
2118 		mcp->mc_rdx = 0;
2119 		mcp->mc_rflags &= ~PSL_C;
2120 	} else {
2121 		mcp->mc_rax = tp->tf_rax;
2122 		mcp->mc_rdx = tp->tf_rdx;
2123 	}
2124 	mcp->mc_rip = tp->tf_rip;
2125 	mcp->mc_cs = tp->tf_cs;
2126 	mcp->mc_rsp = tp->tf_rsp;
2127 	mcp->mc_ss = tp->tf_ss;
2128 	mcp->mc_ds = tp->tf_ds;
2129 	mcp->mc_es = tp->tf_es;
2130 	mcp->mc_fs = tp->tf_fs;
2131 	mcp->mc_gs = tp->tf_gs;
2132 	mcp->mc_flags = tp->tf_flags;
2133 	mcp->mc_len = sizeof(*mcp);
2134 	get_fpcontext(td, mcp, NULL, 0);
2135 	mcp->mc_fsbase = pcb->pcb_fsbase;
2136 	mcp->mc_gsbase = pcb->pcb_gsbase;
2137 	mcp->mc_xfpustate = 0;
2138 	mcp->mc_xfpustate_len = 0;
2139 	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
2140 	return (0);
2141 }
2142 
2143 /*
2144  * Set machine context.
2145  *
2146  * Only the user-modifiable rflags bits are honored, and the cs
2147  * selector is never touched.
2148  */
2149 int
2150 set_mcontext(struct thread *td, mcontext_t *mcp)
2151 {
2152 	struct pcb *pcb;
2153 	struct trapframe *tp;
2154 	char *xfpustate;
2155 	long rflags;
2156 	int ret;
2157 
2158 	pcb = td->td_pcb;
2159 	tp = td->td_frame;
2160 	if (mcp->mc_len != sizeof(*mcp) ||
2161 	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
2162 		return (EINVAL);
2163 	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
2164 	    (tp->tf_rflags & ~PSL_USERCHANGE);
2165 	if (mcp->mc_flags & _MC_HASFPXSTATE) {
2166 		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
2167 		    sizeof(struct savefpu))
2168 			return (EINVAL);
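		/*
		 * The extended state size is CPU-dependent; stage the
		 * user's copy on the kernel stack, bounded by the length
		 * check above.
		 */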
2169 		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
2170 		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
2171 		    mcp->mc_xfpustate_len);
2172 		if (ret != 0)
2173 			return (ret);
2174 	} else
2175 		xfpustate = NULL;
2176 	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
2177 	if (ret != 0)
2178 		return (ret);
2179 	tp->tf_r15 = mcp->mc_r15;
2180 	tp->tf_r14 = mcp->mc_r14;
2181 	tp->tf_r13 = mcp->mc_r13;
2182 	tp->tf_r12 = mcp->mc_r12;
2183 	tp->tf_r11 = mcp->mc_r11;
2184 	tp->tf_r10 = mcp->mc_r10;
2185 	tp->tf_r9  = mcp->mc_r9;
2186 	tp->tf_r8  = mcp->mc_r8;
2187 	tp->tf_rdi = mcp->mc_rdi;
2188 	tp->tf_rsi = mcp->mc_rsi;
2189 	tp->tf_rbp = mcp->mc_rbp;
2190 	tp->tf_rbx = mcp->mc_rbx;
2191 	tp->tf_rdx = mcp->mc_rdx;
2192 	tp->tf_rcx = mcp->mc_rcx;
2193 	tp->tf_rax = mcp->mc_rax;
2194 	tp->tf_rip = mcp->mc_rip;
2195 	tp->tf_rflags = rflags;
2196 	tp->tf_rsp = mcp->mc_rsp;
2197 	tp->tf_ss = mcp->mc_ss;
2198 	tp->tf_flags = mcp->mc_flags;
2199 	if (tp->tf_flags & TF_HASSEGS) {
2200 		tp->tf_ds = mcp->mc_ds;
2201 		tp->tf_es = mcp->mc_es;
2202 		tp->tf_fs = mcp->mc_fs;
2203 		tp->tf_gs = mcp->mc_gs;
2204 	}
2205 	if (mcp->mc_flags & _MC_HASBASES) {
2206 		pcb->pcb_fsbase = mcp->mc_fsbase;
2207 		pcb->pcb_gsbase = mcp->mc_gsbase;
2208 	}
2209 	set_pcb_flags(pcb, PCB_FULL_IRET);
2210 	return (0);
2211 }
2212 
2213 static void
2214 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
2215     size_t xfpusave_len)
2216 {
2217 	size_t max_len, len;
2218 
2219 	mcp->mc_ownedfp = fpugetregs(td);
2220 	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
2221 	    sizeof(mcp->mc_fpstate));
2222 	mcp->mc_fpformat = fpuformat();
2223 	if (!use_xsave || xfpusave_len == 0)
2224 		return;
2225 	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
2226 	len = xfpusave_len;
2227 	if (len > max_len) {
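		/* Zero the tail of the buffer that we cannot fill. */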
2228 		bzero(xfpusave + max_len, len - max_len);
2229 		len = max_len;
2230 	}
2231 	mcp->mc_flags |= _MC_HASFPXSTATE;
2232 	mcp->mc_xfpustate_len = len;
2233 	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
2234 }
2235 
2236 static int
2237 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
2238     size_t xfpustate_len)
2239 {
2240 	struct savefpu *fpstate;
2241 	int error;
2242 
2243 	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
2244 		return (0);
2245 	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
2246 		return (EINVAL);
2247 	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
2248 		/* We don't care what state is left in the FPU or PCB. */
2249 		fpstate_drop(td);
2250 		error = 0;
2251 	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
2252 	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
2253 		fpstate = (struct savefpu *)&mcp->mc_fpstate;
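		/*
		 * Clear any reserved MXCSR bits: loading them with
		 * fxrstor/xrstor would raise a #GP fault.
		 */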
2254 		fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask;
2255 		error = fpusetregs(td, fpstate, xfpustate, xfpustate_len);
2256 	} else
2257 		return (EINVAL);
2258 	return (error);
2259 }
2260 
2261 void
2262 fpstate_drop(struct thread *td)
2263 {
2264 
2265 	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
2266 	critical_enter();
2267 	if (PCPU_GET(fpcurthread) == td)
2268 		fpudrop();
2269 	/*
2270 	 * XXX force a full drop of the fpu.  The above only drops it if we
2271 	 * owned it.
2272 	 *
2273 	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
2274 	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
2275 	 * We only need to drop to !PCB_INITDONE in sendsig().  But
2276 	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
2277 	 * have too many layers.
2278 	 */
2279 	clear_pcb_flags(curthread->td_pcb,
2280 	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
2281 	critical_exit();
2282 }
2283 
2284 int
2285 fill_dbregs(struct thread *td, struct dbreg *dbregs)
2286 {
2287 	struct pcb *pcb;
2288 
2289 	if (td == NULL) {
2290 		dbregs->dr[0] = rdr0();
2291 		dbregs->dr[1] = rdr1();
2292 		dbregs->dr[2] = rdr2();
2293 		dbregs->dr[3] = rdr3();
2294 		dbregs->dr[6] = rdr6();
2295 		dbregs->dr[7] = rdr7();
2296 	} else {
2297 		pcb = td->td_pcb;
2298 		dbregs->dr[0] = pcb->pcb_dr0;
2299 		dbregs->dr[1] = pcb->pcb_dr1;
2300 		dbregs->dr[2] = pcb->pcb_dr2;
2301 		dbregs->dr[3] = pcb->pcb_dr3;
2302 		dbregs->dr[6] = pcb->pcb_dr6;
2303 		dbregs->dr[7] = pcb->pcb_dr7;
2304 	}
2305 	dbregs->dr[4] = 0;
2306 	dbregs->dr[5] = 0;
2307 	dbregs->dr[8] = 0;
2308 	dbregs->dr[9] = 0;
2309 	dbregs->dr[10] = 0;
2310 	dbregs->dr[11] = 0;
2311 	dbregs->dr[12] = 0;
2312 	dbregs->dr[13] = 0;
2313 	dbregs->dr[14] = 0;
2314 	dbregs->dr[15] = 0;
2315 	return (0);
2316 }
2317 
2318 int
2319 set_dbregs(struct thread *td, struct dbreg *dbregs)
2320 {
2321 	struct pcb *pcb;
2322 	int i;
2323 
2324 	if (td == NULL) {
2325 		load_dr0(dbregs->dr[0]);
2326 		load_dr1(dbregs->dr[1]);
2327 		load_dr2(dbregs->dr[2]);
2328 		load_dr3(dbregs->dr[3]);
2329 		load_dr6(dbregs->dr[6]);
2330 		load_dr7(dbregs->dr[7]);
2331 	} else {
2332 		/*
2333 		 * Don't let an illegal value for dr7 get set.  Specifically,
2334 		 * check for undefined settings.  Setting these bit patterns
2335 		 * results in undefined behaviour and can lead to an unexpected
2336 		 * TRCTRAP or a general protection fault right here.
2337 		 * The upper 32 bits of dr6 and dr7 must not be set.
2338 		 */
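		/*
		 * Access type 10b is an I/O read/write breakpoint, which is
		 * only defined when CR4.DE is set; an 8-byte length is
		 * likewise not usable from 32-bit code.
		 */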
2339 		for (i = 0; i < 4; i++) {
2340 			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
2341 				return (EINVAL);
2342 			if (td->td_frame->tf_cs == _ucode32sel &&
2343 			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
2344 				return (EINVAL);
2345 		}
2346 		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
2347 		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
2348 			return (EINVAL);
2349 
2350 		pcb = td->td_pcb;
2351 
2352 		/*
2353 		 * Don't let a process set a breakpoint that is not within the
2354 		 * process's address space.  If a process could do this, it
2355 		 * could halt the system by setting a breakpoint in the kernel
2356 		 * (if ddb was enabled).  Thus, we need to check to make sure
2357 		 * that no breakpoints are being enabled for addresses outside
2358 		 * the process's address space.
2359 		 *
2360 		 * XXX - what about when the watched area of the user's
2361 		 * address space is written into from within the kernel
2362 		 * ... wouldn't that still cause a breakpoint to be generated
2363 		 * from within kernel mode?
2364 		 */
2365 
2366 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
2367 			/* dr0 is enabled */
2368 			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
2369 				return (EINVAL);
2370 		}
2371 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
2372 			/* dr1 is enabled */
2373 			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
2374 				return (EINVAL);
2375 		}
2376 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
2377 			/* dr2 is enabled */
2378 			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
2379 				return (EINVAL);
2380 		}
2381 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
2382 			/* dr3 is enabled */
2383 			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
2384 				return (EINVAL);
2385 		}
2386 
2387 		pcb->pcb_dr0 = dbregs->dr[0];
2388 		pcb->pcb_dr1 = dbregs->dr[1];
2389 		pcb->pcb_dr2 = dbregs->dr[2];
2390 		pcb->pcb_dr3 = dbregs->dr[3];
2391 		pcb->pcb_dr6 = dbregs->dr[6];
2392 		pcb->pcb_dr7 = dbregs->dr[7];
2393 
2394 		set_pcb_flags(pcb, PCB_DBREGS);
2395 	}
2396 
2397 	return (0);
2398 }
2399 
2400 void
2401 reset_dbregs(void)
2402 {
2403 
2404 	load_dr7(0);	/* Turn off the control bits first */
2405 	load_dr0(0);
2406 	load_dr1(0);
2407 	load_dr2(0);
2408 	load_dr3(0);
2409 	load_dr6(0);
2410 }
2411 
2412 /*
2413  * Return > 0 if a hardware breakpoint has been hit, and the
2414  * breakpoint was in user space.  Return 0 otherwise.
2415  */
2416 int
2417 user_dbreg_trap(void)
2418 {
2419 	u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */
2420 	u_int64_t bp;       /* breakpoint bits extracted from dr6 */
2421 	int nbp;            /* number of breakpoints that triggered */
2422 	caddr_t addr[4];    /* breakpoint addresses */
2423 	int i;
2424 
2425 	dr7 = rdr7();
2426 	if ((dr7 & 0x000000ff) == 0) {
2427 		/*
2428 		 * All of the enable bits (L0-L3, G0-G3) in the dr7
2429 		 * register are zero, thus the trap couldn't have been
2430 		 * caused by the hardware debug registers.
2431 		 */
2432 		return (0);
2433 	}
2434 
2435 	nbp = 0;
2436 	dr6 = rdr6();
2437 	bp = dr6 & 0x0000000f;
2438 
2439 	if (!bp) {
2440 		/*
2441 		 * None of the breakpoint bits are set, meaning this
2442 		 * trap was not caused by any of the debug registers.
2443 		 */
2444 		return (0);
2445 	}
2446 
2447 	/*
2448 	 * At least one of the breakpoints was hit; check to see
2449 	 * which ones and whether any of them are user space addresses.
2450 	 */
2451 
2452 	if (bp & 0x01) {
2453 		addr[nbp++] = (caddr_t)rdr0();
2454 	}
2455 	if (bp & 0x02) {
2456 		addr[nbp++] = (caddr_t)rdr1();
2457 	}
2458 	if (bp & 0x04) {
2459 		addr[nbp++] = (caddr_t)rdr2();
2460 	}
2461 	if (bp & 0x08) {
2462 		addr[nbp++] = (caddr_t)rdr3();
2463 	}
2464 
2465 	for (i = 0; i < nbp; i++) {
2466 		if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
2467 			/*
2468 			 * addr[i] is in user space.
2469 			 */
2470 			return (nbp);
2471 		}
2472 	}
2473 
2474 	/*
2475 	 * None of the breakpoints are in user space.
2476 	 */
2477 	return (0);
2478 }
2479 
2480 #ifdef KDB
2481 
2482 /*
2483  * Provide inb() and outb() as functions.  They are normally only available as
2484  * inline functions, thus cannot be called from the debugger.
2485  */
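/* From the ddb prompt they can be invoked with, e.g., "call inb_(0x61)". */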
2486 
2487 /* silence compiler warnings */
2488 u_char inb_(u_short);
2489 void outb_(u_short, u_char);
2490 
2491 u_char
2492 inb_(u_short port)
2493 {
2494 	return inb(port);
2495 }
2496 
2497 void
2498 outb_(u_short port, u_char data)
2499 {
2500 	outb(port, data);
2501 }
2502 
2503 #endif /* KDB */
2504