1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 2003 Peter Wemm. 5 * Copyright (c) 1992 Terrence R. Lambert. 6 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * William Jolitz. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. All advertising materials mentioning features or use of this software 21 * must display the following acknowledgement: 22 * This product includes software developed by the University of 23 * California, Berkeley and its contributors. 24 * 4. Neither the name of the University nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 * 40 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 41 */ 42 43 #include <sys/cdefs.h> 44 __FBSDID("$FreeBSD$"); 45 46 #include "opt_atpic.h" 47 #include "opt_cpu.h" 48 #include "opt_ddb.h" 49 #include "opt_inet.h" 50 #include "opt_isa.h" 51 #include "opt_kstack_pages.h" 52 #include "opt_maxmem.h" 53 #include "opt_mp_watchdog.h" 54 #include "opt_pci.h" 55 #include "opt_platform.h" 56 #include "opt_sched.h" 57 58 #include <sys/param.h> 59 #include <sys/proc.h> 60 #include <sys/systm.h> 61 #include <sys/bio.h> 62 #include <sys/buf.h> 63 #include <sys/bus.h> 64 #include <sys/callout.h> 65 #include <sys/cons.h> 66 #include <sys/cpu.h> 67 #include <sys/efi.h> 68 #include <sys/eventhandler.h> 69 #include <sys/exec.h> 70 #include <sys/imgact.h> 71 #include <sys/kdb.h> 72 #include <sys/kernel.h> 73 #include <sys/ktr.h> 74 #include <sys/linker.h> 75 #include <sys/lock.h> 76 #include <sys/malloc.h> 77 #include <sys/memrange.h> 78 #include <sys/msgbuf.h> 79 #include <sys/mutex.h> 80 #include <sys/pcpu.h> 81 #include <sys/ptrace.h> 82 #include <sys/reboot.h> 83 #include <sys/rwlock.h> 84 #include <sys/sched.h> 85 #include <sys/signalvar.h> 86 #ifdef SMP 87 #include <sys/smp.h> 88 #endif 89 #include <sys/syscallsubr.h> 90 #include <sys/sysctl.h> 91 #include <sys/sysent.h> 92 #include <sys/sysproto.h> 93 #include <sys/ucontext.h> 94 #include <sys/vmmeter.h> 95 96 #include <vm/vm.h> 97 #include <vm/vm_extern.h> 98 #include <vm/vm_kern.h> 99 #include 
<vm/vm_page.h> 100 #include <vm/vm_map.h> 101 #include <vm/vm_object.h> 102 #include <vm/vm_pager.h> 103 #include <vm/vm_param.h> 104 #include <vm/vm_phys.h> 105 106 #ifdef DDB 107 #ifndef KDB 108 #error KDB must be enabled in order for DDB to work! 109 #endif 110 #include <ddb/ddb.h> 111 #include <ddb/db_sym.h> 112 #endif 113 114 #include <net/netisr.h> 115 116 #include <machine/clock.h> 117 #include <machine/cpu.h> 118 #include <machine/cputypes.h> 119 #include <machine/frame.h> 120 #include <machine/intr_machdep.h> 121 #include <x86/mca.h> 122 #include <machine/md_var.h> 123 #include <machine/metadata.h> 124 #include <machine/mp_watchdog.h> 125 #include <machine/pc/bios.h> 126 #include <machine/pcb.h> 127 #include <machine/proc.h> 128 #include <machine/reg.h> 129 #include <machine/sigframe.h> 130 #include <machine/specialreg.h> 131 #include <machine/trap.h> 132 #include <machine/tss.h> 133 #include <x86/ucode.h> 134 #ifdef SMP 135 #include <machine/smp.h> 136 #endif 137 #ifdef FDT 138 #include <x86/fdt.h> 139 #endif 140 141 #ifdef DEV_ATPIC 142 #include <x86/isa/icu.h> 143 #else 144 #include <x86/apicvar.h> 145 #endif 146 147 #include <isa/isareg.h> 148 #include <isa/rtc.h> 149 #include <x86/init.h> 150 151 /* Sanity check for __curthread() */ 152 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); 153 154 /* 155 * The PTI trampoline stack needs enough space for a hardware trapframe and a 156 * couple of scratch registers, as well as the trapframe left behind after an 157 * iret fault. 
158 */ 159 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) - 160 offsetof(struct pti_frame, pti_rip)); 161 162 extern u_int64_t hammer_time(u_int64_t, u_int64_t); 163 164 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) 165 #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) 166 167 static void cpu_startup(void *); 168 static void get_fpcontext(struct thread *td, mcontext_t *mcp, 169 char *xfpusave, size_t xfpusave_len); 170 static int set_fpcontext(struct thread *td, mcontext_t *mcp, 171 char *xfpustate, size_t xfpustate_len); 172 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); 173 174 /* Preload data parse function */ 175 static caddr_t native_parse_preload_data(u_int64_t); 176 177 /* Native function to fetch and parse the e820 map */ 178 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *); 179 180 /* Default init_ops implementation. */ 181 struct init_ops init_ops = { 182 .parse_preload_data = native_parse_preload_data, 183 .early_clock_source_init = i8254_init, 184 .early_delay = i8254_delay, 185 .parse_memmap = native_parse_memmap, 186 #ifdef SMP 187 .mp_bootaddress = mp_bootaddress, 188 .start_all_aps = native_start_all_aps, 189 #endif 190 #ifdef DEV_PCI 191 .msi_init = msi_init, 192 #endif 193 }; 194 195 /* 196 * Physical address of the EFI System Table. Stashed from the metadata hints 197 * passed into the kernel and used by the EFI code to call runtime services. 198 */ 199 vm_paddr_t efi_systbl_phys; 200 201 /* Intel ICH registers */ 202 #define ICH_PMBASE 0x400 203 #define ICH_SMI_EN ICH_PMBASE + 0x30 204 205 int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel; 206 207 int cold = 1; 208 209 long Maxmem = 0; 210 long realmem = 0; 211 212 /* 213 * The number of PHYSMAP entries must be one less than the number of 214 * PHYSSEG entries because the PHYSMAP entry that spans the largest 215 * physical address that is accessible by ISA DMA is split into two 216 * PHYSSEG entries. 
 */
#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))

/* Base/bound pairs; a 0/0 pair terminates the list (see below). */
vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* must be 2 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
#define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)

struct kva_md_info kmi;

static struct trapframe proc0_tf;	/* initial trapframe for proc0 */
struct region_descriptor r_gdt, r_idt;	/* descriptors for lgdt/lidt */

struct pcpu __pcpu[MAXCPU];

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

/* Hook for a hypervisor module to be notified on resume, if loaded. */
void (*vmm_resume_p)(void);

/*
 * Early SYSINIT(SI_SUB_CPU) hook: apply the MacBook SMI quirk, start the
 * RTC, print CPU/memory information, and initialize the buffer cache.
 * The "dummy" argument is the unused SYSINIT cookie.
 */
static void
cpu_startup(dummy)
	void *dummy;
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to disallow the legacy USB circuit to
	 * generate an SMI# because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			/* Clear bit 3 (LEGACY_USB_EN) of SMI_EN. */
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();

	/*
	 * Display physical memory if SMBIOS reports reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		/* SMBIOS reports KB; shift by 10 converts to bytes. */
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	/* Fall back to Maxmem if SMBIOS value is absent or implausible. */
	if (memsize < ptoa((uintmax_t)vm_free_count()))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);	/* bytes -> pages */

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_free_count()),
	    ptoa((uintmax_t)vm_free_count()) / 1048576);

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by call
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

	/*
	 * Reserve kernel-stack scratch space for the extended (XSAVE)
	 * FPU state beyond the legacy save area, when present.
	 */
	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	/* The trapframe registers start at mc_rdi in the mcontext. */
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	update_pcb_bases(pcb);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		/* Deliver on the alternate signal stack. */
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		/* Skip the 128-byte red zone below %rsp (SysV amd64 ABI). */
		sp = (char *)regs->tf_rsp - 128;
	if (xfpusave != NULL) {
		/* Place the extended FPU area first, 64-byte aligned. */
		sp -= xfpusave_len;
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	/* Drop locks across the user copyout; reacquired below. */
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	/* Resume the thread in the signal trampoline with user selectors. */
	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig. Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
int
sys_sigreturn(td, uap)
	struct thread *td;
	struct sigreturn_args /* {
		const struct __ucontext *sigcntxp;
	} */ *uap;
{
	ucontext_t uc;
	struct pcb *pcb;
	struct proc *p;
	struct trapframe *regs;
	ucontext_t *ucp;
	char *xfpustate;
	size_t xfpustate_len;
	long rflags;
	int cs, error, ret;
	ksiginfo_t ksi;

	pcb = td->td_pcb;
	p = td->td_proc;

	/* Bring the user-supplied context into the kernel for vetting. */
	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0) {
		uprintf("pid %d (%s): sigreturn copyin failed\n",
		    p->p_pid, td->td_name);
		return (error);
	}
	ucp = &uc;
	/* Reject any mcontext flag bits we do not understand. */
	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
		    td->td_name, ucp->uc_mcontext.mc_flags);
		return (EINVAL);
	}
	regs = td->td_frame;
	rflags = ucp->uc_mcontext.mc_rflags;
	/*
	 * Don't allow users to change privileged or reserved flags.
	 */
	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
		    td->td_name, rflags);
		return (EINVAL);
	}

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
	cs = ucp->uc_mcontext.mc_cs;
	if (!CS_SECURE(cs)) {
		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
		    td->td_name, cs);
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_rip;
		trapsignal(td, &ksi);
		return (EINVAL);
	}

	/* Optional extended (XSAVE) FPU state follows the mcontext. */
	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
		if (xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu)) {
			uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
			    p->p_pid, td->td_name, xfpustate_len);
			return (EINVAL);
		}
		xfpustate = __builtin_alloca(xfpustate_len);
		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
		    xfpustate, xfpustate_len);
		if (error != 0) {
			uprintf(
			    "pid %d (%s): sigreturn copying xfpustate failed\n",
			    p->p_pid, td->td_name);
			return (error);
		}
	} else {
		xfpustate = NULL;
		xfpustate_len = 0;
	}
	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
	if (ret != 0) {
		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
		    p->p_pid, td->td_name, ret);
		return (ret);
	}
	/* Restore the general registers from the vetted mcontext. */
	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
	update_pcb_bases(pcb);
	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;

#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
	/* EJUSTRETURN: the restored trapframe already holds the result. */
	return (EJUSTRETURN);
}

#ifdef COMPAT_FREEBSD4
/* The FreeBSD 4.x ucontext layout is compatible; reuse sys_sigreturn(). */
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{

	return sys_sigreturn(td, (struct sigreturn_args *)uap);
}
#endif

/*
 * Reset registers to default values on exec.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
	struct trapframe *regs;
	struct pcb *pcb;
	register_t saved_rflags;

	regs = td->td_frame;
	pcb = td->td_pcb;

	/* Any per-process LDT from the old image is discarded. */
	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);

	update_pcb_bases(pcb);
	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	clear_pcb_flags(pcb, PCB_32BIT);
	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;

	/* Preserve only the trace flag across the exec (for debuggers). */
	saved_rflags = regs->tf_rflags & PSL_T;
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = imgp->entry_addr;
	/* Align the stack to 16 bytes, then leave room for the fake frame. */
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | saved_rflags;
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == curpcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		clear_pcb_flags(pcb, PCB_DBREGS);
	}

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}

/*
 * Establish the baseline %cr0 configuration for this CPU.
 */
void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP.  See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

/* Dedicated IST stacks for faults that must not reuse the thread stack. */
static char dblfault_stack[PAGE_SIZE] __aligned(16);
static char mce0_stack[PAGE_SIZE] __aligned(16);
static char nmi0_stack[PAGE_SIZE] __aligned(16);
static char dbg0_stack[PAGE_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

struct amd64tss common_tss[MAXCPU];

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 *
 * NOTE(review): the %fs/%gs labels on the GUFS32/GUGS32 entries below
 * appear swapped relative to the selector names — confirm against the
 * actual segment-register usage before relying on these comments.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Actually, the TSS is a system descriptor which is double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};

/*
 * Install an interrupt/trap gate: IDT slot idx dispatches to func with
 * gate type typ, privilege dpl, and optional IST stack index ist.
 */
void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func) >> 16;
}

/* Assembly interrupt entry points (exception.S), plus PTI variants. */
extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		/* Reassemble the handler address from the split gate. */
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers.
 */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	/* Read the descriptor-table and task registers directly. */
	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

/* Dump the hardware debug registers. */
DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif

/*
 * Convert a hardware user segment descriptor to its unpacked software
 * (soft_segment_descriptor) form.
 */
void
sdtossd(sd, ssd)
	struct user_segment_descriptor *sd;
	struct soft_segment_descriptor *ssd;
{

	ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type = sd->sd_type;
	ssd->ssd_dpl = sd->sd_dpl;
	ssd->ssd_p = sd->sd_p;
	ssd->ssd_long = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran = sd->sd_gran;
}

/*
 * Pack a software segment descriptor into a hardware user segment
 * descriptor, splitting base and limit across their hi/lo fields.
 */
void
ssdtosd(ssd, sd)
	struct soft_segment_descriptor *ssd;
	struct user_segment_descriptor *sd;
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_long = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran = ssd->ssd_gran;
}

/*
 * Pack a software segment descriptor into a hardware system segment
 * descriptor (TSS/LDT).  No long/def32 fields are written here —
 * presumably struct system_segment_descriptor does not carry them.
 */
void
ssdtosyssd(ssd, sd)
	struct soft_segment_descriptor *ssd;
	struct system_segment_descriptor *sd;
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_gran = ssd->ssd_gran;
}

#if !defined(DEV_ATPIC) && defined(DEV_ISA)
#include <isa/isavar.h>
#include <isa/isareg.h>
/*
 * Return a bitmap of the current interrupt requests.  This is 8259-specific
 * and is only suitable for use at probe time.
 * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
 * It shouldn't be here.  There should probably be an APIC centric
 * implementation in the apic driver code, if at all.
 */
intrmask_t
isa_irq_pending(void)
{
	u_char irr1;
	u_char irr2;

	irr1 = inb(IO_ICU1);
	irr2 = inb(IO_ICU2);
	/* Slave PIC IRQs occupy bits 8-15 of the combined mask. */
	return ((irr2 << 8) | irr1);
}
#endif

u_int basemem;

/*
 * Insert [base, base + length) into the sorted physmap base/bound array,
 * merging with adjacent entries where possible.  Returns 1 on success or
 * if the entry was dropped (zero length / overlap), 0 when the map is full.
 */
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYSMAP_SIZE) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

/*
 * Walk a BIOS INT 15h/E820 system memory map and feed each usable
 * (SMAP_TYPE_MEMORY) range into the physmap array.
 */
void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
    vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

/*
 * Walk a UEFI memory map and feed each usable descriptor into the
 * physmap array, optionally dumping the map when booting verbose.
 */
static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	/* Printable names indexed by EFI memory descriptor type. */
	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	/* Descriptors start after the header, rounded up to 16 bytes. */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	/* Step by the firmware-reported descriptor size, not sizeof. */
	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}

static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");

/*
 * Locate the loader-provided memory map metadata (EFI map preferred,
 * BIOS SMAP otherwise) and populate the physmap array from it.
 */
static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		/* The loader stores the SMAP byte count just before it. */
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	/*
	 * parse_memmap leaves physmap_idx at the next free slot; step back
	 * so it indexes the base of the last populated base/bound pair.
	 */
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	/* Compile-time cap; MAXMEM is expressed in kilobytes (4K pages). */
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * Make hole for "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and APs for the system start
	 * in real mode mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress)
		init_ops.mp_bootaddress(physmap, &physmap_idx);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	/* CMAP1/CADDR1 form the scratch mapping used by the memory test. */
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going. The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			/*
			 * dump_avail tracks every page, including ones
			 * excluded from phys_avail (kernel, dcons), so
			 * crash dumps can still cover them.
			 */
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	/* Tear down the scratch mapping used by the memory test. */
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}

/*
 * Locate and relocate the loader-provided preload metadata, seed the static
 * kernel environment and boothowto from it, hand the symbol table to DDB
 * when configured, and record the EFI system table address.  Returns the
 * "elf kernel" (or "elf64 kernel") metadata pointer for later queries.
 */
static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;	/* loader hands us a physical address */
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

/*
 * Initialize the kernel debugger framework and drop into it immediately
 * when the RB_KDB boot flag was given.
 */
static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/* Set up the fast syscall stuff */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	/* Enable the SYSCALL/SYSRET instructions. */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	/* 64-bit SYSCALL entry point; PTI variant when page table isolation is on. */
	wrmsr(MSR_LSTAR, pti ?
	    (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	/* 32-bit compat SYSCALL entry point. */
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	/* Kernel/user segment selectors loaded by SYSCALL/SYSRET. */
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	/* RFLAGS bits cleared on SYSCALL entry. */
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}

/*
 * Early machine-dependent initialization, called from locore with the
 * loader's module pointer and the first free physical address.  Sets up
 * thread0's stack and PCB, the GDT/IDT/TSS, per-CPU data, the physical
 * memory map, console, and FPU state.  Returns the address loaded into
 * %rsp by locore (thread0's PCB, i.e. the top of its kernel stack).
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	struct nmi_pcpu *np;
	struct xstate_hdr *xhdr;
	u_int64_t rsp0;
	char *env;
	size_t kstack0_sz;
	int late_console;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	kmdp = init_ops.parse_preload_data(modulep);

	/* Apply a CPU microcode update, if one was preloaded. */
	physfree += ucode_load_bsp(physfree + KERNBASE);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	/*
	 * hw.cpu_stdext_disable is ignored by the call, it will be
	 * re-evaluted by the below call to finishidentcpu().
	 */
	identify_cpu2();

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	/* Carve thread0's kernel stack out of the early free memory. */
	thread0.td_kstack = physfree + KERNBASE;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * make gdt memory segments.  The TSS and LDT slots are 16-byte
	 * system descriptors (two slots each) and are filled separately.
	 */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long) gdt;
	lgdt(&r_gdt);
	pc = &__pcpu[0];

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	pcpu_init(pc, 0, sizeof(struct pcpu));
	dpcpu_init((void *)(physfree + KERNBASE), 0);
	physfree += DPCPU_SIZE;
	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);
	/* Non-late cninit() and printf() can be moved up to here. */
	PCPU_SET(tssp, &common_tss[0]);
	PCPU_SET(commontssp, &common_tss[0]);
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions: each gate has a _pti variant used when PTI is enabled */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);

	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
	    != NULL)
		vty_set_preferred(VTY_VT);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */
	initializecpucache();

	/* doublefault stack space, runs on ist1 */
	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist2 = (long) np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist3 = (long) np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist4 = (long) np;

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * Temporary forge some valid pointer to PCB, for exception
	 * handlers.  It is reinitialized properly below after FPU is
	 * set up.  Also set up td_critnest to short-cut the page
	 * fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured,and u/iom is accessible */

	if (late_console)
		cninit();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?";
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/*
	 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
	 * area size.  Zero out the extended state header in fpu save
	 * area.
	 */
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
	if (use_xsave) {
		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
		    1);
		xhdr->xstate_bv = xsave_mask;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = (vm_offset_t)thread0.td_pcb;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	common_tss[0].tss_rsp0 = rsp0;
	PCPU_SET(rsp0, rsp0);
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_frame = &proc0_tf;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	cpu_probe_amdc1e();

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	/* Speculative execution mitigation knobs. */
	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);

	TSEXIT();

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}

/*
 * Machine-dependent per-CPU structure initialization: mark the ACPI CPU
 * id as not yet known.
 */
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

/*
 * sysctl handler exporting the raw BIOS SMAP, merged with the extended
 * attribute table when the loader provided one.
 */
static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader. */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	/* The metadata size word immediately precedes the table. */
	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");

/*
 * sysctl handler exporting the raw UEFI memory map blob exactly as the
 * loader passed it in.
 */
static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	/* The metadata size word immediately precedes the header. */
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");

/*
 * Disable interrupts and enter a critical section on the first (outermost)
 * acquisition; nested acquisitions only bump the per-thread count.  The
 * original interrupt state is saved for spinlock_exit() to restore.
 */
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
		critical_enter();
	} else
		td->td_md.md_spinlock_count++;
}

/*
 * Undo one spinlock_enter(); on the outermost release, leave the critical
 * section and restore the interrupt state saved at entry.
 */
void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0) {
		critical_exit();
		intr_restore(flags);
	}
}

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
}

/*
 * Set the instruction pointer in a debugged thread's trapframe; the
 * PCB_FULL_IRET flag makes the return to user mode restore the full frame.
 */
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{

	td->td_frame->tf_rip = addr;
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

/*
 * Arm hardware single-stepping for a debugged thread by setting the trap
 * flag in its saved RFLAGS; TDB_STEP records that we set it.
 */
int
ptrace_single_step(struct thread *td)
{

	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
	if ((td->td_frame->tf_rflags & PSL_T) == 0) {
		td->td_frame->tf_rflags |= PSL_T;
		td->td_dbgflags |= TDB_STEP;
	}
	return (0);
}

/*
 * Disarm single-stepping: clear the trap flag and the TDB_STEP marker.
 */
int
ptrace_clear_single_step(struct thread *td)
{

	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
	td->td_frame->tf_rflags &= ~PSL_T;
	td->td_dbgflags &= ~TDB_STEP;
	return (0);
}

/*
 * Fill a struct reg from a thread's current trapframe.
 */
int
fill_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;

	tp = td->td_frame;
	return (fill_frame_regs(tp, regs));
}

/*
 * Copy a trapframe into a struct reg.  Segment registers are only valid
 * in the frame when TF_HASSEGS is set; otherwise report them as zero.
 */
int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{
	regs->r_r15 = tp->tf_r15;
	regs->r_r14 = tp->tf_r14;
	regs->r_r13 = tp->tf_r13;
	regs->r_r12 = tp->tf_r12;
	regs->r_r11 = tp->tf_r11;
	regs->r_r10 = tp->tf_r10;
	regs->r_r9  = tp->tf_r9;
	regs->r_r8  = tp->tf_r8;
	regs->r_rdi = tp->tf_rdi;
	regs->r_rsi = tp->tf_rsi;
	regs->r_rbp = tp->tf_rbp;
	regs->r_rbx = tp->tf_rbx;
	regs->r_rdx = tp->tf_rdx;
	regs->r_rcx = tp->tf_rcx;
	regs->r_rax = tp->tf_rax;
	regs->r_rip = tp->tf_rip;
	regs->r_cs = tp->tf_cs;
	regs->r_rflags = tp->tf_rflags;
	regs->r_rsp = tp->tf_rsp;
	regs->r_ss = tp->tf_ss;
	if (tp->tf_flags & TF_HASSEGS) {
		regs->r_ds = tp->tf_ds;
		regs->r_es = tp->tf_es;
		regs->r_fs = tp->tf_fs;
		regs->r_gs = tp->tf_gs;
	} else {
		regs->r_ds = 0;
		regs->r_es = 0;
		regs->r_fs = 0;
		regs->r_gs = 0;
	}
	return (0);
}

/*
 * Install a struct reg into a thread's trapframe after validating that the
 * new RFLAGS and %cs are safe for user mode (EFL_SECURE/CS_SECURE); returns
 * EINVAL otherwise.
 */
int
set_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;
	register_t rflags;

	tp = td->td_frame;
	rflags = regs->r_rflags & 0xffffffff;
	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_r15 = regs->r_r15;
	tp->tf_r14 = regs->r_r14;
	tp->tf_r13 = regs->r_r13;
	tp->tf_r12 = regs->r_r12;
	tp->tf_r11 = regs->r_r11;
	tp->tf_r10 = regs->r_r10;
	tp->tf_r9  = regs->r_r9;
	tp->tf_r8  = regs->r_r8;
	tp->tf_rdi = regs->r_rdi;
	tp->tf_rsi = regs->r_rsi;
	tp->tf_rbp = regs->r_rbp;
	tp->tf_rbx = regs->r_rbx;
	tp->tf_rdx = regs->r_rdx;
	tp->tf_rcx = regs->r_rcx;
	tp->tf_rax = regs->r_rax;
	tp->tf_rip = regs->r_rip;
	tp->tf_cs = regs->r_cs;
	tp->tf_rflags = rflags;
	tp->tf_rsp = regs->r_rsp;
	tp->tf_ss = regs->r_ss;
	if (0) {	/* XXXKIB */
		tp->tf_ds = regs->r_ds;
		tp->tf_es = regs->r_es;
		tp->tf_fs = regs->r_fs;
		tp->tf_gs = regs->r_gs;
		tp->tf_flags = TF_HASSEGS;
	}
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

/* XXX check all this stuff! */
/* externalize from sv_xmm */
static void
fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
{
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* pcb -> fpregs */
	bzero(fpregs, sizeof(*fpregs));

	/* FPU control/status */
	penv_fpreg->en_cw = penv_xmm->en_cw;
	penv_fpreg->en_sw = penv_xmm->en_sw;
	penv_fpreg->en_tw = penv_xmm->en_tw;
	penv_fpreg->en_opcode = penv_xmm->en_opcode;
	penv_fpreg->en_rip = penv_xmm->en_rip;
	penv_fpreg->en_rdp = penv_xmm->en_rdp;
	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;

	/* FPU registers: eight 80-bit x87 values */
	for (i = 0; i < 8; ++i)
		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);

	/* SSE registers: sixteen 128-bit XMM values */
	for (i = 0; i < 16; ++i)
		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
}

/* internalize from fpregs into sv_xmm */
static void
set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
{
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	int i;

	/* fpregs -> pcb */
	/* FPU control/status */
	penv_xmm->en_cw = penv_fpreg->en_cw;
	penv_xmm->en_sw = penv_fpreg->en_sw;
	penv_xmm->en_tw = penv_fpreg->en_tw;
	penv_xmm->en_opcode = penv_fpreg->en_opcode;
	penv_xmm->en_rip = penv_fpreg->en_rip;
	penv_xmm->en_rdp = penv_fpreg->en_rdp;
	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
	/* Never let userland set reserved MXCSR mask bits. */
	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
}

/* externalize from td->pcb */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
	    P_SHOULDSTOP(td->td_proc),
	    ("not suspended thread %p", td));
	/* Flush live FPU state into the save area first. */
	fpugetregs(td);
	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
	return (0);
}

/* internalize to td->pcb */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

	critical_enter();
	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
	fpuuserinited(td);
	critical_exit();
	return (0);
}

/*
 * Get machine context.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct pcb *pcb;
	struct trapframe *tp;

	pcb = td->td_pcb;
	tp = td->td_frame;
	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_rsp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_r15 = tp->tf_r15;
	mcp->mc_r14 = tp->tf_r14;
	mcp->mc_r13 = tp->tf_r13;
	mcp->mc_r12 = tp->tf_r12;
	mcp->mc_r11 = tp->tf_r11;
	mcp->mc_r10 = tp->tf_r10;
	mcp->mc_r9 = tp->tf_r9;
	mcp->mc_r8 = tp->tf_r8;
	mcp->mc_rdi = tp->tf_rdi;
	mcp->mc_rsi = tp->tf_rsi;
	mcp->mc_rbp = tp->tf_rbp;
	mcp->mc_rbx = tp->tf_rbx;
	mcp->mc_rcx = tp->tf_rcx;
	mcp->mc_rflags = tp->tf_rflags;
	if (flags & GET_MC_CLEAR_RET) {
		/* Present a "syscall succeeded, returned 0" state. */
		mcp->mc_rax = 0;
		mcp->mc_rdx = 0;
		mcp->mc_rflags &= ~PSL_C;
	} else {
		mcp->mc_rax = tp->tf_rax;
		mcp->mc_rdx = tp->tf_rdx;
	}
	mcp->mc_rip = tp->tf_rip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_rsp = tp->tf_rsp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_es = tp->tf_es;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_gs = tp->tf_gs;
	mcp->mc_flags = tp->tf_flags;
	mcp->mc_len = sizeof(*mcp);
	get_fpcontext(td, mcp, NULL, 0);
	update_pcb_bases(pcb);
	mcp->mc_fsbase = pcb->pcb_fsbase;
	mcp->mc_gsbase = pcb->pcb_gsbase;
	mcp->mc_xfpustate = 0;
	mcp->mc_xfpustate_len = 0;
	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
	return (0);
}

/*
 * Set machine context.
 *
 * However, we don't set any but the user modifiable flags, and we won't
 * touch the cs selector.
 */
int
set_mcontext(struct thread *td, mcontext_t *mcp)
{
	struct pcb *pcb;
	struct trapframe *tp;
	char *xfpustate;
	long rflags;
	int ret;

	pcb = td->td_pcb;
	tp = td->td_frame;
	if (mcp->mc_len != sizeof(*mcp) ||
	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
		return (EINVAL);
	/* Merge only the user-modifiable RFLAGS bits into the frame. */
	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
	    (tp->tf_rflags & ~PSL_USERCHANGE);
	if (mcp->mc_flags & _MC_HASFPXSTATE) {
		/* Bound the alloca below before copying in XSAVE state. */
		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu))
			return (EINVAL);
		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
		    mcp->mc_xfpustate_len);
		if (ret != 0)
			return (ret);
	} else
		xfpustate = NULL;
	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
	if (ret != 0)
		return (ret);
	tp->tf_r15 = mcp->mc_r15;
	tp->tf_r14 = mcp->mc_r14;
	tp->tf_r13 = mcp->mc_r13;
	tp->tf_r12 = mcp->mc_r12;
	tp->tf_r11 = mcp->mc_r11;
	tp->tf_r10 = mcp->mc_r10;
	tp->tf_r9 = mcp->mc_r9;
	tp->tf_r8 = mcp->mc_r8;
	tp->tf_rdi = mcp->mc_rdi;
	tp->tf_rsi = mcp->mc_rsi;
	tp->tf_rbp = mcp->mc_rbp;
	tp->tf_rbx = mcp->mc_rbx;
	tp->tf_rdx = mcp->mc_rdx;
	tp->tf_rcx = mcp->mc_rcx;
	tp->tf_rax = mcp->mc_rax;
	tp->tf_rip = mcp->mc_rip;
	tp->tf_rflags = rflags;
	tp->tf_rsp = mcp->mc_rsp;
	tp->tf_ss = mcp->mc_ss;
	tp->tf_flags = mcp->mc_flags;
	if (tp->tf_flags & TF_HASSEGS) {
		tp->tf_ds = mcp->mc_ds;
		tp->tf_es = mcp->mc_es;
		tp->tf_fs = mcp->mc_fs;
		tp->tf_gs = mcp->mc_gs;
	}
	set_pcb_flags(pcb, PCB_FULL_IRET);
	if (mcp->mc_flags & _MC_HASBASES) {
		pcb->pcb_fsbase = mcp->mc_fsbase;
		pcb->pcb_gsbase = mcp->mc_gsbase;
	}
	return (0);
}

/*
 * Copy the thread's FPU state (and, when XSAVE is in use and a buffer was
 * supplied, the extended state area) into the machine context.
 */
static void
get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
    size_t xfpusave_len)
{
	size_t max_len, len;

	mcp->mc_ownedfp = fpugetregs(td);
	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
	    sizeof(mcp->mc_fpstate));
	mcp->mc_fpformat = fpuformat();
	if (!use_xsave || xfpusave_len == 0)
		return;
	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
	len = xfpusave_len;
	if (len > max_len) {
		len = max_len;
		/*
		 * NOTE(review): len was just clamped to max_len, so this
		 * bzero() length is always 0 and the call is a no-op —
		 * looks like it intended to zero the untouched tail of
		 * the caller's buffer; confirm intent.
		 */
		bzero(xfpusave + max_len, len - max_len);
	}
	mcp->mc_flags |= _MC_HASFPXSTATE;
	mcp->mc_xfpustate_len = len;
	/* Extended state lives immediately after the legacy save area. */
	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
}

/*
 * Install FPU state from a machine context, validating the format and
 * ownership tags; returns EINVAL for unrecognized values.
 */
static int
set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
    size_t xfpustate_len)
{
	int error;

	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
		return (0);
	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
		return (EINVAL);
	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
		/* We don't care what state is left in the FPU or PCB. */
		fpstate_drop(td);
		error = 0;
	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
		error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
		    xfpustate, xfpustate_len);
	} else
		return (EINVAL);
	return (error);
}

void
fpstate_drop(struct thread *td)
{

	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
	critical_enter();
	if (PCPU_GET(fpcurthread) == td)
		fpudrop();
	/*
	 * XXX force a full drop of the fpu.  The above only drops it if we
	 * owned it.
	 *
	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
	 * drop.
Dropping only to the pcb matches fnsave's behaviour. 2371 * We only need to drop to !PCB_INITDONE in sendsig(). But 2372 * sendsig() is the only caller of fpugetuserregs()... perhaps we just 2373 * have too many layers. 2374 */ 2375 clear_pcb_flags(curthread->td_pcb, 2376 PCB_FPUINITDONE | PCB_USERFPUINITDONE); 2377 critical_exit(); 2378 } 2379 2380 int 2381 fill_dbregs(struct thread *td, struct dbreg *dbregs) 2382 { 2383 struct pcb *pcb; 2384 2385 if (td == NULL) { 2386 dbregs->dr[0] = rdr0(); 2387 dbregs->dr[1] = rdr1(); 2388 dbregs->dr[2] = rdr2(); 2389 dbregs->dr[3] = rdr3(); 2390 dbregs->dr[6] = rdr6(); 2391 dbregs->dr[7] = rdr7(); 2392 } else { 2393 pcb = td->td_pcb; 2394 dbregs->dr[0] = pcb->pcb_dr0; 2395 dbregs->dr[1] = pcb->pcb_dr1; 2396 dbregs->dr[2] = pcb->pcb_dr2; 2397 dbregs->dr[3] = pcb->pcb_dr3; 2398 dbregs->dr[6] = pcb->pcb_dr6; 2399 dbregs->dr[7] = pcb->pcb_dr7; 2400 } 2401 dbregs->dr[4] = 0; 2402 dbregs->dr[5] = 0; 2403 dbregs->dr[8] = 0; 2404 dbregs->dr[9] = 0; 2405 dbregs->dr[10] = 0; 2406 dbregs->dr[11] = 0; 2407 dbregs->dr[12] = 0; 2408 dbregs->dr[13] = 0; 2409 dbregs->dr[14] = 0; 2410 dbregs->dr[15] = 0; 2411 return (0); 2412 } 2413 2414 int 2415 set_dbregs(struct thread *td, struct dbreg *dbregs) 2416 { 2417 struct pcb *pcb; 2418 int i; 2419 2420 if (td == NULL) { 2421 load_dr0(dbregs->dr[0]); 2422 load_dr1(dbregs->dr[1]); 2423 load_dr2(dbregs->dr[2]); 2424 load_dr3(dbregs->dr[3]); 2425 load_dr6(dbregs->dr[6]); 2426 load_dr7(dbregs->dr[7]); 2427 } else { 2428 /* 2429 * Don't let an illegal value for dr7 get set. Specifically, 2430 * check for undefined settings. Setting these bit patterns 2431 * result in undefined behaviour and can lead to an unexpected 2432 * TRCTRAP or a general protection fault right here. 
2433 * Upper bits of dr6 and dr7 must not be set 2434 */ 2435 for (i = 0; i < 4; i++) { 2436 if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) 2437 return (EINVAL); 2438 if (td->td_frame->tf_cs == _ucode32sel && 2439 DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8) 2440 return (EINVAL); 2441 } 2442 if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 || 2443 (dbregs->dr[7] & 0xffffffff00000000ul) != 0) 2444 return (EINVAL); 2445 2446 pcb = td->td_pcb; 2447 2448 /* 2449 * Don't let a process set a breakpoint that is not within the 2450 * process's address space. If a process could do this, it 2451 * could halt the system by setting a breakpoint in the kernel 2452 * (if ddb was enabled). Thus, we need to check to make sure 2453 * that no breakpoints are being enabled for addresses outside 2454 * process's address space. 2455 * 2456 * XXX - what about when the watched area of the user's 2457 * address space is written into from within the kernel 2458 * ... wouldn't that still cause a breakpoint to be generated 2459 * from within kernel mode? 
2460 */ 2461 2462 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { 2463 /* dr0 is enabled */ 2464 if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) 2465 return (EINVAL); 2466 } 2467 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { 2468 /* dr1 is enabled */ 2469 if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) 2470 return (EINVAL); 2471 } 2472 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { 2473 /* dr2 is enabled */ 2474 if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) 2475 return (EINVAL); 2476 } 2477 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { 2478 /* dr3 is enabled */ 2479 if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) 2480 return (EINVAL); 2481 } 2482 2483 pcb->pcb_dr0 = dbregs->dr[0]; 2484 pcb->pcb_dr1 = dbregs->dr[1]; 2485 pcb->pcb_dr2 = dbregs->dr[2]; 2486 pcb->pcb_dr3 = dbregs->dr[3]; 2487 pcb->pcb_dr6 = dbregs->dr[6]; 2488 pcb->pcb_dr7 = dbregs->dr[7]; 2489 2490 set_pcb_flags(pcb, PCB_DBREGS); 2491 } 2492 2493 return (0); 2494 } 2495 2496 void 2497 reset_dbregs(void) 2498 { 2499 2500 load_dr7(0); /* Turn off the control bits first */ 2501 load_dr0(0); 2502 load_dr1(0); 2503 load_dr2(0); 2504 load_dr3(0); 2505 load_dr6(0); 2506 } 2507 2508 /* 2509 * Return > 0 if a hardware breakpoint has been hit, and the 2510 * breakpoint was in user space. Return 0, otherwise. 
2511 */ 2512 int 2513 user_dbreg_trap(register_t dr6) 2514 { 2515 u_int64_t dr7; 2516 u_int64_t bp; /* breakpoint bits extracted from dr6 */ 2517 int nbp; /* number of breakpoints that triggered */ 2518 caddr_t addr[4]; /* breakpoint addresses */ 2519 int i; 2520 2521 bp = dr6 & DBREG_DR6_BMASK; 2522 if (bp == 0) { 2523 /* 2524 * None of the breakpoint bits are set meaning this 2525 * trap was not caused by any of the debug registers 2526 */ 2527 return 0; 2528 } 2529 2530 dr7 = rdr7(); 2531 if ((dr7 & 0x000000ff) == 0) { 2532 /* 2533 * all GE and LE bits in the dr7 register are zero, 2534 * thus the trap couldn't have been caused by the 2535 * hardware debug registers 2536 */ 2537 return 0; 2538 } 2539 2540 nbp = 0; 2541 2542 /* 2543 * at least one of the breakpoints were hit, check to see 2544 * which ones and if any of them are user space addresses 2545 */ 2546 2547 if (bp & 0x01) { 2548 addr[nbp++] = (caddr_t)rdr0(); 2549 } 2550 if (bp & 0x02) { 2551 addr[nbp++] = (caddr_t)rdr1(); 2552 } 2553 if (bp & 0x04) { 2554 addr[nbp++] = (caddr_t)rdr2(); 2555 } 2556 if (bp & 0x08) { 2557 addr[nbp++] = (caddr_t)rdr3(); 2558 } 2559 2560 for (i = 0; i < nbp; i++) { 2561 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { 2562 /* 2563 * addr[i] is in user space 2564 */ 2565 return nbp; 2566 } 2567 } 2568 2569 /* 2570 * None of the breakpoints are in user space. 2571 */ 2572 return 0; 2573 } 2574 2575 /* 2576 * The pcb_flags is only modified by current thread, or by other threads 2577 * when current thread is stopped. However, current thread may change it 2578 * from the interrupt context in cpu_switch(), or in the trap handler. 2579 * When we read-modify-write pcb_flags from C sources, compiler may generate 2580 * code that is not atomic regarding the interrupt handler. If a trap or 2581 * interrupt happens and any flag is modified from the handler, it can be 2582 * clobbered with the cached value later. 
Therefore, we implement setting 2583 * and clearing flags with single-instruction functions, which do not race 2584 * with possible modification of the flags from the trap or interrupt context, 2585 * because traps and interrupts are executed only on instruction boundary. 2586 */ 2587 void 2588 set_pcb_flags_raw(struct pcb *pcb, const u_int flags) 2589 { 2590 2591 __asm __volatile("orl %1,%0" 2592 : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags) 2593 : "cc", "memory"); 2594 2595 } 2596 2597 /* 2598 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs 2599 * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into 2600 * pcb if user space modified the bases. We must save on the context 2601 * switch or if the return to usermode happens through the doreti. 2602 * 2603 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET, 2604 * which have a consequence that the base MSRs must be saved each time 2605 * the PCB_FULL_IRET flag is set. We disable interrupts to sync with 2606 * context switches. 2607 */ 2608 void 2609 set_pcb_flags(struct pcb *pcb, const u_int flags) 2610 { 2611 register_t r; 2612 2613 if (curpcb == pcb && 2614 (flags & PCB_FULL_IRET) != 0 && 2615 (pcb->pcb_flags & PCB_FULL_IRET) == 0 && 2616 (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) { 2617 r = intr_disable(); 2618 if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) { 2619 if (rfs() == _ufssel) 2620 pcb->pcb_fsbase = rdfsbase(); 2621 if (rgs() == _ugssel) 2622 pcb->pcb_gsbase = rdmsr(MSR_KGSBASE); 2623 } 2624 set_pcb_flags_raw(pcb, flags); 2625 intr_restore(r); 2626 } else { 2627 set_pcb_flags_raw(pcb, flags); 2628 } 2629 } 2630 2631 void 2632 clear_pcb_flags(struct pcb *pcb, const u_int flags) 2633 { 2634 2635 __asm __volatile("andl %1,%0" 2636 : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags) 2637 : "cc", "memory"); 2638 } 2639 2640 #ifdef KDB 2641 2642 /* 2643 * Provide inb() and outb() as functions. 
They are normally only available as 2644 * inline functions, thus cannot be called from the debugger. 2645 */ 2646 2647 /* silence compiler warnings */ 2648 u_char inb_(u_short); 2649 void outb_(u_short, u_char); 2650 2651 u_char 2652 inb_(u_short port) 2653 { 2654 return inb(port); 2655 } 2656 2657 void 2658 outb_(u_short port, u_char data) 2659 { 2660 outb(port, data); 2661 } 2662 2663 #endif /* KDB */ 2664