1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 2018 The FreeBSD Foundation 5 * Copyright (c) 1992 Terrence R. Lambert. 6 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * William Jolitz. 11 * 12 * Portions of this software were developed by A. Joseph Koshy under 13 * sponsorship from the FreeBSD Foundation and Google, Inc. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 3. All advertising materials mentioning features or use of this software 24 * must display the following acknowledgement: 25 * This product includes software developed by the University of 26 * California, Berkeley and its contributors. 27 * 4. Neither the name of the University nor the names of its contributors 28 * may be used to endorse or promote products derived from this software 29 * without specific prior written permission. 30 * 31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 41 * SUCH DAMAGE. 42 * 43 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 44 */ 45 46 #include <sys/cdefs.h> 47 __FBSDID("$FreeBSD$"); 48 49 #include "opt_apic.h" 50 #include "opt_atpic.h" 51 #include "opt_cpu.h" 52 #include "opt_ddb.h" 53 #include "opt_inet.h" 54 #include "opt_isa.h" 55 #include "opt_kstack_pages.h" 56 #include "opt_maxmem.h" 57 #include "opt_mp_watchdog.h" 58 #include "opt_perfmon.h" 59 #include "opt_platform.h" 60 61 #include <sys/param.h> 62 #include <sys/proc.h> 63 #include <sys/systm.h> 64 #include <sys/bio.h> 65 #include <sys/buf.h> 66 #include <sys/bus.h> 67 #include <sys/callout.h> 68 #include <sys/cons.h> 69 #include <sys/cpu.h> 70 #include <sys/eventhandler.h> 71 #include <sys/exec.h> 72 #include <sys/imgact.h> 73 #include <sys/kdb.h> 74 #include <sys/kernel.h> 75 #include <sys/ktr.h> 76 #include <sys/linker.h> 77 #include <sys/lock.h> 78 #include <sys/malloc.h> 79 #include <sys/memrange.h> 80 #include <sys/msgbuf.h> 81 #include <sys/mutex.h> 82 #include <sys/pcpu.h> 83 #include <sys/ptrace.h> 84 #include <sys/reboot.h> 85 #include <sys/rwlock.h> 86 #include <sys/sched.h> 87 #include <sys/signalvar.h> 88 #include <sys/smp.h> 89 #include <sys/syscallsubr.h> 90 #include <sys/sysctl.h> 91 #include <sys/sysent.h> 92 #include <sys/sysproto.h> 93 #include <sys/ucontext.h> 94 #include <sys/vmmeter.h> 95 96 #include <vm/vm.h> 97 #include <vm/vm_extern.h> 98 #include <vm/vm_kern.h> 99 #include <vm/vm_page.h> 100 #include <vm/vm_map.h> 
101 #include <vm/vm_object.h> 102 #include <vm/vm_pager.h> 103 #include <vm/vm_param.h> 104 #include <vm/vm_phys.h> 105 106 #ifdef DDB 107 #ifndef KDB 108 #error KDB must be enabled in order for DDB to work! 109 #endif 110 #include <ddb/ddb.h> 111 #include <ddb/db_sym.h> 112 #endif 113 114 #include <isa/rtc.h> 115 116 #include <net/netisr.h> 117 118 #include <machine/bootinfo.h> 119 #include <machine/clock.h> 120 #include <machine/cpu.h> 121 #include <machine/cputypes.h> 122 #include <machine/intr_machdep.h> 123 #include <x86/mca.h> 124 #include <machine/md_var.h> 125 #include <machine/metadata.h> 126 #include <machine/mp_watchdog.h> 127 #include <machine/pc/bios.h> 128 #include <machine/pcb.h> 129 #include <machine/pcb_ext.h> 130 #include <machine/proc.h> 131 #include <machine/reg.h> 132 #include <machine/sigframe.h> 133 #include <machine/specialreg.h> 134 #include <machine/sysarch.h> 135 #include <machine/trap.h> 136 #include <x86/ucode.h> 137 #include <machine/vm86.h> 138 #include <x86/init.h> 139 #ifdef PERFMON 140 #include <machine/perfmon.h> 141 #endif 142 #ifdef SMP 143 #include <machine/smp.h> 144 #endif 145 #ifdef FDT 146 #include <x86/fdt.h> 147 #endif 148 149 #ifdef DEV_APIC 150 #include <x86/apicvar.h> 151 #endif 152 153 #ifdef DEV_ISA 154 #include <x86/isa/icu.h> 155 #endif 156 157 /* Sanity check for __curthread() */ 158 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); 159 160 register_t init386(int first); 161 void dblfault_handler(void); 162 void identify_cpu(void); 163 164 static void cpu_startup(void *); 165 static void fpstate_drop(struct thread *td); 166 static void get_fpcontext(struct thread *td, mcontext_t *mcp, 167 char *xfpusave, size_t xfpusave_len); 168 static int set_fpcontext(struct thread *td, mcontext_t *mcp, 169 char *xfpustate, size_t xfpustate_len); 170 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); 171 172 /* Intel ICH registers */ 173 #define ICH_PMBASE 0x400 174 #define ICH_SMI_EN ICH_PMBASE + 0x30 175 176 int 
_udatasel, _ucodesel; 177 u_int basemem; 178 static int above4g_allow = 1; 179 static int above24g_allow = 0; 180 181 int cold = 1; 182 183 #ifdef COMPAT_43 184 static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); 185 #endif 186 #ifdef COMPAT_FREEBSD4 187 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); 188 #endif 189 190 long Maxmem = 0; 191 long realmem = 0; 192 193 #ifdef PAE 194 FEATURE(pae, "Physical Address Extensions"); 195 #endif 196 197 struct kva_md_info kmi; 198 199 static struct trapframe proc0_tf; 200 struct pcpu __pcpu[MAXCPU]; 201 202 struct mtx icu_lock; 203 204 struct mem_range_softc mem_range_softc; 205 206 extern char start_exceptions[], end_exceptions[]; 207 208 extern struct sysentvec elf32_freebsd_sysvec; 209 210 /* Default init_ops implementation. */ 211 struct init_ops init_ops = { 212 .early_clock_source_init = i8254_init, 213 .early_delay = i8254_delay, 214 #ifdef DEV_APIC 215 .msi_init = msi_init, 216 #endif 217 }; 218 219 static void 220 cpu_startup(dummy) 221 void *dummy; 222 { 223 uintmax_t memsize; 224 char *sysenv; 225 226 /* 227 * On MacBooks, we need to disallow the legacy USB circuit to 228 * generate an SMI# because this can cause several problems, 229 * namely: incorrect CPU frequency detection and failure to 230 * start the APs. 231 * We do this by disabling a bit in the SMI_EN (SMI Control and 232 * Enable register) of the Intel ICH LPC Interface Bridge. 
233 */ 234 sysenv = kern_getenv("smbios.system.product"); 235 if (sysenv != NULL) { 236 if (strncmp(sysenv, "MacBook1,1", 10) == 0 || 237 strncmp(sysenv, "MacBook3,1", 10) == 0 || 238 strncmp(sysenv, "MacBook4,1", 10) == 0 || 239 strncmp(sysenv, "MacBookPro1,1", 13) == 0 || 240 strncmp(sysenv, "MacBookPro1,2", 13) == 0 || 241 strncmp(sysenv, "MacBookPro3,1", 13) == 0 || 242 strncmp(sysenv, "MacBookPro4,1", 13) == 0 || 243 strncmp(sysenv, "Macmini1,1", 10) == 0) { 244 if (bootverbose) 245 printf("Disabling LEGACY_USB_EN bit on " 246 "Intel ICH.\n"); 247 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); 248 } 249 freeenv(sysenv); 250 } 251 252 /* 253 * Good {morning,afternoon,evening,night}. 254 */ 255 startrtclock(); 256 printcpuinfo(); 257 panicifcpuunsupported(); 258 #ifdef PERFMON 259 perfmon_init(); 260 #endif 261 262 /* 263 * Display physical memory if SMBIOS reports reasonable amount. 264 */ 265 memsize = 0; 266 sysenv = kern_getenv("smbios.memory.enabled"); 267 if (sysenv != NULL) { 268 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; 269 freeenv(sysenv); 270 } 271 if (memsize < ptoa((uintmax_t)vm_free_count())) 272 memsize = ptoa((uintmax_t)Maxmem); 273 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); 274 realmem = atop(memsize); 275 276 /* 277 * Display any holes after the first chunk of extended memory. 
278 */ 279 if (bootverbose) { 280 int indx; 281 282 printf("Physical memory chunk(s):\n"); 283 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { 284 vm_paddr_t size; 285 286 size = phys_avail[indx + 1] - phys_avail[indx]; 287 printf( 288 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", 289 (uintmax_t)phys_avail[indx], 290 (uintmax_t)phys_avail[indx + 1] - 1, 291 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); 292 } 293 } 294 295 vm_ksubmap_init(&kmi); 296 297 printf("avail memory = %ju (%ju MB)\n", 298 ptoa((uintmax_t)vm_free_count()), 299 ptoa((uintmax_t)vm_free_count()) / 1048576); 300 301 /* 302 * Set up buffers, so they can be used to read disk labels. 303 */ 304 bufinit(); 305 vm_pager_bufferinit(); 306 cpu_setregs(); 307 } 308 309 /* 310 * Send an interrupt to process. 311 * 312 * Stack is set up to allow sigcode stored 313 * at top to call routine, followed by call 314 * to sigreturn routine below. After sigreturn 315 * resets the signal mask, the stack, and the 316 * frame pointer, it returns to the user 317 * specified pc, psl. 318 */ 319 #ifdef COMPAT_43 320 static void 321 osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 322 { 323 struct osigframe sf, *fp; 324 struct proc *p; 325 struct thread *td; 326 struct sigacts *psp; 327 struct trapframe *regs; 328 int sig; 329 int oonstack; 330 331 td = curthread; 332 p = td->td_proc; 333 PROC_LOCK_ASSERT(p, MA_OWNED); 334 sig = ksi->ksi_signo; 335 psp = p->p_sigacts; 336 mtx_assert(&psp->ps_mtx, MA_OWNED); 337 regs = td->td_frame; 338 oonstack = sigonstack(regs->tf_esp); 339 340 /* Allocate space for the signal handler context. 
*/ 341 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && 342 SIGISMEMBER(psp->ps_sigonstack, sig)) { 343 fp = (struct osigframe *)((uintptr_t)td->td_sigstk.ss_sp + 344 td->td_sigstk.ss_size - sizeof(struct osigframe)); 345 #if defined(COMPAT_43) 346 td->td_sigstk.ss_flags |= SS_ONSTACK; 347 #endif 348 } else 349 fp = (struct osigframe *)regs->tf_esp - 1; 350 351 /* Build the argument list for the signal handler. */ 352 sf.sf_signum = sig; 353 sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; 354 bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo)); 355 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 356 /* Signal handler installed with SA_SIGINFO. */ 357 sf.sf_arg2 = (register_t)&fp->sf_siginfo; 358 sf.sf_siginfo.si_signo = sig; 359 sf.sf_siginfo.si_code = ksi->ksi_code; 360 sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; 361 sf.sf_addr = 0; 362 } else { 363 /* Old FreeBSD-style arguments. */ 364 sf.sf_arg2 = ksi->ksi_code; 365 sf.sf_addr = (register_t)ksi->ksi_addr; 366 sf.sf_ahu.sf_handler = catcher; 367 } 368 mtx_unlock(&psp->ps_mtx); 369 PROC_UNLOCK(p); 370 371 /* Save most if not all of trap frame. */ 372 sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; 373 sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; 374 sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; 375 sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; 376 sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; 377 sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; 378 sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; 379 sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; 380 sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; 381 sf.sf_siginfo.si_sc.sc_es = regs->tf_es; 382 sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; 383 sf.sf_siginfo.si_sc.sc_gs = rgs(); 384 sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; 385 386 /* Build the signal context to be used by osigreturn(). */ 387 sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 
1 : 0; 388 SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); 389 sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; 390 sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; 391 sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; 392 sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; 393 sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; 394 sf.sf_siginfo.si_sc.sc_err = regs->tf_err; 395 396 /* 397 * If we're a vm86 process, we want to save the segment registers. 398 * We also change eflags to be our emulated eflags, not the actual 399 * eflags. 400 */ 401 if (regs->tf_eflags & PSL_VM) { 402 /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */ 403 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 404 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 405 406 sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; 407 sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; 408 sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; 409 sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; 410 411 if (vm86->vm86_has_vme == 0) 412 sf.sf_siginfo.si_sc.sc_ps = 413 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 414 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 415 416 /* See sendsig() for comments. */ 417 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 418 } 419 420 /* 421 * Copy the sigframe out to the user's stack. 
422 */ 423 if (copyout(&sf, fp, sizeof(*fp)) != 0) { 424 PROC_LOCK(p); 425 sigexit(td, SIGILL); 426 } 427 428 regs->tf_esp = (int)fp; 429 if (p->p_sysent->sv_sigcode_base != 0) { 430 regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - 431 szosigcode; 432 } else { 433 /* a.out sysentvec does not use shared page */ 434 regs->tf_eip = p->p_sysent->sv_psstrings - szosigcode; 435 } 436 regs->tf_eflags &= ~(PSL_T | PSL_D); 437 regs->tf_cs = _ucodesel; 438 regs->tf_ds = _udatasel; 439 regs->tf_es = _udatasel; 440 regs->tf_fs = _udatasel; 441 load_gs(_udatasel); 442 regs->tf_ss = _udatasel; 443 PROC_LOCK(p); 444 mtx_lock(&psp->ps_mtx); 445 } 446 #endif /* COMPAT_43 */ 447 448 #ifdef COMPAT_FREEBSD4 449 static void 450 freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 451 { 452 struct sigframe4 sf, *sfp; 453 struct proc *p; 454 struct thread *td; 455 struct sigacts *psp; 456 struct trapframe *regs; 457 int sig; 458 int oonstack; 459 460 td = curthread; 461 p = td->td_proc; 462 PROC_LOCK_ASSERT(p, MA_OWNED); 463 sig = ksi->ksi_signo; 464 psp = p->p_sigacts; 465 mtx_assert(&psp->ps_mtx, MA_OWNED); 466 regs = td->td_frame; 467 oonstack = sigonstack(regs->tf_esp); 468 469 /* Save user context. */ 470 bzero(&sf, sizeof(sf)); 471 sf.sf_uc.uc_sigmask = *mask; 472 sf.sf_uc.uc_stack = td->td_sigstk; 473 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 474 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; 475 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; 476 sf.sf_uc.uc_mcontext.mc_gs = rgs(); 477 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); 478 bzero(sf.sf_uc.uc_mcontext.mc_fpregs, 479 sizeof(sf.sf_uc.uc_mcontext.mc_fpregs)); 480 bzero(sf.sf_uc.uc_mcontext.__spare__, 481 sizeof(sf.sf_uc.uc_mcontext.__spare__)); 482 bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); 483 484 /* Allocate space for the signal handler context. 
*/ 485 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && 486 SIGISMEMBER(psp->ps_sigonstack, sig)) { 487 sfp = (struct sigframe4 *)((uintptr_t)td->td_sigstk.ss_sp + 488 td->td_sigstk.ss_size - sizeof(struct sigframe4)); 489 #if defined(COMPAT_43) 490 td->td_sigstk.ss_flags |= SS_ONSTACK; 491 #endif 492 } else 493 sfp = (struct sigframe4 *)regs->tf_esp - 1; 494 495 /* Build the argument list for the signal handler. */ 496 sf.sf_signum = sig; 497 sf.sf_ucontext = (register_t)&sfp->sf_uc; 498 bzero(&sf.sf_si, sizeof(sf.sf_si)); 499 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 500 /* Signal handler installed with SA_SIGINFO. */ 501 sf.sf_siginfo = (register_t)&sfp->sf_si; 502 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 503 504 /* Fill in POSIX parts */ 505 sf.sf_si.si_signo = sig; 506 sf.sf_si.si_code = ksi->ksi_code; 507 sf.sf_si.si_addr = ksi->ksi_addr; 508 } else { 509 /* Old FreeBSD-style arguments. */ 510 sf.sf_siginfo = ksi->ksi_code; 511 sf.sf_addr = (register_t)ksi->ksi_addr; 512 sf.sf_ahu.sf_handler = catcher; 513 } 514 mtx_unlock(&psp->ps_mtx); 515 PROC_UNLOCK(p); 516 517 /* 518 * If we're a vm86 process, we want to save the segment registers. 519 * We also change eflags to be our emulated eflags, not the actual 520 * eflags. 521 */ 522 if (regs->tf_eflags & PSL_VM) { 523 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 524 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 525 526 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; 527 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; 528 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; 529 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; 530 531 if (vm86->vm86_has_vme == 0) 532 sf.sf_uc.uc_mcontext.mc_eflags = 533 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 534 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 535 536 /* 537 * Clear PSL_NT to inhibit T_TSSFLT faults on return from 538 * syscalls made by the signal handler. This just avoids 539 * wasting time for our lazy fixup of such faults. 
PSL_NT 540 * does nothing in vm86 mode, but vm86 programs can set it 541 * almost legitimately in probes for old cpu types. 542 */ 543 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 544 } 545 546 /* 547 * Copy the sigframe out to the user's stack. 548 */ 549 if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { 550 PROC_LOCK(p); 551 sigexit(td, SIGILL); 552 } 553 554 regs->tf_esp = (int)sfp; 555 regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - 556 szfreebsd4_sigcode; 557 regs->tf_eflags &= ~(PSL_T | PSL_D); 558 regs->tf_cs = _ucodesel; 559 regs->tf_ds = _udatasel; 560 regs->tf_es = _udatasel; 561 regs->tf_fs = _udatasel; 562 regs->tf_ss = _udatasel; 563 PROC_LOCK(p); 564 mtx_lock(&psp->ps_mtx); 565 } 566 #endif /* COMPAT_FREEBSD4 */ 567 568 void 569 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 570 { 571 struct sigframe sf, *sfp; 572 struct proc *p; 573 struct thread *td; 574 struct sigacts *psp; 575 char *sp; 576 struct trapframe *regs; 577 struct segment_descriptor *sdp; 578 char *xfpusave; 579 size_t xfpusave_len; 580 int sig; 581 int oonstack; 582 583 td = curthread; 584 p = td->td_proc; 585 PROC_LOCK_ASSERT(p, MA_OWNED); 586 sig = ksi->ksi_signo; 587 psp = p->p_sigacts; 588 mtx_assert(&psp->ps_mtx, MA_OWNED); 589 #ifdef COMPAT_FREEBSD4 590 if (SIGISMEMBER(psp->ps_freebsd4, sig)) { 591 freebsd4_sendsig(catcher, ksi, mask); 592 return; 593 } 594 #endif 595 #ifdef COMPAT_43 596 if (SIGISMEMBER(psp->ps_osigset, sig)) { 597 osendsig(catcher, ksi, mask); 598 return; 599 } 600 #endif 601 regs = td->td_frame; 602 oonstack = sigonstack(regs->tf_esp); 603 604 if (cpu_max_ext_state_size > sizeof(union savefpu) && use_xsave) { 605 xfpusave_len = cpu_max_ext_state_size - sizeof(union savefpu); 606 xfpusave = __builtin_alloca(xfpusave_len); 607 } else { 608 xfpusave_len = 0; 609 xfpusave = NULL; 610 } 611 612 /* Save user context. 
*/ 613 bzero(&sf, sizeof(sf)); 614 sf.sf_uc.uc_sigmask = *mask; 615 sf.sf_uc.uc_stack = td->td_sigstk; 616 sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 617 ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; 618 sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; 619 sf.sf_uc.uc_mcontext.mc_gs = rgs(); 620 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); 621 sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ 622 get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len); 623 fpstate_drop(td); 624 /* 625 * Unconditionally fill the fsbase and gsbase into the mcontext. 626 */ 627 sdp = &td->td_pcb->pcb_fsd; 628 sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 | 629 sdp->sd_lobase; 630 sdp = &td->td_pcb->pcb_gsd; 631 sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 | 632 sdp->sd_lobase; 633 bzero(sf.sf_uc.uc_mcontext.mc_spare2, 634 sizeof(sf.sf_uc.uc_mcontext.mc_spare2)); 635 636 /* Allocate space for the signal handler context. */ 637 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && 638 SIGISMEMBER(psp->ps_sigonstack, sig)) { 639 sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size; 640 #if defined(COMPAT_43) 641 td->td_sigstk.ss_flags |= SS_ONSTACK; 642 #endif 643 } else 644 sp = (char *)regs->tf_esp - 128; 645 if (xfpusave != NULL) { 646 sp -= xfpusave_len; 647 sp = (char *)((unsigned int)sp & ~0x3F); 648 sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp; 649 } 650 sp -= sizeof(struct sigframe); 651 652 /* Align to 16 bytes. */ 653 sfp = (struct sigframe *)((unsigned int)sp & ~0xF); 654 655 /* Build the argument list for the signal handler. */ 656 sf.sf_signum = sig; 657 sf.sf_ucontext = (register_t)&sfp->sf_uc; 658 bzero(&sf.sf_si, sizeof(sf.sf_si)); 659 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 660 /* Signal handler installed with SA_SIGINFO. 
*/ 661 sf.sf_siginfo = (register_t)&sfp->sf_si; 662 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 663 664 /* Fill in POSIX parts */ 665 sf.sf_si = ksi->ksi_info; 666 sf.sf_si.si_signo = sig; /* maybe a translated signal */ 667 } else { 668 /* Old FreeBSD-style arguments. */ 669 sf.sf_siginfo = ksi->ksi_code; 670 sf.sf_addr = (register_t)ksi->ksi_addr; 671 sf.sf_ahu.sf_handler = catcher; 672 } 673 mtx_unlock(&psp->ps_mtx); 674 PROC_UNLOCK(p); 675 676 /* 677 * If we're a vm86 process, we want to save the segment registers. 678 * We also change eflags to be our emulated eflags, not the actual 679 * eflags. 680 */ 681 if (regs->tf_eflags & PSL_VM) { 682 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 683 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; 684 685 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; 686 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; 687 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; 688 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; 689 690 if (vm86->vm86_has_vme == 0) 691 sf.sf_uc.uc_mcontext.mc_eflags = 692 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 693 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 694 695 /* 696 * Clear PSL_NT to inhibit T_TSSFLT faults on return from 697 * syscalls made by the signal handler. This just avoids 698 * wasting time for our lazy fixup of such faults. PSL_NT 699 * does nothing in vm86 mode, but vm86 programs can set it 700 * almost legitimately in probes for old cpu types. 701 */ 702 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 703 } 704 705 /* 706 * Copy the sigframe out to the user's stack. 
707 */ 708 if (copyout(&sf, sfp, sizeof(*sfp)) != 0 || 709 (xfpusave != NULL && copyout(xfpusave, 710 (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len) 711 != 0)) { 712 PROC_LOCK(p); 713 sigexit(td, SIGILL); 714 } 715 716 regs->tf_esp = (int)sfp; 717 regs->tf_eip = p->p_sysent->sv_sigcode_base; 718 if (regs->tf_eip == 0) 719 regs->tf_eip = p->p_sysent->sv_psstrings - szsigcode; 720 regs->tf_eflags &= ~(PSL_T | PSL_D); 721 regs->tf_cs = _ucodesel; 722 regs->tf_ds = _udatasel; 723 regs->tf_es = _udatasel; 724 regs->tf_fs = _udatasel; 725 regs->tf_ss = _udatasel; 726 PROC_LOCK(p); 727 mtx_lock(&psp->ps_mtx); 728 } 729 730 /* 731 * System call to cleanup state after a signal 732 * has been taken. Reset signal mask and 733 * stack state from context left by sendsig (above). 734 * Return to previous pc and psl as specified by 735 * context left by sendsig. Check carefully to 736 * make sure that the user has not modified the 737 * state to gain improper privileges. 738 * 739 * MPSAFE 740 */ 741 #ifdef COMPAT_43 742 int 743 osigreturn(td, uap) 744 struct thread *td; 745 struct osigreturn_args /* { 746 struct osigcontext *sigcntxp; 747 } */ *uap; 748 { 749 struct osigcontext sc; 750 struct trapframe *regs; 751 struct osigcontext *scp; 752 int eflags, error; 753 ksiginfo_t ksi; 754 755 regs = td->td_frame; 756 error = copyin(uap->sigcntxp, &sc, sizeof(sc)); 757 if (error != 0) 758 return (error); 759 scp = ≻ 760 eflags = scp->sc_ps; 761 if (eflags & PSL_VM) { 762 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 763 struct vm86_kernel *vm86; 764 765 /* 766 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 767 * set up the vm86 area, and we can't enter vm86 mode. 768 */ 769 if (td->td_pcb->pcb_ext == 0) 770 return (EINVAL); 771 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 772 if (vm86->vm86_inited == 0) 773 return (EINVAL); 774 775 /* Go back to user mode if both flags are set. 
*/ 776 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 777 ksiginfo_init_trap(&ksi); 778 ksi.ksi_signo = SIGBUS; 779 ksi.ksi_code = BUS_OBJERR; 780 ksi.ksi_addr = (void *)regs->tf_eip; 781 trapsignal(td, &ksi); 782 } 783 784 if (vm86->vm86_has_vme) { 785 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 786 (eflags & VME_USERCHANGE) | PSL_VM; 787 } else { 788 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 789 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 790 (eflags & VM_USERCHANGE) | PSL_VM; 791 } 792 tf->tf_vm86_ds = scp->sc_ds; 793 tf->tf_vm86_es = scp->sc_es; 794 tf->tf_vm86_fs = scp->sc_fs; 795 tf->tf_vm86_gs = scp->sc_gs; 796 tf->tf_ds = _udatasel; 797 tf->tf_es = _udatasel; 798 tf->tf_fs = _udatasel; 799 } else { 800 /* 801 * Don't allow users to change privileged or reserved flags. 802 */ 803 if (!EFL_SECURE(eflags, regs->tf_eflags)) { 804 return (EINVAL); 805 } 806 807 /* 808 * Don't allow users to load a valid privileged %cs. Let the 809 * hardware check for invalid selectors, excess privilege in 810 * other selectors, invalid %eip's and invalid %esp's. 811 */ 812 if (!CS_SECURE(scp->sc_cs)) { 813 ksiginfo_init_trap(&ksi); 814 ksi.ksi_signo = SIGBUS; 815 ksi.ksi_code = BUS_OBJERR; 816 ksi.ksi_trapno = T_PROTFLT; 817 ksi.ksi_addr = (void *)regs->tf_eip; 818 trapsignal(td, &ksi); 819 return (EINVAL); 820 } 821 regs->tf_ds = scp->sc_ds; 822 regs->tf_es = scp->sc_es; 823 regs->tf_fs = scp->sc_fs; 824 } 825 826 /* Restore remaining registers. 
*/ 827 regs->tf_eax = scp->sc_eax; 828 regs->tf_ebx = scp->sc_ebx; 829 regs->tf_ecx = scp->sc_ecx; 830 regs->tf_edx = scp->sc_edx; 831 regs->tf_esi = scp->sc_esi; 832 regs->tf_edi = scp->sc_edi; 833 regs->tf_cs = scp->sc_cs; 834 regs->tf_ss = scp->sc_ss; 835 regs->tf_isp = scp->sc_isp; 836 regs->tf_ebp = scp->sc_fp; 837 regs->tf_esp = scp->sc_sp; 838 regs->tf_eip = scp->sc_pc; 839 regs->tf_eflags = eflags; 840 841 #if defined(COMPAT_43) 842 if (scp->sc_onstack & 1) 843 td->td_sigstk.ss_flags |= SS_ONSTACK; 844 else 845 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 846 #endif 847 kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL, 848 SIGPROCMASK_OLD); 849 return (EJUSTRETURN); 850 } 851 #endif /* COMPAT_43 */ 852 853 #ifdef COMPAT_FREEBSD4 854 /* 855 * MPSAFE 856 */ 857 int 858 freebsd4_sigreturn(td, uap) 859 struct thread *td; 860 struct freebsd4_sigreturn_args /* { 861 const ucontext4 *sigcntxp; 862 } */ *uap; 863 { 864 struct ucontext4 uc; 865 struct trapframe *regs; 866 struct ucontext4 *ucp; 867 int cs, eflags, error; 868 ksiginfo_t ksi; 869 870 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 871 if (error != 0) 872 return (error); 873 ucp = &uc; 874 regs = td->td_frame; 875 eflags = ucp->uc_mcontext.mc_eflags; 876 if (eflags & PSL_VM) { 877 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 878 struct vm86_kernel *vm86; 879 880 /* 881 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 882 * set up the vm86 area, and we can't enter vm86 mode. 883 */ 884 if (td->td_pcb->pcb_ext == 0) 885 return (EINVAL); 886 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 887 if (vm86->vm86_inited == 0) 888 return (EINVAL); 889 890 /* Go back to user mode if both flags are set. 
*/ 891 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 892 ksiginfo_init_trap(&ksi); 893 ksi.ksi_signo = SIGBUS; 894 ksi.ksi_code = BUS_OBJERR; 895 ksi.ksi_addr = (void *)regs->tf_eip; 896 trapsignal(td, &ksi); 897 } 898 if (vm86->vm86_has_vme) { 899 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 900 (eflags & VME_USERCHANGE) | PSL_VM; 901 } else { 902 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 903 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 904 (eflags & VM_USERCHANGE) | PSL_VM; 905 } 906 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); 907 tf->tf_eflags = eflags; 908 tf->tf_vm86_ds = tf->tf_ds; 909 tf->tf_vm86_es = tf->tf_es; 910 tf->tf_vm86_fs = tf->tf_fs; 911 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; 912 tf->tf_ds = _udatasel; 913 tf->tf_es = _udatasel; 914 tf->tf_fs = _udatasel; 915 } else { 916 /* 917 * Don't allow users to change privileged or reserved flags. 918 */ 919 if (!EFL_SECURE(eflags, regs->tf_eflags)) { 920 uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n", 921 td->td_proc->p_pid, td->td_name, eflags); 922 return (EINVAL); 923 } 924 925 /* 926 * Don't allow users to load a valid privileged %cs. Let the 927 * hardware check for invalid selectors, excess privilege in 928 * other selectors, invalid %eip's and invalid %esp's. 
929 */ 930 cs = ucp->uc_mcontext.mc_cs; 931 if (!CS_SECURE(cs)) { 932 uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n", 933 td->td_proc->p_pid, td->td_name, cs); 934 ksiginfo_init_trap(&ksi); 935 ksi.ksi_signo = SIGBUS; 936 ksi.ksi_code = BUS_OBJERR; 937 ksi.ksi_trapno = T_PROTFLT; 938 ksi.ksi_addr = (void *)regs->tf_eip; 939 trapsignal(td, &ksi); 940 return (EINVAL); 941 } 942 943 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); 944 } 945 946 #if defined(COMPAT_43) 947 if (ucp->uc_mcontext.mc_onstack & 1) 948 td->td_sigstk.ss_flags |= SS_ONSTACK; 949 else 950 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 951 #endif 952 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); 953 return (EJUSTRETURN); 954 } 955 #endif /* COMPAT_FREEBSD4 */ 956 957 /* 958 * MPSAFE 959 */ 960 int 961 sys_sigreturn(td, uap) 962 struct thread *td; 963 struct sigreturn_args /* { 964 const struct __ucontext *sigcntxp; 965 } */ *uap; 966 { 967 ucontext_t uc; 968 struct proc *p; 969 struct trapframe *regs; 970 ucontext_t *ucp; 971 char *xfpustate; 972 size_t xfpustate_len; 973 int cs, eflags, error, ret; 974 ksiginfo_t ksi; 975 976 p = td->td_proc; 977 978 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 979 if (error != 0) 980 return (error); 981 ucp = &uc; 982 if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { 983 uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid, 984 td->td_name, ucp->uc_mcontext.mc_flags); 985 return (EINVAL); 986 } 987 regs = td->td_frame; 988 eflags = ucp->uc_mcontext.mc_eflags; 989 if (eflags & PSL_VM) { 990 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 991 struct vm86_kernel *vm86; 992 993 /* 994 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 995 * set up the vm86 area, and we can't enter vm86 mode. 996 */ 997 if (td->td_pcb->pcb_ext == 0) 998 return (EINVAL); 999 vm86 = &td->td_pcb->pcb_ext->ext_vm86; 1000 if (vm86->vm86_inited == 0) 1001 return (EINVAL); 1002 1003 /* Go back to user mode if both flags are set. 
*/ 1004 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { 1005 ksiginfo_init_trap(&ksi); 1006 ksi.ksi_signo = SIGBUS; 1007 ksi.ksi_code = BUS_OBJERR; 1008 ksi.ksi_addr = (void *)regs->tf_eip; 1009 trapsignal(td, &ksi); 1010 } 1011 1012 if (vm86->vm86_has_vme) { 1013 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 1014 (eflags & VME_USERCHANGE) | PSL_VM; 1015 } else { 1016 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 1017 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 1018 (eflags & VM_USERCHANGE) | PSL_VM; 1019 } 1020 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); 1021 tf->tf_eflags = eflags; 1022 tf->tf_vm86_ds = tf->tf_ds; 1023 tf->tf_vm86_es = tf->tf_es; 1024 tf->tf_vm86_fs = tf->tf_fs; 1025 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; 1026 tf->tf_ds = _udatasel; 1027 tf->tf_es = _udatasel; 1028 tf->tf_fs = _udatasel; 1029 } else { 1030 /* 1031 * Don't allow users to change privileged or reserved flags. 1032 */ 1033 if (!EFL_SECURE(eflags, regs->tf_eflags)) { 1034 uprintf("pid %d (%s): sigreturn eflags = 0x%x\n", 1035 td->td_proc->p_pid, td->td_name, eflags); 1036 return (EINVAL); 1037 } 1038 1039 /* 1040 * Don't allow users to load a valid privileged %cs. Let the 1041 * hardware check for invalid selectors, excess privilege in 1042 * other selectors, invalid %eip's and invalid %esp's. 
1043 */ 1044 cs = ucp->uc_mcontext.mc_cs; 1045 if (!CS_SECURE(cs)) { 1046 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", 1047 td->td_proc->p_pid, td->td_name, cs); 1048 ksiginfo_init_trap(&ksi); 1049 ksi.ksi_signo = SIGBUS; 1050 ksi.ksi_code = BUS_OBJERR; 1051 ksi.ksi_trapno = T_PROTFLT; 1052 ksi.ksi_addr = (void *)regs->tf_eip; 1053 trapsignal(td, &ksi); 1054 return (EINVAL); 1055 } 1056 1057 if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { 1058 xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; 1059 if (xfpustate_len > cpu_max_ext_state_size - 1060 sizeof(union savefpu)) { 1061 uprintf( 1062 "pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", 1063 p->p_pid, td->td_name, xfpustate_len); 1064 return (EINVAL); 1065 } 1066 xfpustate = __builtin_alloca(xfpustate_len); 1067 error = copyin((const void *)uc.uc_mcontext.mc_xfpustate, 1068 xfpustate, xfpustate_len); 1069 if (error != 0) { 1070 uprintf( 1071 "pid %d (%s): sigreturn copying xfpustate failed\n", 1072 p->p_pid, td->td_name); 1073 return (error); 1074 } 1075 } else { 1076 xfpustate = NULL; 1077 xfpustate_len = 0; 1078 } 1079 ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, 1080 xfpustate_len); 1081 if (ret != 0) 1082 return (ret); 1083 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); 1084 } 1085 1086 #if defined(COMPAT_43) 1087 if (ucp->uc_mcontext.mc_onstack & 1) 1088 td->td_sigstk.ss_flags |= SS_ONSTACK; 1089 else 1090 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 1091 #endif 1092 1093 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); 1094 return (EJUSTRETURN); 1095 } 1096 1097 #ifdef COMPAT_43 1098 static void 1099 setup_priv_lcall_gate(struct proc *p) 1100 { 1101 struct i386_ldt_args uap; 1102 union descriptor desc; 1103 u_int lcall_addr; 1104 1105 bzero(&uap, sizeof(uap)); 1106 uap.start = 0; 1107 uap.num = 1; 1108 lcall_addr = p->p_sysent->sv_psstrings - sz_lcall_tramp; 1109 bzero(&desc, sizeof(desc)); 1110 desc.sd.sd_type = SDT_MEMERA; 1111 desc.sd.sd_dpl = SEL_UPL; 1112 desc.sd.sd_p = 1; 
1113 desc.sd.sd_def32 = 1; 1114 desc.sd.sd_gran = 1; 1115 desc.sd.sd_lolimit = 0xffff; 1116 desc.sd.sd_hilimit = 0xf; 1117 desc.sd.sd_lobase = lcall_addr; 1118 desc.sd.sd_hibase = lcall_addr >> 24; 1119 i386_set_ldt(curthread, &uap, &desc); 1120 } 1121 #endif 1122 1123 /* 1124 * Reset registers to default values on exec. 1125 */ 1126 void 1127 exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) 1128 { 1129 struct trapframe *regs; 1130 struct pcb *pcb; 1131 register_t saved_eflags; 1132 1133 regs = td->td_frame; 1134 pcb = td->td_pcb; 1135 1136 /* Reset pc->pcb_gs and %gs before possibly invalidating it. */ 1137 pcb->pcb_gs = _udatasel; 1138 load_gs(_udatasel); 1139 1140 mtx_lock_spin(&dt_lock); 1141 if (td->td_proc->p_md.md_ldt != NULL) 1142 user_ldt_free(td); 1143 else 1144 mtx_unlock_spin(&dt_lock); 1145 1146 #ifdef COMPAT_43 1147 if (td->td_proc->p_sysent->sv_psstrings != 1148 elf32_freebsd_sysvec.sv_psstrings) 1149 setup_priv_lcall_gate(td->td_proc); 1150 #endif 1151 1152 /* 1153 * Reset the fs and gs bases. The values from the old address 1154 * space do not make sense for the new program. In particular, 1155 * gsbase might be the TLS base for the old program but the new 1156 * program has no TLS now. 1157 */ 1158 set_fsbase(td, 0); 1159 set_gsbase(td, 0); 1160 1161 /* Make sure edx is 0x0 on entry. Linux binaries depend on it. */ 1162 saved_eflags = regs->tf_eflags & PSL_T; 1163 bzero((char *)regs, sizeof(struct trapframe)); 1164 regs->tf_eip = imgp->entry_addr; 1165 regs->tf_esp = stack; 1166 regs->tf_eflags = PSL_USER | saved_eflags; 1167 regs->tf_ss = _udatasel; 1168 regs->tf_ds = _udatasel; 1169 regs->tf_es = _udatasel; 1170 regs->tf_fs = _udatasel; 1171 regs->tf_cs = _ucodesel; 1172 1173 /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ 1174 regs->tf_ebx = imgp->ps_strings; 1175 1176 /* 1177 * Reset the hardware debug registers if they were in use. 1178 * They won't have any meaning for the newly exec'd process. 
1179 */ 1180 if (pcb->pcb_flags & PCB_DBREGS) { 1181 pcb->pcb_dr0 = 0; 1182 pcb->pcb_dr1 = 0; 1183 pcb->pcb_dr2 = 0; 1184 pcb->pcb_dr3 = 0; 1185 pcb->pcb_dr6 = 0; 1186 pcb->pcb_dr7 = 0; 1187 if (pcb == curpcb) { 1188 /* 1189 * Clear the debug registers on the running 1190 * CPU, otherwise they will end up affecting 1191 * the next process we switch to. 1192 */ 1193 reset_dbregs(); 1194 } 1195 pcb->pcb_flags &= ~PCB_DBREGS; 1196 } 1197 1198 pcb->pcb_initial_npxcw = __INITIAL_NPXCW__; 1199 1200 /* 1201 * Drop the FP state if we hold it, so that the process gets a 1202 * clean FP state if it uses the FPU again. 1203 */ 1204 fpstate_drop(td); 1205 } 1206 1207 void 1208 cpu_setregs(void) 1209 { 1210 unsigned int cr0; 1211 1212 cr0 = rcr0(); 1213 1214 /* 1215 * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support: 1216 * 1217 * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT 1218 * instructions. We must set the CR0_MP bit and use the CR0_TS 1219 * bit to control the trap, because setting the CR0_EM bit does 1220 * not cause WAIT instructions to trap. It's important to trap 1221 * WAIT instructions - otherwise the "wait" variants of no-wait 1222 * control instructions would degenerate to the "no-wait" variants 1223 * after FP context switches but work correctly otherwise. It's 1224 * particularly important to trap WAITs when there is no NPX - 1225 * otherwise the "wait" variants would always degenerate. 1226 * 1227 * Try setting CR0_NE to get correct error reporting on 486DX's. 1228 * Setting it should fail or do nothing on lesser processors. 
1229 */ 1230 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; 1231 load_cr0(cr0); 1232 load_gs(_udatasel); 1233 } 1234 1235 u_long bootdev; /* not a struct cdev *- encoding is different */ 1236 SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, 1237 CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)"); 1238 1239 static char bootmethod[16] = "BIOS"; 1240 SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0, 1241 "System firmware boot method"); 1242 1243 /* 1244 * Initialize 386 and configure to run kernel 1245 */ 1246 1247 /* 1248 * Initialize segments & interrupt table 1249 */ 1250 1251 int _default_ldt; 1252 1253 struct mtx dt_lock; /* lock for GDT and LDT */ 1254 1255 union descriptor gdt0[NGDT]; /* initial global descriptor table */ 1256 union descriptor *gdt = gdt0; /* global descriptor table */ 1257 1258 union descriptor *ldt; /* local descriptor table */ 1259 1260 static struct gate_descriptor idt0[NIDT]; 1261 struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ 1262 1263 static struct i386tss *dblfault_tss; 1264 static char *dblfault_stack; 1265 1266 static struct i386tss common_tss0; 1267 1268 vm_offset_t proc0kstack; 1269 1270 /* 1271 * software prototypes -- in more palatable form. 
/*
 * software prototypes -- in more palatable form.
 *
 * One initializer per GDT slot, listed in selector-index order (the
 * number in each entry's comment is its index).  Each entry spells out
 * base, limit, type, privilege level, present bit, default operand size
 * and granularity.
 *
 * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret
 * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it)
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 0,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GPRIV_SEL	1 SMP Per-Processor Private Data Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUFS_SEL	2 %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS_SEL	3 %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	6 Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
{	.ssd_base = 0x400,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor (base is 0 in this template) */
{
	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct i386tss)-1,
	.ssd_type = SDT_SYS386TSS,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GLDT_SEL	10 LDT Descriptor (limit sized for NLDT descriptors) */
{	.ssd_base = 0,
	.ssd_limit = sizeof(union descriptor) * NLDT - 1,
	.ssd_type = SDT_SYSLDT,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 User LDT Descriptor per process (limit sized for 512) */
{	.ssd_base = 0,
	.ssd_limit = (512 * sizeof(union descriptor)-1),
	.ssd_type = SDT_SYSLDT,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GPANIC_SEL	12 Panic Tss Descriptor */
{	.ssd_base = 0,
	.ssd_limit = sizeof(struct i386tss)-1,
	.ssd_type = SDT_SYS386TSS,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */
{	.ssd_base = 0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */
{	.ssd_base = 0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */
{	.ssd_base = 0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */
{	.ssd_base = 0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */
{	.ssd_base = 0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GNDIS_SEL	18 NDIS Descriptor (left not-present in this template) */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
Descriptor - overwritten by call gate */ 1489 { .ssd_base = 0x0, 1490 .ssd_limit = 0x0, 1491 .ssd_type = 0, 1492 .ssd_dpl = 0, 1493 .ssd_p = 0, 1494 .ssd_xx = 0, .ssd_xx1 = 0, 1495 .ssd_def32 = 0, 1496 .ssd_gran = 0 }, 1497 /* Data Descriptor for user */ 1498 { .ssd_base = 0x0, 1499 .ssd_limit = 0xfffff, 1500 .ssd_type = SDT_MEMRWA, 1501 .ssd_dpl = SEL_UPL, 1502 .ssd_p = 1, 1503 .ssd_xx = 0, .ssd_xx1 = 0, 1504 .ssd_def32 = 1, 1505 .ssd_gran = 1 }, 1506 }; 1507 1508 uintptr_t setidt_disp; 1509 1510 void 1511 setidt(int idx, inthand_t *func, int typ, int dpl, int selec) 1512 { 1513 uintptr_t off; 1514 1515 off = func != NULL ? (uintptr_t)func + setidt_disp : 0; 1516 setidt_nodisp(idx, off, typ, dpl, selec); 1517 } 1518 1519 void 1520 setidt_nodisp(int idx, uintptr_t off, int typ, int dpl, int selec) 1521 { 1522 struct gate_descriptor *ip; 1523 1524 ip = idt + idx; 1525 ip->gd_looffset = off; 1526 ip->gd_selector = selec; 1527 ip->gd_stkcpy = 0; 1528 ip->gd_xx = 0; 1529 ip->gd_type = typ; 1530 ip->gd_dpl = dpl; 1531 ip->gd_p = 1; 1532 ip->gd_hioffset = ((u_int)off) >> 16 ; 1533 } 1534 1535 extern inthand_t 1536 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), 1537 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), 1538 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), 1539 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), 1540 IDTVEC(xmm), 1541 #ifdef KDTRACE_HOOKS 1542 IDTVEC(dtrace_ret), 1543 #endif 1544 #ifdef XENHVM 1545 IDTVEC(xen_intr_upcall), 1546 #endif 1547 IDTVEC(int0x80_syscall); 1548 1549 #ifdef DDB 1550 /* 1551 * Display the index and function name of any IDT entries that don't use 1552 * the default 'rsvd' entry point. 
1553 */ 1554 DB_SHOW_COMMAND(idt, db_show_idt) 1555 { 1556 struct gate_descriptor *ip; 1557 int idx; 1558 uintptr_t func, func_trm; 1559 bool trm; 1560 1561 ip = idt; 1562 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { 1563 if (ip->gd_type == SDT_SYSTASKGT) { 1564 db_printf("%3d\t<TASK>\n", idx); 1565 } else { 1566 func = (ip->gd_hioffset << 16 | ip->gd_looffset); 1567 if (func >= PMAP_TRM_MIN_ADDRESS) { 1568 func_trm = func; 1569 func -= setidt_disp; 1570 trm = true; 1571 } else 1572 trm = false; 1573 if (func != (uintptr_t)&IDTVEC(rsvd)) { 1574 db_printf("%3d\t", idx); 1575 db_printsym(func, DB_STGY_PROC); 1576 if (trm) 1577 db_printf(" (trampoline %#x)", 1578 func_trm); 1579 db_printf("\n"); 1580 } 1581 } 1582 ip++; 1583 } 1584 } 1585 1586 /* Show privileged registers. */ 1587 DB_SHOW_COMMAND(sysregs, db_show_sysregs) 1588 { 1589 uint64_t idtr, gdtr; 1590 1591 idtr = ridt(); 1592 db_printf("idtr\t0x%08x/%04x\n", 1593 (u_int)(idtr >> 16), (u_int)idtr & 0xffff); 1594 gdtr = rgdt(); 1595 db_printf("gdtr\t0x%08x/%04x\n", 1596 (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff); 1597 db_printf("ldtr\t0x%04x\n", rldt()); 1598 db_printf("tr\t0x%04x\n", rtr()); 1599 db_printf("cr0\t0x%08x\n", rcr0()); 1600 db_printf("cr2\t0x%08x\n", rcr2()); 1601 db_printf("cr3\t0x%08x\n", rcr3()); 1602 db_printf("cr4\t0x%08x\n", rcr4()); 1603 if (rcr4() & CR4_XSAVE) 1604 db_printf("xcr0\t0x%016llx\n", rxcr(0)); 1605 if (amd_feature & (AMDID_NX | AMDID_LM)) 1606 db_printf("EFER\t0x%016llx\n", rdmsr(MSR_EFER)); 1607 if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX)) 1608 db_printf("FEATURES_CTL\t0x%016llx\n", 1609 rdmsr(MSR_IA32_FEATURE_CONTROL)); 1610 if (((cpu_vendor_id == CPU_VENDOR_INTEL || 1611 cpu_vendor_id == CPU_VENDOR_AMD) && CPUID_TO_FAMILY(cpu_id) >= 6) || 1612 cpu_vendor_id == CPU_VENDOR_HYGON) 1613 db_printf("DEBUG_CTL\t0x%016llx\n", rdmsr(MSR_DEBUGCTLMSR)); 1614 if (cpu_feature & CPUID_PAT) 1615 db_printf("PAT\t0x%016llx\n", rdmsr(MSR_PAT)); 1616 } 1617 1618 DB_SHOW_COMMAND(dbregs, 
db_show_dbregs) 1619 { 1620 1621 db_printf("dr0\t0x%08x\n", rdr0()); 1622 db_printf("dr1\t0x%08x\n", rdr1()); 1623 db_printf("dr2\t0x%08x\n", rdr2()); 1624 db_printf("dr3\t0x%08x\n", rdr3()); 1625 db_printf("dr6\t0x%08x\n", rdr6()); 1626 db_printf("dr7\t0x%08x\n", rdr7()); 1627 } 1628 1629 DB_SHOW_COMMAND(frame, db_show_frame) 1630 { 1631 struct trapframe *frame; 1632 1633 frame = have_addr ? (struct trapframe *)addr : curthread->td_frame; 1634 printf("ss %#x esp %#x efl %#x cs %#x eip %#x\n", 1635 frame->tf_ss, frame->tf_esp, frame->tf_eflags, frame->tf_cs, 1636 frame->tf_eip); 1637 printf("err %#x trapno %d\n", frame->tf_err, frame->tf_trapno); 1638 printf("ds %#x es %#x fs %#x\n", 1639 frame->tf_ds, frame->tf_es, frame->tf_fs); 1640 printf("eax %#x ecx %#x edx %#x ebx %#x\n", 1641 frame->tf_eax, frame->tf_ecx, frame->tf_edx, frame->tf_ebx); 1642 printf("ebp %#x esi %#x edi %#x\n", 1643 frame->tf_ebp, frame->tf_esi, frame->tf_edi); 1644 1645 } 1646 #endif 1647 1648 void 1649 sdtossd(sd, ssd) 1650 struct segment_descriptor *sd; 1651 struct soft_segment_descriptor *ssd; 1652 { 1653 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; 1654 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; 1655 ssd->ssd_type = sd->sd_type; 1656 ssd->ssd_dpl = sd->sd_dpl; 1657 ssd->ssd_p = sd->sd_p; 1658 ssd->ssd_def32 = sd->sd_def32; 1659 ssd->ssd_gran = sd->sd_gran; 1660 } 1661 1662 static int 1663 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, 1664 int *physmap_idxp) 1665 { 1666 uint64_t lim, ign; 1667 int i, insert_idx, physmap_idx; 1668 1669 physmap_idx = *physmap_idxp; 1670 1671 if (length == 0) 1672 return (1); 1673 1674 lim = 0x100000000; /* 4G */ 1675 if (pae_mode && above4g_allow) 1676 lim = above24g_allow ? 
-1ULL : 0x600000000; /* 24G */ 1677 if (base >= lim) { 1678 printf("%uK of memory above %uGB ignored, pae %d " 1679 "above4g_allow %d above24g_allow %d\n", 1680 (u_int)(length / 1024), (u_int)(lim >> 30), pae_mode, 1681 above4g_allow, above24g_allow); 1682 return (1); 1683 } 1684 if (base + length >= lim) { 1685 ign = base + length - lim; 1686 length -= ign; 1687 printf("%uK of memory above %uGB ignored, pae %d " 1688 "above4g_allow %d above24g_allow %d\n", 1689 (u_int)(ign / 1024), (u_int)(lim >> 30), pae_mode, 1690 above4g_allow, above24g_allow); 1691 } 1692 1693 /* 1694 * Find insertion point while checking for overlap. Start off by 1695 * assuming the new entry will be added to the end. 1696 */ 1697 insert_idx = physmap_idx + 2; 1698 for (i = 0; i <= physmap_idx; i += 2) { 1699 if (base < physmap[i + 1]) { 1700 if (base + length <= physmap[i]) { 1701 insert_idx = i; 1702 break; 1703 } 1704 if (boothowto & RB_VERBOSE) 1705 printf( 1706 "Overlapping memory regions, ignoring second region\n"); 1707 return (1); 1708 } 1709 } 1710 1711 /* See if we can prepend to the next entry. */ 1712 if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) { 1713 physmap[insert_idx] = base; 1714 return (1); 1715 } 1716 1717 /* See if we can append to the previous entry. */ 1718 if (insert_idx > 0 && base == physmap[insert_idx - 1]) { 1719 physmap[insert_idx - 1] += length; 1720 return (1); 1721 } 1722 1723 physmap_idx += 2; 1724 *physmap_idxp = physmap_idx; 1725 if (physmap_idx == PHYS_AVAIL_ENTRIES) { 1726 printf( 1727 "Too many segments in the physical address map, giving up\n"); 1728 return (0); 1729 } 1730 1731 /* 1732 * Move the last 'N' entries down to make room for the new 1733 * entry if needed. 1734 */ 1735 for (i = physmap_idx; i > insert_idx; i -= 2) { 1736 physmap[i] = physmap[i - 2]; 1737 physmap[i + 1] = physmap[i - 1]; 1738 } 1739 1740 /* Insert the new entry. 
*/ 1741 physmap[insert_idx] = base; 1742 physmap[insert_idx + 1] = base + length; 1743 return (1); 1744 } 1745 1746 static int 1747 add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp) 1748 { 1749 if (boothowto & RB_VERBOSE) 1750 printf("SMAP type=%02x base=%016llx len=%016llx\n", 1751 smap->type, smap->base, smap->length); 1752 1753 if (smap->type != SMAP_TYPE_MEMORY) 1754 return (1); 1755 1756 return (add_physmap_entry(smap->base, smap->length, physmap, 1757 physmap_idxp)); 1758 } 1759 1760 static void 1761 add_smap_entries(struct bios_smap *smapbase, vm_paddr_t *physmap, 1762 int *physmap_idxp) 1763 { 1764 struct bios_smap *smap, *smapend; 1765 u_int32_t smapsize; 1766 /* 1767 * Memory map from INT 15:E820. 1768 * 1769 * subr_module.c says: 1770 * "Consumer may safely assume that size value precedes data." 1771 * ie: an int32_t immediately precedes SMAP. 1772 */ 1773 smapsize = *((u_int32_t *)smapbase - 1); 1774 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); 1775 1776 for (smap = smapbase; smap < smapend; smap++) 1777 if (!add_smap_entry(smap, physmap, physmap_idxp)) 1778 break; 1779 } 1780 1781 static void 1782 basemem_setup(void) 1783 { 1784 1785 if (basemem > 640) { 1786 printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", 1787 basemem); 1788 basemem = 640; 1789 } 1790 1791 pmap_basemem_setup(basemem); 1792 } 1793 1794 /* 1795 * Populate the (physmap) array with base/bound pairs describing the 1796 * available physical memory in the system, then test this memory and 1797 * build the phys_avail array describing the actually-available memory. 1798 * 1799 * If we cannot accurately determine the physical memory map, then use 1800 * value from the 0xE801 call, and failing that, the RTC. 1801 * 1802 * Total memory size may be set by the kernel environment variable 1803 * hw.physmem or the compile-time define MAXMEM. 1804 * 1805 * XXX first should be vm_paddr_t. 
/*
 * Build the physical memory map and derive phys_avail[]/dump_avail[].
 *
 * Stages, in order:
 *  1. Fill the local physmap[] with (start, end) pairs, from the
 *     loader-supplied SMAP if there is one, else from INT 15:E820 via
 *     vm86, else from a two-segment guess (base + extended memory).
 *  2. Compute Maxmem, applying MAXMEM and the hw.physmem tunable.
 *  3. Call pmap_bootstrap(first).
 *  4. Walk every page of every physmap segment, skipping the kernel
 *     image and the dcons buffer, optionally testing the page, and
 *     coalesce the results into phys_avail[] and dump_avail[].
 *  5. Carve the message buffer out of the end of the last chunk and map it.
 *
 * 'first' is the first physical address past the kernel and preloaded
 * data (XXX should be vm_paddr_t).
 */
static void
getmemsize(int first)
{
	int has_smap, off, physmap_idx, pa_indx, da_indx;
	u_long memtest;
	vm_paddr_t physmap[PHYS_AVAIL_ENTRIES];
	quad_t dcons_addr, dcons_size, physmem_tunable;
	int hasbrokenint12, i, res;
	u_int extmem;
	struct vm86frame vmf;
	struct vm86context vmc;
	vm_paddr_t pa;
	struct bios_smap *smap, *smapbase;
	caddr_t kmdp;

	has_smap = 0;
	bzero(&vmf, sizeof(vmf));
	bzero(physmap, sizeof(physmap));
	basemem = 0;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_add_seg((vm_paddr_t)KERNLOAD, trunc_page(first));

	TUNABLE_INT_FETCH("hw.above4g_allow", &above4g_allow);
	TUNABLE_INT_FETCH("hw.above24g_allow", &above24g_allow);

	/*
	 * Check if the loader supplied an SMAP memory map.  If so,
	 * use that and do not make any VM86 calls.
	 */
	physmap_idx = 0;
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf32 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase != NULL) {
		add_smap_entries(smapbase, physmap, &physmap_idx);
		has_smap = 1;
		goto have_smap;
	}

	/*
	 * Some newer BIOSes have a broken INT 12H implementation
	 * which causes a kernel panic immediately.  In this case, we
	 * need to use the SMAP to determine the base memory size.
	 */
	hasbrokenint12 = 0;
	TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12);
	if (hasbrokenint12 == 0) {
		/* Use INT12 to determine base memory size. */
		vm86_intcall(0x12, &vmf);
		basemem = vmf.vmf_ax;
		basemem_setup();
	}

	/*
	 * Fetch the memory map with INT 15:E820.  Map page 1 R/W into
	 * the kernel page table so we can use it as a buffer.  The
	 * kernel will unmap this page later.
	 */
	vmc.npages = 0;
	smap = (void *)vm86_addpage(&vmc, 1, PMAP_MAP_LOW + ptoa(1));
	res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);
	KASSERT(res != 0, ("vm86_getptr() failed: address not found"));

	/*
	 * One record per call; %ebx carries the continuation value between
	 * calls and reads back as 0 after the last record.
	 */
	vmf.vmf_ebx = 0;
	do {
		vmf.vmf_eax = 0xE820;
		vmf.vmf_edx = SMAP_SIG;
		vmf.vmf_ecx = sizeof(struct bios_smap);
		i = vm86_datacall(0x15, &vmf, &vmc);
		if (i || vmf.vmf_eax != SMAP_SIG)
			break;
		has_smap = 1;
		if (!add_smap_entry(smap, physmap, &physmap_idx))
			break;
	} while (vmf.vmf_ebx != 0);

have_smap:
	/*
	 * If we didn't fetch the "base memory" size from INT12,
	 * figure it out from the SMAP (or just guess).
	 */
	if (basemem == 0) {
		for (i = 0; i <= physmap_idx; i += 2) {
			if (physmap[i] == 0x00000000) {
				basemem = physmap[i + 1] / 1024;
				break;
			}
		}

		/* XXX: If we couldn't find basemem from SMAP, just guess. */
		if (basemem == 0)
			basemem = 640;
		basemem_setup();
	}

	/* A non-zero end for the first pair means physmap was populated. */
	if (physmap[1] != 0)
		goto physmap_done;

	/*
	 * If we failed to find an SMAP, figure out the extended
	 * memory size.  We will then build a simple memory map with
	 * two segments, one for "base memory" and the second for
	 * "extended memory".  Note that "extended memory" starts at a
	 * physical address of 1MB and that both basemem and extmem
	 * are in units of 1KB.
	 *
	 * First, try to fetch the extended memory size via INT 15:E801.
	 */
	vmf.vmf_ax = 0xE801;
	if (vm86_intcall(0x15, &vmf) == 0) {
		/* %cx is added as KB; %dx is scaled by 64 (64KB units). */
		extmem = vmf.vmf_cx + vmf.vmf_dx * 64;
	} else {
		/*
		 * If INT15:E801 fails, this is our last ditch effort
		 * to determine the extended memory size.  Currently
		 * we prefer the RTC value over INT15:88.
		 */
#if 0
		vmf.vmf_ah = 0x88;
		vm86_intcall(0x15, &vmf);
		extmem = vmf.vmf_ax;
#else
		extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8);
#endif
	}

	/*
	 * Special hack for chipsets that still remap the 384k hole when
	 * there's 16MB of memory - this really confuses people that
	 * are trying to use bus mastering ISA controllers with the
	 * "16MB limit"; they only have 16MB, but the remapping puts
	 * them beyond the limit.
	 *
	 * If extended memory is between 15-16MB (16-17MB phys address range),
	 * chop it to 15MB.
	 */
	if ((extmem > 15 * 1024) && (extmem < 16 * 1024))
		extmem = 15 * 1024;

	physmap[0] = 0;
	physmap[1] = basemem * 1024;
	physmap_idx = 2;
	physmap[physmap_idx] = 0x100000;
	physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;

physmap_done:
	/*
	 * Now, physmap contains a map of physical memory.
	 */

#ifdef SMP
	/* make hole for AP bootstrap code */
	alloc_ap_trampoline(physmap, &physmap_idx);
#endif

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 *
	 * This is especially confusing when it is much larger than the
	 * memory size and is displayed as "realmem".
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	/* MAXMEM is divided by 4 to get pages (i.e. it is in KB, 4KB pages). */
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_QUAD_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend
	 * the amount of memory in the system.
	 */
	if (has_smap && Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * If Maxmem has been increased beyond what the system has detected,
	 * extend the last memory segment to the new limit.
	 */
	if (atop(physmap[physmap_idx + 1]) < Maxmem)
		physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * phys_avail[] starts with an empty chunk at PAGE_SIZE; pa_indx
	 * always indexes the end of the chunk currently being grown.
	 * dump_avail[] is seeded at index 1 the same way.
	 */
	physmap[0] = PAGE_SIZE;		/* mask off page 0 */
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];

	/*
	 * Get dcons buffer address
	 *
	 * Both dcons.addr and dcons.size must be present; otherwise
	 * dcons_addr is zeroed, which disables the exclusion below (and
	 * keeps a possibly unset dcons_size from being read).
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		/* Never walk past Maxmem, whatever the segment says. */
		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 * (It is still recorded in dump_avail[].)
			 */
			if (pa >= KERNLOAD && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			ptr = (int *)pmap_cmap3(pa, PG_V | PG_RW | PG_N);

			/* Only the first word of the page is probed. */
			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 * A bad page is left out of both phys_avail[] and
			 * dump_avail[].
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 *
			 * NOTE(review): an earlier version of this comment
			 * spoke of bumping Maxmem during a speculative
			 * memory test; nothing in this loop modifies Maxmem,
			 * so that remark appears to be stale.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ENTRIES) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					/* Record the page for dumps, then stop. */
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == PHYS_AVAIL_ENTRIES) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			/* Leaves only the inner (per-page) loop. */
			if (full)
				break;
		}
	}
	/* Tear down the temporary test mapping. */
	pmap_cmap3(0, 0);

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
		    off);
}

/*
 * Bring up the kernel debugger: load the kernel symbol table for DDB
 * (when configured), initialise KDB, and enter the debugger straight
 * away if the RB_KDB boot flag was given.
 */
static void
i386_kdb_init(void)
{
#ifdef DDB
	db_fetch_ksymtab(bootinfo.bi_symtab, bootinfo.bi_esymtab);
#endif
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/*
 * Add setidt_disp to the handler offset of every interrupt and trap gate
 * in the IDT.  Other gate types (e.g. task gates) are left untouched.
 * Each offset is asserted to lie within [start_exceptions,
 * end_exceptions) before the adjustment and within the PMAP_TRM range
 * afterwards.
 */
static void
fixup_idt(void)
{
	struct gate_descriptor *ip;
	uintptr_t off;
	int x;

	for (x = 0; x < NIDT; x++) {
		ip = &idt[x];
		if (ip->gd_type != SDT_SYS386IGT &&
		    ip->gd_type != SDT_SYS386TGT)
			continue;
		/* Reassemble the 32-bit offset from its two halves. */
		off = ip->gd_looffset + (((u_int)ip->gd_hioffset) << 16);
		KASSERT(off >= (uintptr_t)start_exceptions &&
		    off < (uintptr_t)end_exceptions,
		    ("IDT[%d] type %d off %#x", x, ip->gd_type, off));
		off += setidt_disp;
		MPASS(off >= PMAP_TRM_MIN_ADDRESS &&
		    off < PMAP_TRM_MAX_ADDRESS);
		ip->gd_looffset = off;
		ip->gd_hioffset = off >> 16;
	}
}
2240 GSEL(GCODE_SEL, SEL_KPL)); 2241 setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, 2242 SEL_KPL)); 2243 setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386IGT, 2244 SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 2245 setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386IGT, SEL_KPL, 2246 GSEL(GCODE_SEL, SEL_KPL)); 2247 setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386IGT, SEL_KPL, 2248 GSEL(GCODE_SEL, SEL_KPL)); 2249 setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386IGT, SEL_KPL, 2250 GSEL(GCODE_SEL, SEL_KPL)); 2251 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL, 2252 GSEL(GCODE_SEL, SEL_KPL)); 2253 setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, 2254 GSEL(GCODE_SEL, SEL_KPL)); 2255 setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386IGT, SEL_KPL, 2256 GSEL(GCODE_SEL, SEL_KPL)); 2257 setidt(IDT_AC, &IDTVEC(align), SDT_SYS386IGT, SEL_KPL, 2258 GSEL(GCODE_SEL, SEL_KPL)); 2259 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386IGT, SEL_KPL, 2260 GSEL(GCODE_SEL, SEL_KPL)); 2261 setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386IGT, SEL_KPL, 2262 GSEL(GCODE_SEL, SEL_KPL)); 2263 setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), 2264 SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); 2265 #ifdef KDTRACE_HOOKS 2266 setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), 2267 SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); 2268 #endif 2269 #ifdef XENHVM 2270 setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), 2271 SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 2272 #endif 2273 } 2274 2275 static void 2276 i386_setidt2(void) 2277 { 2278 2279 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL, 2280 GSEL(GCODE_SEL, SEL_KPL)); 2281 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL, 2282 GSEL(GCODE_SEL, SEL_KPL)); 2283 } 2284 2285 #if defined(DEV_ISA) && !defined(DEV_ATPIC) 2286 static void 2287 i386_setidt3(void) 2288 { 2289 2290 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), 2291 SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 2292 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), 2293 SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 2294 } 2295 
#endif 2296 2297 register_t 2298 init386(int first) 2299 { 2300 struct region_descriptor r_gdt, r_idt; /* table descriptors */ 2301 int gsel_tss, metadata_missing, x, pa; 2302 struct pcpu *pc; 2303 struct xstate_hdr *xhdr; 2304 caddr_t kmdp; 2305 vm_offset_t addend; 2306 size_t ucode_len; 2307 int late_console; 2308 2309 thread0.td_kstack = proc0kstack; 2310 thread0.td_kstack_pages = TD0_KSTACK_PAGES; 2311 2312 /* 2313 * This may be done better later if it gets more high level 2314 * components in it. If so just link td->td_proc here. 2315 */ 2316 proc_linkup0(&proc0, &thread0); 2317 2318 if (bootinfo.bi_modulep) { 2319 metadata_missing = 0; 2320 addend = (vm_paddr_t)bootinfo.bi_modulep < KERNBASE ? 2321 PMAP_MAP_LOW : 0; 2322 preload_metadata = (caddr_t)bootinfo.bi_modulep + addend; 2323 preload_bootstrap_relocate(addend); 2324 } else { 2325 metadata_missing = 1; 2326 } 2327 2328 if (bootinfo.bi_envp != 0) { 2329 addend = (vm_paddr_t)bootinfo.bi_envp < KERNBASE ? 2330 PMAP_MAP_LOW : 0; 2331 init_static_kenv((char *)bootinfo.bi_envp + addend, 0); 2332 } else { 2333 init_static_kenv(NULL, 0); 2334 } 2335 2336 /* 2337 * Re-evaluate CPU features if we loaded a microcode update. 2338 */ 2339 ucode_len = ucode_load_bsp(first); 2340 if (ucode_len != 0) { 2341 identify_cpu(); 2342 first = roundup2(first + ucode_len, PAGE_SIZE); 2343 } 2344 2345 identify_hypervisor(); 2346 2347 /* Init basic tunables, hz etc */ 2348 init_param1(); 2349 2350 /* 2351 * Make gdt memory segments. All segments cover the full 4GB 2352 * of address space and permissions are enforced at page level. 
2353 */ 2354 gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1); 2355 gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1); 2356 gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1); 2357 gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1); 2358 gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1); 2359 gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1); 2360 2361 pc = &__pcpu[0]; 2362 gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1); 2363 gdt_segs[GPRIV_SEL].ssd_base = (int)pc; 2364 gdt_segs[GPROC0_SEL].ssd_base = (int)&common_tss0; 2365 2366 for (x = 0; x < NGDT; x++) 2367 ssdtosd(&gdt_segs[x], &gdt0[x].sd); 2368 2369 r_gdt.rd_limit = NGDT * sizeof(gdt0[0]) - 1; 2370 r_gdt.rd_base = (int)gdt0; 2371 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN); 2372 lgdt(&r_gdt); 2373 2374 pcpu_init(pc, 0, sizeof(struct pcpu)); 2375 for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE) 2376 pmap_kenter(pa, pa); 2377 dpcpu_init((void *)first, 0); 2378 first += DPCPU_SIZE; 2379 PCPU_SET(prvspace, pc); 2380 PCPU_SET(curthread, &thread0); 2381 /* Non-late cninit() and printf() can be moved up to here. */ 2382 2383 /* 2384 * Initialize mutexes. 2385 * 2386 * icu_lock: in order to allow an interrupt to occur in a critical 2387 * section, to set pcpu->ipending (etc...) properly, we 2388 * must be able to get the icu lock, so it can't be 2389 * under witness. 2390 */ 2391 mutex_init(); 2392 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); 2393 2394 i386_setidt1(); 2395 2396 r_idt.rd_limit = sizeof(idt0) - 1; 2397 r_idt.rd_base = (int) idt; 2398 lidt(&r_idt); 2399 2400 /* 2401 * Initialize the clock before the console so that console 2402 * initialization can use DELAY(). 
2403 */ 2404 clock_init(); 2405 2406 finishidentcpu(); /* Final stage of CPU initialization */ 2407 i386_setidt2(); 2408 pmap_set_nx(); 2409 initializecpu(); /* Initialize CPU registers */ 2410 initializecpucache(); 2411 2412 /* pointer to selector slot for %fs/%gs */ 2413 PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); 2414 2415 /* Initialize the tss (except for the final esp0) early for vm86. */ 2416 common_tss0.tss_esp0 = thread0.td_kstack + thread0.td_kstack_pages * 2417 PAGE_SIZE - VM86_STACK_SPACE; 2418 common_tss0.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL); 2419 common_tss0.tss_ioopt = sizeof(struct i386tss) << 16; 2420 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 2421 PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); 2422 PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); 2423 ltr(gsel_tss); 2424 2425 /* Initialize the PIC early for vm86 calls. */ 2426 #ifdef DEV_ISA 2427 #ifdef DEV_ATPIC 2428 elcr_probe(); 2429 atpic_startup(); 2430 #else 2431 /* Reset and mask the atpics and leave them shut down. */ 2432 atpic_reset(); 2433 2434 /* 2435 * Point the ICU spurious interrupt vectors at the APIC spurious 2436 * interrupt handler. 2437 */ 2438 i386_setidt3(); 2439 #endif 2440 #endif 2441 2442 /* 2443 * The console and kdb should be initialized even earlier than here, 2444 * but some console drivers don't work until after getmemsize(). 2445 * Default to late console initialization to support these drivers. 2446 * This loses mainly printf()s in getmemsize() and early debugging. 
2447 */ 2448 late_console = 1; 2449 TUNABLE_INT_FETCH("debug.late_console", &late_console); 2450 if (!late_console) { 2451 cninit(); 2452 i386_kdb_init(); 2453 } 2454 2455 kmdp = preload_search_by_type("elf kernel"); 2456 link_elf_ireloc(kmdp); 2457 2458 vm86_initialize(); 2459 getmemsize(first); 2460 init_param2(physmem); 2461 2462 /* now running on new page tables, configured,and u/iom is accessible */ 2463 2464 if (late_console) 2465 cninit(); 2466 2467 if (metadata_missing) 2468 printf("WARNING: loader(8) metadata is missing!\n"); 2469 2470 if (late_console) 2471 i386_kdb_init(); 2472 2473 msgbufinit(msgbufp, msgbufsize); 2474 npxinit(true); 2475 /* 2476 * Set up thread0 pcb after npxinit calculated pcb + fpu save 2477 * area size. Zero out the extended state header in fpu save 2478 * area. 2479 */ 2480 thread0.td_pcb = get_pcb_td(&thread0); 2481 thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0); 2482 bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size); 2483 if (use_xsave) { 2484 xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) + 2485 1); 2486 xhdr->xstate_bv = xsave_mask; 2487 } 2488 PCPU_SET(curpcb, thread0.td_pcb); 2489 /* Move esp0 in the tss to its final place. 
*/ 2490 /* Note: -16 is so we can grow the trapframe if we came from vm86 */ 2491 common_tss0.tss_esp0 = (vm_offset_t)thread0.td_pcb - VM86_STACK_SPACE; 2492 PCPU_SET(kesp0, common_tss0.tss_esp0); 2493 gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; /* clear busy bit */ 2494 ltr(gsel_tss); 2495 2496 /* transfer to user mode */ 2497 2498 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); 2499 _udatasel = GSEL(GUDATA_SEL, SEL_UPL); 2500 2501 /* setup proc 0's pcb */ 2502 thread0.td_pcb->pcb_flags = 0; 2503 thread0.td_pcb->pcb_cr3 = pmap_get_kcr3(); 2504 thread0.td_pcb->pcb_ext = 0; 2505 thread0.td_frame = &proc0_tf; 2506 2507 cpu_probe_amdc1e(); 2508 2509 #ifdef FDT 2510 x86_init_fdt(); 2511 #endif 2512 2513 /* Location of kernel stack for locore */ 2514 return ((register_t)thread0.td_pcb); 2515 } 2516 2517 static void 2518 machdep_init_trampoline(void) 2519 { 2520 struct region_descriptor r_gdt, r_idt; 2521 struct i386tss *tss; 2522 char *copyout_buf, *trampoline, *tramp_stack_base; 2523 int x; 2524 2525 gdt = pmap_trm_alloc(sizeof(union descriptor) * NGDT * mp_ncpus, 2526 M_NOWAIT | M_ZERO); 2527 bcopy(gdt0, gdt, sizeof(union descriptor) * NGDT); 2528 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; 2529 r_gdt.rd_base = (int)gdt; 2530 lgdt(&r_gdt); 2531 2532 tss = pmap_trm_alloc(sizeof(struct i386tss) * mp_ncpus, 2533 M_NOWAIT | M_ZERO); 2534 bcopy(&common_tss0, tss, sizeof(struct i386tss)); 2535 gdt[GPROC0_SEL].sd.sd_lobase = (int)tss; 2536 gdt[GPROC0_SEL].sd.sd_hibase = (u_int)tss >> 24; 2537 gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; 2538 2539 PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); 2540 PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); 2541 PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); 2542 PCPU_SET(common_tssp, tss); 2543 ltr(GSEL(GPROC0_SEL, SEL_KPL)); 2544 2545 trampoline = pmap_trm_alloc(end_exceptions - start_exceptions, 2546 M_NOWAIT); 2547 bcopy(start_exceptions, trampoline, end_exceptions - start_exceptions); 2548 tramp_stack_base = pmap_trm_alloc(TRAMP_STACK_SZ, M_NOWAIT); 2549 
PCPU_SET(trampstk, (uintptr_t)tramp_stack_base + TRAMP_STACK_SZ - 2550 VM86_STACK_SPACE); 2551 tss[0].tss_esp0 = PCPU_GET(trampstk); 2552 2553 idt = pmap_trm_alloc(sizeof(idt0), M_NOWAIT | M_ZERO); 2554 bcopy(idt0, idt, sizeof(idt0)); 2555 2556 /* Re-initialize new IDT since the handlers were relocated */ 2557 setidt_disp = trampoline - start_exceptions; 2558 fixup_idt(); 2559 2560 r_idt.rd_limit = sizeof(struct gate_descriptor) * NIDT - 1; 2561 r_idt.rd_base = (int)idt; 2562 lidt(&r_idt); 2563 2564 /* dblfault TSS */ 2565 dblfault_tss = pmap_trm_alloc(sizeof(struct i386tss), M_NOWAIT | M_ZERO); 2566 dblfault_stack = pmap_trm_alloc(PAGE_SIZE, M_NOWAIT); 2567 dblfault_tss->tss_esp = dblfault_tss->tss_esp0 = 2568 dblfault_tss->tss_esp1 = dblfault_tss->tss_esp2 = 2569 (int)dblfault_stack + PAGE_SIZE; 2570 dblfault_tss->tss_ss = dblfault_tss->tss_ss0 = dblfault_tss->tss_ss1 = 2571 dblfault_tss->tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); 2572 dblfault_tss->tss_cr3 = pmap_get_kcr3(); 2573 dblfault_tss->tss_eip = (int)dblfault_handler; 2574 dblfault_tss->tss_eflags = PSL_KERNEL; 2575 dblfault_tss->tss_ds = dblfault_tss->tss_es = 2576 dblfault_tss->tss_gs = GSEL(GDATA_SEL, SEL_KPL); 2577 dblfault_tss->tss_fs = GSEL(GPRIV_SEL, SEL_KPL); 2578 dblfault_tss->tss_cs = GSEL(GCODE_SEL, SEL_KPL); 2579 dblfault_tss->tss_ldt = GSEL(GLDT_SEL, SEL_KPL); 2580 gdt[GPANIC_SEL].sd.sd_lobase = (int)dblfault_tss; 2581 gdt[GPANIC_SEL].sd.sd_hibase = (u_int)dblfault_tss >> 24; 2582 2583 /* make ldt memory segments */ 2584 ldt = pmap_trm_alloc(sizeof(union descriptor) * NLDT, 2585 M_NOWAIT | M_ZERO); 2586 gdt[GLDT_SEL].sd.sd_lobase = (int)ldt; 2587 gdt[GLDT_SEL].sd.sd_hibase = (u_int)ldt >> 24; 2588 ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); 2589 ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); 2590 for (x = 0; x < nitems(ldt_segs); x++) 2591 ssdtosd(&ldt_segs[x], &ldt[x].sd); 2592 2593 _default_ldt = GSEL(GLDT_SEL, SEL_KPL); 2594 lldt(_default_ldt); 2595 PCPU_SET(currentldt, _default_ldt); 2596 2597 
copyout_buf = pmap_trm_alloc(TRAMP_COPYOUT_SZ, M_NOWAIT); 2598 PCPU_SET(copyout_buf, copyout_buf); 2599 copyout_init_tramp(); 2600 } 2601 SYSINIT(vm_mem, SI_SUB_VM, SI_ORDER_SECOND, machdep_init_trampoline, NULL); 2602 2603 #ifdef COMPAT_43 2604 static void 2605 i386_setup_lcall_gate(void) 2606 { 2607 struct sysentvec *sv; 2608 struct user_segment_descriptor desc; 2609 u_int lcall_addr; 2610 2611 sv = &elf32_freebsd_sysvec; 2612 lcall_addr = (uintptr_t)sv->sv_psstrings - sz_lcall_tramp; 2613 2614 bzero(&desc, sizeof(desc)); 2615 desc.sd_type = SDT_MEMERA; 2616 desc.sd_dpl = SEL_UPL; 2617 desc.sd_p = 1; 2618 desc.sd_def32 = 1; 2619 desc.sd_gran = 1; 2620 desc.sd_lolimit = 0xffff; 2621 desc.sd_hilimit = 0xf; 2622 desc.sd_lobase = lcall_addr; 2623 desc.sd_hibase = lcall_addr >> 24; 2624 bcopy(&desc, &ldt[LSYS5CALLS_SEL], sizeof(desc)); 2625 } 2626 SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_ANY, i386_setup_lcall_gate, NULL); 2627 #endif 2628 2629 void 2630 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) 2631 { 2632 2633 pcpu->pc_acpi_id = 0xffffffff; 2634 } 2635 2636 static int 2637 smap_sysctl_handler(SYSCTL_HANDLER_ARGS) 2638 { 2639 struct bios_smap *smapbase; 2640 struct bios_smap_xattr smap; 2641 caddr_t kmdp; 2642 uint32_t *smapattr; 2643 int count, error, i; 2644 2645 /* Retrieve the system memory map from the loader. 
*/ 2646 kmdp = preload_search_by_type("elf kernel"); 2647 if (kmdp == NULL) 2648 kmdp = preload_search_by_type("elf32 kernel"); 2649 smapbase = (struct bios_smap *)preload_search_info(kmdp, 2650 MODINFO_METADATA | MODINFOMD_SMAP); 2651 if (smapbase == NULL) 2652 return (0); 2653 smapattr = (uint32_t *)preload_search_info(kmdp, 2654 MODINFO_METADATA | MODINFOMD_SMAP_XATTR); 2655 count = *((u_int32_t *)smapbase - 1) / sizeof(*smapbase); 2656 error = 0; 2657 for (i = 0; i < count; i++) { 2658 smap.base = smapbase[i].base; 2659 smap.length = smapbase[i].length; 2660 smap.type = smapbase[i].type; 2661 if (smapattr != NULL) 2662 smap.xattr = smapattr[i]; 2663 else 2664 smap.xattr = 0; 2665 error = SYSCTL_OUT(req, &smap, sizeof(smap)); 2666 } 2667 return (error); 2668 } 2669 SYSCTL_PROC(_machdep, OID_AUTO, smap, 2670 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 2671 smap_sysctl_handler, "S,bios_smap_xattr", 2672 "Raw BIOS SMAP data"); 2673 2674 void 2675 spinlock_enter(void) 2676 { 2677 struct thread *td; 2678 register_t flags; 2679 2680 td = curthread; 2681 if (td->td_md.md_spinlock_count == 0) { 2682 flags = intr_disable(); 2683 td->td_md.md_spinlock_count = 1; 2684 td->td_md.md_saved_flags = flags; 2685 critical_enter(); 2686 } else 2687 td->td_md.md_spinlock_count++; 2688 } 2689 2690 void 2691 spinlock_exit(void) 2692 { 2693 struct thread *td; 2694 register_t flags; 2695 2696 td = curthread; 2697 flags = td->td_md.md_saved_flags; 2698 td->td_md.md_spinlock_count--; 2699 if (td->td_md.md_spinlock_count == 0) { 2700 critical_exit(); 2701 intr_restore(flags); 2702 } 2703 } 2704 2705 #if defined(I586_CPU) && !defined(NO_F00F_HACK) 2706 static void f00f_hack(void *unused); 2707 SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL); 2708 2709 static void 2710 f00f_hack(void *unused) 2711 { 2712 struct region_descriptor r_idt; 2713 struct gate_descriptor *new_idt; 2714 vm_offset_t tmp; 2715 2716 if (!has_f00f_bug) 2717 return; 2718 2719 
GIANT_REQUIRED; 2720 2721 printf("Intel Pentium detected, installing workaround for F00F bug\n"); 2722 2723 tmp = (vm_offset_t)pmap_trm_alloc(PAGE_SIZE * 3, M_NOWAIT | M_ZERO); 2724 if (tmp == 0) 2725 panic("kmem_malloc returned 0"); 2726 tmp = round_page(tmp); 2727 2728 /* Put the problematic entry (#6) at the end of the lower page. */ 2729 new_idt = (struct gate_descriptor *) 2730 (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor)); 2731 bcopy(idt, new_idt, sizeof(idt0)); 2732 r_idt.rd_base = (u_int)new_idt; 2733 r_idt.rd_limit = sizeof(idt0) - 1; 2734 lidt(&r_idt); 2735 /* SMP machines do not need the F00F hack. */ 2736 idt = new_idt; 2737 pmap_protect(kernel_pmap, tmp, tmp + PAGE_SIZE, VM_PROT_READ); 2738 } 2739 #endif /* defined(I586_CPU) && !NO_F00F_HACK */ 2740 2741 /* 2742 * Construct a PCB from a trapframe. This is called from kdb_trap() where 2743 * we want to start a backtrace from the function that caused us to enter 2744 * the debugger. We have the context in the trapframe, but base the trace 2745 * on the PCB. The PCB doesn't have to be perfect, as long as it contains 2746 * enough for a backtrace. 2747 */ 2748 void 2749 makectx(struct trapframe *tf, struct pcb *pcb) 2750 { 2751 2752 pcb->pcb_edi = tf->tf_edi; 2753 pcb->pcb_esi = tf->tf_esi; 2754 pcb->pcb_ebp = tf->tf_ebp; 2755 pcb->pcb_ebx = tf->tf_ebx; 2756 pcb->pcb_eip = tf->tf_eip; 2757 pcb->pcb_esp = (ISPL(tf->tf_cs)) ? 
tf->tf_esp : (int)(tf + 1) - 8; 2758 pcb->pcb_gs = rgs(); 2759 } 2760 2761 int 2762 ptrace_set_pc(struct thread *td, u_long addr) 2763 { 2764 2765 td->td_frame->tf_eip = addr; 2766 return (0); 2767 } 2768 2769 int 2770 ptrace_single_step(struct thread *td) 2771 { 2772 2773 PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); 2774 if ((td->td_frame->tf_eflags & PSL_T) == 0) { 2775 td->td_frame->tf_eflags |= PSL_T; 2776 td->td_dbgflags |= TDB_STEP; 2777 } 2778 return (0); 2779 } 2780 2781 int 2782 ptrace_clear_single_step(struct thread *td) 2783 { 2784 2785 PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); 2786 td->td_frame->tf_eflags &= ~PSL_T; 2787 td->td_dbgflags &= ~TDB_STEP; 2788 return (0); 2789 } 2790 2791 int 2792 fill_regs(struct thread *td, struct reg *regs) 2793 { 2794 struct pcb *pcb; 2795 struct trapframe *tp; 2796 2797 tp = td->td_frame; 2798 pcb = td->td_pcb; 2799 regs->r_gs = pcb->pcb_gs; 2800 return (fill_frame_regs(tp, regs)); 2801 } 2802 2803 int 2804 fill_frame_regs(struct trapframe *tp, struct reg *regs) 2805 { 2806 2807 regs->r_fs = tp->tf_fs; 2808 regs->r_es = tp->tf_es; 2809 regs->r_ds = tp->tf_ds; 2810 regs->r_edi = tp->tf_edi; 2811 regs->r_esi = tp->tf_esi; 2812 regs->r_ebp = tp->tf_ebp; 2813 regs->r_ebx = tp->tf_ebx; 2814 regs->r_edx = tp->tf_edx; 2815 regs->r_ecx = tp->tf_ecx; 2816 regs->r_eax = tp->tf_eax; 2817 regs->r_eip = tp->tf_eip; 2818 regs->r_cs = tp->tf_cs; 2819 regs->r_eflags = tp->tf_eflags; 2820 regs->r_esp = tp->tf_esp; 2821 regs->r_ss = tp->tf_ss; 2822 regs->r_err = 0; 2823 regs->r_trapno = 0; 2824 return (0); 2825 } 2826 2827 int 2828 set_regs(struct thread *td, struct reg *regs) 2829 { 2830 struct pcb *pcb; 2831 struct trapframe *tp; 2832 2833 tp = td->td_frame; 2834 if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) || 2835 !CS_SECURE(regs->r_cs)) 2836 return (EINVAL); 2837 pcb = td->td_pcb; 2838 tp->tf_fs = regs->r_fs; 2839 tp->tf_es = regs->r_es; 2840 tp->tf_ds = regs->r_ds; 2841 tp->tf_edi = regs->r_edi; 2842 tp->tf_esi = regs->r_esi; 2843 
tp->tf_ebp = regs->r_ebp; 2844 tp->tf_ebx = regs->r_ebx; 2845 tp->tf_edx = regs->r_edx; 2846 tp->tf_ecx = regs->r_ecx; 2847 tp->tf_eax = regs->r_eax; 2848 tp->tf_eip = regs->r_eip; 2849 tp->tf_cs = regs->r_cs; 2850 tp->tf_eflags = regs->r_eflags; 2851 tp->tf_esp = regs->r_esp; 2852 tp->tf_ss = regs->r_ss; 2853 pcb->pcb_gs = regs->r_gs; 2854 return (0); 2855 } 2856 2857 int 2858 fill_fpregs(struct thread *td, struct fpreg *fpregs) 2859 { 2860 2861 KASSERT(td == curthread || TD_IS_SUSPENDED(td) || 2862 P_SHOULDSTOP(td->td_proc), 2863 ("not suspended thread %p", td)); 2864 npxgetregs(td); 2865 if (cpu_fxsr) 2866 npx_fill_fpregs_xmm(&get_pcb_user_save_td(td)->sv_xmm, 2867 (struct save87 *)fpregs); 2868 else 2869 bcopy(&get_pcb_user_save_td(td)->sv_87, fpregs, 2870 sizeof(*fpregs)); 2871 return (0); 2872 } 2873 2874 int 2875 set_fpregs(struct thread *td, struct fpreg *fpregs) 2876 { 2877 2878 critical_enter(); 2879 if (cpu_fxsr) 2880 npx_set_fpregs_xmm((struct save87 *)fpregs, 2881 &get_pcb_user_save_td(td)->sv_xmm); 2882 else 2883 bcopy(fpregs, &get_pcb_user_save_td(td)->sv_87, 2884 sizeof(*fpregs)); 2885 npxuserinited(td); 2886 critical_exit(); 2887 return (0); 2888 } 2889 2890 /* 2891 * Get machine context. 
2892 */ 2893 int 2894 get_mcontext(struct thread *td, mcontext_t *mcp, int flags) 2895 { 2896 struct trapframe *tp; 2897 struct segment_descriptor *sdp; 2898 2899 tp = td->td_frame; 2900 2901 PROC_LOCK(curthread->td_proc); 2902 mcp->mc_onstack = sigonstack(tp->tf_esp); 2903 PROC_UNLOCK(curthread->td_proc); 2904 mcp->mc_gs = td->td_pcb->pcb_gs; 2905 mcp->mc_fs = tp->tf_fs; 2906 mcp->mc_es = tp->tf_es; 2907 mcp->mc_ds = tp->tf_ds; 2908 mcp->mc_edi = tp->tf_edi; 2909 mcp->mc_esi = tp->tf_esi; 2910 mcp->mc_ebp = tp->tf_ebp; 2911 mcp->mc_isp = tp->tf_isp; 2912 mcp->mc_eflags = tp->tf_eflags; 2913 if (flags & GET_MC_CLEAR_RET) { 2914 mcp->mc_eax = 0; 2915 mcp->mc_edx = 0; 2916 mcp->mc_eflags &= ~PSL_C; 2917 } else { 2918 mcp->mc_eax = tp->tf_eax; 2919 mcp->mc_edx = tp->tf_edx; 2920 } 2921 mcp->mc_ebx = tp->tf_ebx; 2922 mcp->mc_ecx = tp->tf_ecx; 2923 mcp->mc_eip = tp->tf_eip; 2924 mcp->mc_cs = tp->tf_cs; 2925 mcp->mc_esp = tp->tf_esp; 2926 mcp->mc_ss = tp->tf_ss; 2927 mcp->mc_len = sizeof(*mcp); 2928 get_fpcontext(td, mcp, NULL, 0); 2929 sdp = &td->td_pcb->pcb_fsd; 2930 mcp->mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; 2931 sdp = &td->td_pcb->pcb_gsd; 2932 mcp->mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; 2933 mcp->mc_flags = 0; 2934 mcp->mc_xfpustate = 0; 2935 mcp->mc_xfpustate_len = 0; 2936 bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2)); 2937 return (0); 2938 } 2939 2940 /* 2941 * Set machine context. 2942 * 2943 * However, we don't set any but the user modifiable flags, and we won't 2944 * touch the cs selector. 
2945 */ 2946 int 2947 set_mcontext(struct thread *td, mcontext_t *mcp) 2948 { 2949 struct trapframe *tp; 2950 char *xfpustate; 2951 int eflags, ret; 2952 2953 tp = td->td_frame; 2954 if (mcp->mc_len != sizeof(*mcp) || 2955 (mcp->mc_flags & ~_MC_FLAG_MASK) != 0) 2956 return (EINVAL); 2957 eflags = (mcp->mc_eflags & PSL_USERCHANGE) | 2958 (tp->tf_eflags & ~PSL_USERCHANGE); 2959 if (mcp->mc_flags & _MC_HASFPXSTATE) { 2960 if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - 2961 sizeof(union savefpu)) 2962 return (EINVAL); 2963 xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); 2964 ret = copyin((void *)mcp->mc_xfpustate, xfpustate, 2965 mcp->mc_xfpustate_len); 2966 if (ret != 0) 2967 return (ret); 2968 } else 2969 xfpustate = NULL; 2970 ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); 2971 if (ret != 0) 2972 return (ret); 2973 tp->tf_fs = mcp->mc_fs; 2974 tp->tf_es = mcp->mc_es; 2975 tp->tf_ds = mcp->mc_ds; 2976 tp->tf_edi = mcp->mc_edi; 2977 tp->tf_esi = mcp->mc_esi; 2978 tp->tf_ebp = mcp->mc_ebp; 2979 tp->tf_ebx = mcp->mc_ebx; 2980 tp->tf_edx = mcp->mc_edx; 2981 tp->tf_ecx = mcp->mc_ecx; 2982 tp->tf_eax = mcp->mc_eax; 2983 tp->tf_eip = mcp->mc_eip; 2984 tp->tf_eflags = eflags; 2985 tp->tf_esp = mcp->mc_esp; 2986 tp->tf_ss = mcp->mc_ss; 2987 td->td_pcb->pcb_gs = mcp->mc_gs; 2988 return (0); 2989 } 2990 2991 static void 2992 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, 2993 size_t xfpusave_len) 2994 { 2995 size_t max_len, len; 2996 2997 mcp->mc_ownedfp = npxgetregs(td); 2998 bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], 2999 sizeof(mcp->mc_fpstate)); 3000 mcp->mc_fpformat = npxformat(); 3001 if (!use_xsave || xfpusave_len == 0) 3002 return; 3003 max_len = cpu_max_ext_state_size - sizeof(union savefpu); 3004 len = xfpusave_len; 3005 if (len > max_len) { 3006 len = max_len; 3007 bzero(xfpusave + max_len, len - max_len); 3008 } 3009 mcp->mc_flags |= _MC_HASFPXSTATE; 3010 mcp->mc_xfpustate_len = len; 3011 
bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); 3012 } 3013 3014 static int 3015 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, 3016 size_t xfpustate_len) 3017 { 3018 int error; 3019 3020 if (mcp->mc_fpformat == _MC_FPFMT_NODEV) 3021 return (0); 3022 else if (mcp->mc_fpformat != _MC_FPFMT_387 && 3023 mcp->mc_fpformat != _MC_FPFMT_XMM) 3024 return (EINVAL); 3025 else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { 3026 /* We don't care what state is left in the FPU or PCB. */ 3027 fpstate_drop(td); 3028 error = 0; 3029 } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || 3030 mcp->mc_ownedfp == _MC_FPOWNED_PCB) { 3031 error = npxsetregs(td, (union savefpu *)&mcp->mc_fpstate, 3032 xfpustate, xfpustate_len); 3033 } else 3034 return (EINVAL); 3035 return (error); 3036 } 3037 3038 static void 3039 fpstate_drop(struct thread *td) 3040 { 3041 3042 KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); 3043 critical_enter(); 3044 if (PCPU_GET(fpcurthread) == td) 3045 npxdrop(); 3046 /* 3047 * XXX force a full drop of the npx. The above only drops it if we 3048 * owned it. npxgetregs() has the same bug in the !cpu_fxsr case. 3049 * 3050 * XXX I don't much like npxgetregs()'s semantics of doing a full 3051 * drop. Dropping only to the pcb matches fnsave's behaviour. 3052 * We only need to drop to !PCB_INITDONE in sendsig(). But 3053 * sendsig() is the only caller of npxgetregs()... perhaps we just 3054 * have too many layers. 
3055 */ 3056 curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE | 3057 PCB_NPXUSERINITDONE); 3058 critical_exit(); 3059 } 3060 3061 int 3062 fill_dbregs(struct thread *td, struct dbreg *dbregs) 3063 { 3064 struct pcb *pcb; 3065 3066 if (td == NULL) { 3067 dbregs->dr[0] = rdr0(); 3068 dbregs->dr[1] = rdr1(); 3069 dbregs->dr[2] = rdr2(); 3070 dbregs->dr[3] = rdr3(); 3071 dbregs->dr[6] = rdr6(); 3072 dbregs->dr[7] = rdr7(); 3073 } else { 3074 pcb = td->td_pcb; 3075 dbregs->dr[0] = pcb->pcb_dr0; 3076 dbregs->dr[1] = pcb->pcb_dr1; 3077 dbregs->dr[2] = pcb->pcb_dr2; 3078 dbregs->dr[3] = pcb->pcb_dr3; 3079 dbregs->dr[6] = pcb->pcb_dr6; 3080 dbregs->dr[7] = pcb->pcb_dr7; 3081 } 3082 dbregs->dr[4] = 0; 3083 dbregs->dr[5] = 0; 3084 return (0); 3085 } 3086 3087 int 3088 set_dbregs(struct thread *td, struct dbreg *dbregs) 3089 { 3090 struct pcb *pcb; 3091 int i; 3092 3093 if (td == NULL) { 3094 load_dr0(dbregs->dr[0]); 3095 load_dr1(dbregs->dr[1]); 3096 load_dr2(dbregs->dr[2]); 3097 load_dr3(dbregs->dr[3]); 3098 load_dr6(dbregs->dr[6]); 3099 load_dr7(dbregs->dr[7]); 3100 } else { 3101 /* 3102 * Don't let an illegal value for dr7 get set. Specifically, 3103 * check for undefined settings. Setting these bit patterns 3104 * result in undefined behaviour and can lead to an unexpected 3105 * TRCTRAP. 3106 */ 3107 for (i = 0; i < 4; i++) { 3108 if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) 3109 return (EINVAL); 3110 if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02) 3111 return (EINVAL); 3112 } 3113 3114 pcb = td->td_pcb; 3115 3116 /* 3117 * Don't let a process set a breakpoint that is not within the 3118 * process's address space. If a process could do this, it 3119 * could halt the system by setting a breakpoint in the kernel 3120 * (if ddb was enabled). Thus, we need to check to make sure 3121 * that no breakpoints are being enabled for addresses outside 3122 * process's address space. 
3123 * 3124 * XXX - what about when the watched area of the user's 3125 * address space is written into from within the kernel 3126 * ... wouldn't that still cause a breakpoint to be generated 3127 * from within kernel mode? 3128 */ 3129 3130 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { 3131 /* dr0 is enabled */ 3132 if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) 3133 return (EINVAL); 3134 } 3135 3136 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { 3137 /* dr1 is enabled */ 3138 if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) 3139 return (EINVAL); 3140 } 3141 3142 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { 3143 /* dr2 is enabled */ 3144 if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) 3145 return (EINVAL); 3146 } 3147 3148 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { 3149 /* dr3 is enabled */ 3150 if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) 3151 return (EINVAL); 3152 } 3153 3154 pcb->pcb_dr0 = dbregs->dr[0]; 3155 pcb->pcb_dr1 = dbregs->dr[1]; 3156 pcb->pcb_dr2 = dbregs->dr[2]; 3157 pcb->pcb_dr3 = dbregs->dr[3]; 3158 pcb->pcb_dr6 = dbregs->dr[6]; 3159 pcb->pcb_dr7 = dbregs->dr[7]; 3160 3161 pcb->pcb_flags |= PCB_DBREGS; 3162 } 3163 3164 return (0); 3165 } 3166 3167 /* 3168 * Return > 0 if a hardware breakpoint has been hit, and the 3169 * breakpoint was in user space. Return 0, otherwise. 
3170 */ 3171 int 3172 user_dbreg_trap(register_t dr6) 3173 { 3174 u_int32_t dr7; 3175 u_int32_t bp; /* breakpoint bits extracted from dr6 */ 3176 int nbp; /* number of breakpoints that triggered */ 3177 caddr_t addr[4]; /* breakpoint addresses */ 3178 int i; 3179 3180 bp = dr6 & DBREG_DR6_BMASK; 3181 if (bp == 0) { 3182 /* 3183 * None of the breakpoint bits are set meaning this 3184 * trap was not caused by any of the debug registers 3185 */ 3186 return 0; 3187 } 3188 3189 dr7 = rdr7(); 3190 if ((dr7 & 0x000000ff) == 0) { 3191 /* 3192 * all GE and LE bits in the dr7 register are zero, 3193 * thus the trap couldn't have been caused by the 3194 * hardware debug registers 3195 */ 3196 return 0; 3197 } 3198 3199 nbp = 0; 3200 3201 /* 3202 * at least one of the breakpoints were hit, check to see 3203 * which ones and if any of them are user space addresses 3204 */ 3205 3206 if (bp & 0x01) { 3207 addr[nbp++] = (caddr_t)rdr0(); 3208 } 3209 if (bp & 0x02) { 3210 addr[nbp++] = (caddr_t)rdr1(); 3211 } 3212 if (bp & 0x04) { 3213 addr[nbp++] = (caddr_t)rdr2(); 3214 } 3215 if (bp & 0x08) { 3216 addr[nbp++] = (caddr_t)rdr3(); 3217 } 3218 3219 for (i = 0; i < nbp; i++) { 3220 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { 3221 /* 3222 * addr[i] is in user space 3223 */ 3224 return nbp; 3225 } 3226 } 3227 3228 /* 3229 * None of the breakpoints are in user space. 3230 */ 3231 return 0; 3232 } 3233 3234 #ifdef KDB 3235 3236 /* 3237 * Provide inb() and outb() as functions. They are normally only available as 3238 * inline functions, thus cannot be called from the debugger. 3239 */ 3240 3241 /* silence compiler warnings */ 3242 u_char inb_(u_short); 3243 void outb_(u_short, u_char); 3244 3245 u_char 3246 inb_(u_short port) 3247 { 3248 return inb(port); 3249 } 3250 3251 void 3252 outb_(u_short port, u_char data) 3253 { 3254 outb(port, data); 3255 } 3256 3257 #endif /* KDB */ 3258