1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 2018 The FreeBSD Foundation 5 * Copyright (c) 1992 Terrence R. Lambert. 6 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * William Jolitz. 11 * 12 * Portions of this software were developed by A. Joseph Koshy under 13 * sponsorship from the FreeBSD Foundation and Google, Inc. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 3. All advertising materials mentioning features or use of this software 24 * must display the following acknowledgement: 25 * This product includes software developed by the University of 26 * California, Berkeley and its contributors. 27 * 4. Neither the name of the University nor the names of its contributors 28 * may be used to endorse or promote products derived from this software 29 * without specific prior written permission. 30 * 31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 41 * SUCH DAMAGE. 42 * 43 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 44 */ 45 46 #include <sys/cdefs.h> 47 __FBSDID("$FreeBSD$"); 48 49 #include "opt_apic.h" 50 #include "opt_atpic.h" 51 #include "opt_cpu.h" 52 #include "opt_ddb.h" 53 #include "opt_inet.h" 54 #include "opt_isa.h" 55 #include "opt_kstack_pages.h" 56 #include "opt_maxmem.h" 57 #include "opt_mp_watchdog.h" 58 #include "opt_perfmon.h" 59 #include "opt_platform.h" 60 61 #include <sys/param.h> 62 #include <sys/proc.h> 63 #include <sys/systm.h> 64 #include <sys/bio.h> 65 #include <sys/buf.h> 66 #include <sys/bus.h> 67 #include <sys/callout.h> 68 #include <sys/cons.h> 69 #include <sys/cpu.h> 70 #include <sys/eventhandler.h> 71 #include <sys/exec.h> 72 #include <sys/imgact.h> 73 #include <sys/kdb.h> 74 #include <sys/kernel.h> 75 #include <sys/ktr.h> 76 #include <sys/linker.h> 77 #include <sys/lock.h> 78 #include <sys/malloc.h> 79 #include <sys/memrange.h> 80 #include <sys/msgbuf.h> 81 #include <sys/mutex.h> 82 #include <sys/pcpu.h> 83 #include <sys/ptrace.h> 84 #include <sys/reboot.h> 85 #include <sys/rwlock.h> 86 #include <sys/sched.h> 87 #include <sys/signalvar.h> 88 #include <sys/smp.h> 89 #include <sys/syscallsubr.h> 90 #include <sys/sysctl.h> 91 #include <sys/sysent.h> 92 #include <sys/sysproto.h> 93 #include <sys/ucontext.h> 94 #include <sys/vmmeter.h> 95 96 #include <vm/vm.h> 97 #include <vm/vm_extern.h> 98 #include <vm/vm_kern.h> 99 #include <vm/vm_page.h> 100 #include <vm/vm_map.h> 
101 #include <vm/vm_object.h> 102 #include <vm/vm_pager.h> 103 #include <vm/vm_param.h> 104 105 #ifdef DDB 106 #ifndef KDB 107 #error KDB must be enabled in order for DDB to work! 108 #endif 109 #include <ddb/ddb.h> 110 #include <ddb/db_sym.h> 111 #endif 112 113 #include <isa/rtc.h> 114 115 #include <net/netisr.h> 116 117 #include <machine/bootinfo.h> 118 #include <machine/clock.h> 119 #include <machine/cpu.h> 120 #include <machine/cputypes.h> 121 #include <machine/intr_machdep.h> 122 #include <x86/mca.h> 123 #include <machine/md_var.h> 124 #include <machine/metadata.h> 125 #include <machine/mp_watchdog.h> 126 #include <machine/pc/bios.h> 127 #include <machine/pcb.h> 128 #include <machine/pcb_ext.h> 129 #include <machine/proc.h> 130 #include <machine/reg.h> 131 #include <machine/sigframe.h> 132 #include <machine/specialreg.h> 133 #include <machine/sysarch.h> 134 #include <machine/trap.h> 135 #include <machine/vm86.h> 136 #include <x86/init.h> 137 #ifdef PERFMON 138 #include <machine/perfmon.h> 139 #endif 140 #ifdef SMP 141 #include <machine/smp.h> 142 #endif 143 #ifdef FDT 144 #include <x86/fdt.h> 145 #endif 146 147 #ifdef DEV_APIC 148 #include <x86/apicvar.h> 149 #endif 150 151 #ifdef DEV_ISA 152 #include <x86/isa/icu.h> 153 #endif 154 155 /* Sanity check for __curthread() */ 156 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); 157 158 register_t init386(int first); 159 void dblfault_handler(void); 160 161 static void cpu_startup(void *); 162 static void fpstate_drop(struct thread *td); 163 static void get_fpcontext(struct thread *td, mcontext_t *mcp, 164 char *xfpusave, size_t xfpusave_len); 165 static int set_fpcontext(struct thread *td, mcontext_t *mcp, 166 char *xfpustate, size_t xfpustate_len); 167 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); 168 169 /* Intel ICH registers */ 170 #define ICH_PMBASE 0x400 171 #define ICH_SMI_EN ICH_PMBASE + 0x30 172 173 int _udatasel, _ucodesel; 174 u_int basemem; 175 176 int cold = 1; 177 178 #ifdef 
COMPAT_43
static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
#endif
#ifdef COMPAT_FREEBSD4
static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
#endif

/* Physical memory totals, recorded during early boot. */
long Maxmem = 0;
long realmem = 0;

#ifdef PAE
FEATURE(pae, "Physical Address Extensions");
#endif

/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))

vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* must be 2 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
#define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)

struct kva_md_info kmi;

static struct trapframe proc0_tf;
struct pcpu __pcpu[MAXCPU];

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

extern char start_exceptions[], end_exceptions[];

extern struct sysentvec elf32_freebsd_sysvec;

/* Default init_ops implementation. */
struct init_ops init_ops = {
	.early_clock_source_init = i8254_init,
	.early_delay = i8254_delay,
#ifdef DEV_APIC
	.msi_init = msi_init,
#endif
};

/*
 * cpu_startup: late CPU/memory initialization, run via the SYSINIT at
 * SI_SUB_CPU above.  Applies an Apple legacy-USB SMI quirk, starts the
 * RTC, prints CPU/memory information, initializes the kernel submaps
 * and buffer cache, and finally sets the CPU control registers.
 */
static void
cpu_startup(dummy)
	void *dummy;
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to disallow the legacy USB circuit to
	 * generate an SMI# because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();
	panicifcpuunsupported();
#ifdef PERFMON
	perfmon_init();
#endif

	/*
	 * Display physical memory if SMBIOS reports reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		/* SMBIOS value is in KB; shift to bytes. */
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	/* Fall back to the kernel's own accounting if SMBIOS is low. */
	if (memsize < ptoa((uintmax_t)vm_free_count()))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_free_count()),
	    ptoa((uintmax_t)vm_free_count()) / 1048576);

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();
	cpu_setregs();
}

/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by call
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
#ifdef COMPAT_43
static void
osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct osigframe sf, *fp;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	struct trapframe *regs;
	int sig;
	int oonstack;

	td = curthread;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_esp);

	/* Allocate space for the signal handler context.
 */
	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		/* Place the frame at the top of the alternate stack. */
		fp = (struct osigframe *)((uintptr_t)td->td_sigstk.ss_sp +
		    td->td_sigstk.ss_size - sizeof(struct osigframe));
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		fp = (struct osigframe *)regs->tf_esp - 1;

	/* Build the argument list for the signal handler. */
	sf.sf_signum = sig;
	sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc;
	bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		sf.sf_arg2 = (register_t)&fp->sf_siginfo;
		sf.sf_siginfo.si_signo = sig;
		sf.sf_siginfo.si_code = ksi->ksi_code;
		sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher;
		sf.sf_addr = 0;
	} else {
		/* Old FreeBSD-style arguments. */
		sf.sf_arg2 = ksi->ksi_code;
		sf.sf_addr = (register_t)ksi->ksi_addr;
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/* Save most if not all of trap frame. */
	sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax;
	sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx;
	sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx;
	sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx;
	sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi;
	sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi;
	sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs;
	sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds;
	sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss;
	sf.sf_siginfo.si_sc.sc_es = regs->tf_es;
	sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs;
	/* %gs is not in the trapframe; read it directly. */
	sf.sf_siginfo.si_sc.sc_gs = rgs();
	sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp;

	/* Build the signal context to be used by osigreturn(). */
	sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0;
	SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask);
	sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp;
	sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp;
	sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip;
	sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags;
	sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno;
	sf.sf_siginfo.si_sc.sc_err = regs->tf_err;

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		/* XXX confusing names: `tf' isn't a trapframe; `regs' is. */
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;

		sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs;
		sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs;
		sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es;
		sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds;

		/* Without VME, VIF/VIP come from the software-emulated flags. */
		if (vm86->vm86_has_vme == 0)
			sf.sf_siginfo.si_sc.sc_ps =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/* See sendsig() for comments. */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, fp, sizeof(*fp)) != 0) {
		/* Unwritable stack: kill the process (does not return). */
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_esp = (int)fp;
	if (p->p_sysent->sv_sigcode_base != 0) {
		regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode -
		    szosigcode;
	} else {
		/* a.out sysentvec does not use shared page */
		regs->tf_eip = p->p_sysent->sv_psstrings - szosigcode;
	}
	regs->tf_eflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	load_gs(_udatasel);
	regs->tf_ss = _udatasel;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
#endif /* COMPAT_43 */

#ifdef COMPAT_FREEBSD4
/*
 * FreeBSD 4.x-compatible signal delivery: builds a struct sigframe4 on
 * the user stack and points %eip at the freebsd4 sigcode trampoline.
 */
static void
freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe4 sf, *sfp;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	struct trapframe *regs;
	int sig;
	int oonstack;

	td = curthread;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_esp);

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	sf.sf_uc.uc_mcontext.mc_gs = rgs();
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
	bzero(sf.sf_uc.uc_mcontext.mc_fpregs,
	    sizeof(sf.sf_uc.uc_mcontext.mc_fpregs));
	bzero(sf.sf_uc.uc_mcontext.__spare__,
	    sizeof(sf.sf_uc.uc_mcontext.__spare__));
	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));

	/* Allocate space for the signal handler context.
 */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		/* Place the frame at the top of the alternate stack. */
		sfp = (struct sigframe4 *)((uintptr_t)td->td_sigstk.ss_sp +
		    td->td_sigstk.ss_size - sizeof(struct sigframe4));
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		sfp = (struct sigframe4 *)regs->tf_esp - 1;

	/* Build the argument list for the signal handler. */
	sf.sf_signum = sig;
	sf.sf_ucontext = (register_t)&sfp->sf_uc;
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		sf.sf_siginfo = (register_t)&sfp->sf_si;
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si.si_signo = sig;
		sf.sf_si.si_code = ksi->ksi_code;
		sf.sf_si.si_addr = ksi->ksi_addr;
	} else {
		/* Old FreeBSD-style arguments. */
		sf.sf_siginfo = ksi->ksi_code;
		sf.sf_addr = (register_t)ksi->ksi_addr;
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		/* Without VME, VIF/VIP come from the software-emulated flags. */
		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
		/* Unwritable stack: kill the process (does not return). */
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_esp = (int)sfp;
	regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode -
	    szfreebsd4_sigcode;
	regs->tf_eflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_ss = _udatasel;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
#endif /* COMPAT_FREEBSD4 */

/*
 * Deliver a signal to the current process: build the native sigframe
 * (including extended FPU state when XSAVE is in use) on the user
 * stack and redirect the trapframe to the signal trampoline.
 * Dispatches to the osendsig()/freebsd4_sendsig() variants for
 * handlers installed with the corresponding compat signal calls.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	struct segment_descriptor *sdp;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
#ifdef COMPAT_FREEBSD4
	if (SIGISMEMBER(psp->ps_freebsd4, sig)) {
		freebsd4_sendsig(catcher, ksi, mask);
		return;
	}
#endif
#ifdef COMPAT_43
	if (SIGISMEMBER(psp->ps_osigset, sig)) {
		osendsig(catcher, ksi, mask);
		return;
	}
#endif
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_esp);

	/* Stage extended FPU state in a kernel stack buffer first. */
	if (cpu_max_ext_state_size > sizeof(union savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(union savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context.
 */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	sf.sf_uc.uc_mcontext.mc_gs = rgs();
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	/*
	 * Unconditionally fill the fsbase and gsbase into the mcontext.
	 */
	sdp = &td->td_pcb->pcb_fsd;
	sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 |
	    sdp->sd_lobase;
	sdp = &td->td_pcb->pcb_gsd;
	sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 |
	    sdp->sd_lobase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare2,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare2));
	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		/* NOTE(review): 128-byte offset below %esp — presumably a
		 * reserved scratch area; confirm against the sigcode ABI. */
		sp = (char *)regs->tf_esp - 128;
	if (xfpusave != NULL) {
		/* XSAVE area must be 64-byte aligned. */
		sp -= xfpusave_len;
		sp = (char *)((unsigned int)sp & ~0x3F);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);

	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned int)sp & ~0xF);

	/* Build the argument list for the signal handler. */
	sf.sf_signum = sig;
	sf.sf_ucontext = (register_t)&sfp->sf_uc;
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		sf.sf_siginfo = (register_t)&sfp->sf_si;
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
	} else {
		/* Old FreeBSD-style arguments. */
		sf.sf_siginfo = ksi->ksi_code;
		sf.sf_addr = (register_t)ksi->ksi_addr;
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		/* Without VME, VIF/VIP come from the software-emulated flags. */
		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
		/* Unwritable stack: kill the process (does not return). */
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_esp = (int)sfp;
	regs->tf_eip = p->p_sysent->sv_sigcode_base;
	if (regs->tf_eip == 0)
		regs->tf_eip = p->p_sysent->sv_psstrings - szsigcode;
	regs->tf_eflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_ss = _udatasel;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
#ifdef COMPAT_43
int
osigreturn(td, uap)
	struct thread *td;
	struct osigreturn_args /* {
		struct osigcontext *sigcntxp;
	} */ *uap;
{
	struct osigcontext sc;
	struct trapframe *regs;
	struct osigcontext *scp;
	int eflags, error;
	ksiginfo_t ksi;

	regs = td->td_frame;
	error = copyin(uap->sigcntxp, &sc, sizeof(sc));
	if (error != 0)
		return (error);
	scp = &sc;
	eflags = scp->sc_ps;
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (td->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* Go back to user mode if both flags are set.
 */
	if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_addr = (void *)regs->tf_eip;
		trapsignal(td, &ksi);
	}

	if (vm86->vm86_has_vme) {
		/* VME hardware: only VME_USERCHANGE bits may be altered. */
		eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
		    (eflags & VME_USERCHANGE) | PSL_VM;
	} else {
		vm86->vm86_eflags = eflags;	/* save VIF, VIP */
		eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
		    (eflags & VM_USERCHANGE) | PSL_VM;
	}
	tf->tf_vm86_ds = scp->sc_ds;
	tf->tf_vm86_es = scp->sc_es;
	tf->tf_vm86_fs = scp->sc_fs;
	tf->tf_vm86_gs = scp->sc_gs;
	tf->tf_ds = _udatasel;
	tf->tf_es = _udatasel;
	tf->tf_fs = _udatasel;
	} else {
		/*
		 * Don't allow users to change privileged or reserved flags.
		 */
		if (!EFL_SECURE(eflags, regs->tf_eflags)) {
			return (EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		if (!CS_SECURE(scp->sc_cs)) {
			ksiginfo_init_trap(&ksi);
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_OBJERR;
			ksi.ksi_trapno = T_PROTFLT;
			ksi.ksi_addr = (void *)regs->tf_eip;
			trapsignal(td, &ksi);
			return (EINVAL);
		}
		regs->tf_ds = scp->sc_ds;
		regs->tf_es = scp->sc_es;
		regs->tf_fs = scp->sc_fs;
	}

	/* Restore remaining registers. */
	regs->tf_eax = scp->sc_eax;
	regs->tf_ebx = scp->sc_ebx;
	regs->tf_ecx = scp->sc_ecx;
	regs->tf_edx = scp->sc_edx;
	regs->tf_esi = scp->sc_esi;
	regs->tf_edi = scp->sc_edi;
	regs->tf_cs = scp->sc_cs;
	regs->tf_ss = scp->sc_ss;
	regs->tf_isp = scp->sc_isp;
	regs->tf_ebp = scp->sc_fp;
	regs->tf_esp = scp->sc_sp;
	regs->tf_eip = scp->sc_pc;
	regs->tf_eflags = eflags;

#if defined(COMPAT_43)
	if (scp->sc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
	kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL,
	    SIGPROCMASK_OLD);
	return (EJUSTRETURN);
}
#endif /* COMPAT_43 */

#ifdef COMPAT_FREEBSD4
/*
 * FreeBSD 4.x-compatible sigreturn(2): restore register state from a
 * struct ucontext4, validating eflags and %cs so the caller cannot
 * gain privilege.
 *
 * MPSAFE
 */
int
freebsd4_sigreturn(td, uap)
	struct thread *td;
	struct freebsd4_sigreturn_args /* {
		const ucontext4 *sigcntxp;
	} */ *uap;
{
	struct ucontext4 uc;
	struct trapframe *regs;
	struct ucontext4 *ucp;
	int cs, eflags, error;
	ksiginfo_t ksi;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0)
		return (error);
	ucp = &uc;
	regs = td->td_frame;
	eflags = ucp->uc_mcontext.mc_eflags;
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (td->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* Go back to user mode if both flags are set.
 */
	if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_addr = (void *)regs->tf_eip;
		trapsignal(td, &ksi);
	}
	if (vm86->vm86_has_vme) {
		/* VME hardware: only VME_USERCHANGE bits may be altered. */
		eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
		    (eflags & VME_USERCHANGE) | PSL_VM;
	} else {
		vm86->vm86_eflags = eflags;	/* save VIF, VIP */
		eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
		    (eflags & VM_USERCHANGE) | PSL_VM;
	}
	bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
	tf->tf_eflags = eflags;
	tf->tf_vm86_ds = tf->tf_ds;
	tf->tf_vm86_es = tf->tf_es;
	tf->tf_vm86_fs = tf->tf_fs;
	tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
	tf->tf_ds = _udatasel;
	tf->tf_es = _udatasel;
	tf->tf_fs = _udatasel;
	} else {
		/*
		 * Don't allow users to change privileged or reserved flags.
		 */
		if (!EFL_SECURE(eflags, regs->tf_eflags)) {
			uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n",
			    td->td_proc->p_pid, td->td_name, eflags);
			return (EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		cs = ucp->uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n",
			    td->td_proc->p_pid, td->td_name, cs);
			ksiginfo_init_trap(&ksi);
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_OBJERR;
			ksi.ksi_trapno = T_PROTFLT;
			ksi.ksi_addr = (void *)regs->tf_eip;
			trapsignal(td, &ksi);
			return (EINVAL);
		}

		bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
	}

#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
	return (EJUSTRETURN);
}
#endif /* COMPAT_FREEBSD4 */

/*
 * Native sigreturn(2): restore register and FPU state from a ucontext
 * written by sendsig(), validating mc_flags, eflags, %cs and the
 * extended-FPU-state length so the caller cannot gain privilege or
 * overrun kernel buffers.
 *
 * MPSAFE
 */
int
sys_sigreturn(td, uap)
	struct thread *td;
	struct sigreturn_args /* {
		const struct __ucontext *sigcntxp;
	} */ *uap;
{
	ucontext_t uc;
	struct proc *p;
	struct trapframe *regs;
	ucontext_t *ucp;
	char *xfpustate;
	size_t xfpustate_len;
	int cs, eflags, error, ret;
	ksiginfo_t ksi;

	p = td->td_proc;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0)
		return (error);
	ucp = &uc;
	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
		    td->td_name, ucp->uc_mcontext.mc_flags);
		return (EINVAL);
	}
	regs = td->td_frame;
	eflags = ucp->uc_mcontext.mc_eflags;
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (td->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* Go back to user mode if both flags are set.
 */
	if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_addr = (void *)regs->tf_eip;
		trapsignal(td, &ksi);
	}

	if (vm86->vm86_has_vme) {
		/* VME hardware: only VME_USERCHANGE bits may be altered. */
		eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
		    (eflags & VME_USERCHANGE) | PSL_VM;
	} else {
		vm86->vm86_eflags = eflags;	/* save VIF, VIP */
		eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
		    (eflags & VM_USERCHANGE) | PSL_VM;
	}
	bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
	tf->tf_eflags = eflags;
	tf->tf_vm86_ds = tf->tf_ds;
	tf->tf_vm86_es = tf->tf_es;
	tf->tf_vm86_fs = tf->tf_fs;
	tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
	tf->tf_ds = _udatasel;
	tf->tf_es = _udatasel;
	tf->tf_fs = _udatasel;
	} else {
		/*
		 * Don't allow users to change privileged or reserved flags.
		 */
		if (!EFL_SECURE(eflags, regs->tf_eflags)) {
			uprintf("pid %d (%s): sigreturn eflags = 0x%x\n",
			    td->td_proc->p_pid, td->td_name, eflags);
			return (EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		cs = ucp->uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			uprintf("pid %d (%s): sigreturn cs = 0x%x\n",
			    td->td_proc->p_pid, td->td_name, cs);
			ksiginfo_init_trap(&ksi);
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_OBJERR;
			ksi.ksi_trapno = T_PROTFLT;
			ksi.ksi_addr = (void *)regs->tf_eip;
			trapsignal(td, &ksi);
			return (EINVAL);
		}

		if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
			/* Bound the user-supplied length before alloca(). */
			xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
			if (xfpustate_len > cpu_max_ext_state_size -
			    sizeof(union savefpu)) {
				uprintf(
			    "pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
				    p->p_pid, td->td_name, xfpustate_len);
				return (EINVAL);
			}
			xfpustate = __builtin_alloca(xfpustate_len);
			error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
			    xfpustate, xfpustate_len);
			if (error != 0) {
				uprintf(
			"pid %d (%s): sigreturn copying xfpustate failed\n",
				    p->p_pid, td->td_name);
				return (error);
			}
		} else {
			xfpustate = NULL;
			xfpustate_len = 0;
		}
		ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate,
		    xfpustate_len);
		if (ret != 0)
			return (ret);
		bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
	}

#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
	return (EJUSTRETURN);
}

#ifdef COMPAT_43
/*
 * Install the privileged lcall gate used by old a.out syscall
 * trampolines.  NOTE(review): definition continues beyond the visible
 * portion of this file; fragment reproduced as-is.
 */
static void
setup_priv_lcall_gate(struct proc *p)
{
	struct i386_ldt_args uap;
	union descriptor desc;
	u_int lcall_addr;

	bzero(&uap, sizeof(uap));
	uap.start = 0;
	uap.num = 1;
	lcall_addr = p->p_sysent->sv_psstrings - sz_lcall_tramp;
	bzero(&desc, sizeof(desc));
	desc.sd.sd_type = SDT_MEMERA;
	desc.sd.sd_dpl = SEL_UPL;
	desc.sd.sd_p = 1;
	desc.sd.sd_def32 = 1;
	desc.sd.sd_gran = 1;
	desc.sd.sd_lolimit = 0xffff;
	desc.sd.sd_hilimit = 0xf;
	/* Split the 32-bit base across the descriptor's lo/hi base fields. */
	desc.sd.sd_lobase = lcall_addr;
	desc.sd.sd_hibase = lcall_addr >> 24;
	i386_set_ldt(curthread, &uap, &desc);
}
#endif

/*
 * Reset registers to default values on exec.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
	struct trapframe *regs;
	struct pcb *pcb;
	register_t saved_eflags;

	regs = td->td_frame;
	pcb = td->td_pcb;

	/* Reset pc->pcb_gs and %gs before possibly invalidating it. */
	pcb->pcb_gs = _udatasel;
	load_gs(_udatasel);

	/* user_ldt_free() drops dt_lock itself when it runs. */
	mtx_lock_spin(&dt_lock);
	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);
	else
		mtx_unlock_spin(&dt_lock);

#ifdef COMPAT_43
	if (td->td_proc->p_sysent->sv_psstrings !=
	    elf32_freebsd_sysvec.sv_psstrings)
		setup_priv_lcall_gate(td->td_proc);
#endif

	/*
	 * Reset the fs and gs bases.  The values from the old address
	 * space do not make sense for the new program.  In particular,
	 * gsbase might be the TLS base for the old program but the new
	 * program has no TLS now.
	 */
	set_fsbase(td, 0);
	set_gsbase(td, 0);

	/* Make sure edx is 0x0 on entry. Linux binaries depend on it. */
	saved_eflags = regs->tf_eflags & PSL_T;
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_eip = imgp->entry_addr;
	regs->tf_esp = stack;
	/* Preserve only the trace flag across exec. */
	regs->tf_eflags = PSL_USER | saved_eflags;
	regs->tf_ss = _udatasel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_cs = _ucodesel;

	/* PS_STRINGS value for BSD/OS binaries.  It is 0 for non-BSD/OS. */
	regs->tf_ebx = imgp->ps_strings;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == curpcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		pcb->pcb_flags &= ~PCB_DBREGS;
	}

	pcb->pcb_initial_npxcw = __INITIAL_NPXCW__;

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}

/*
 * Set the control-register bits this kernel requires on the boot CPU
 * and load a sane %gs.
 */
void
cpu_setregs(void)
{
	unsigned int cr0;

	cr0 = rcr0();

	/*
	 * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support:
	 *
	 * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT
	 * instructions.  We must set the CR0_MP bit and use the CR0_TS
	 * bit to control the trap, because setting the CR0_EM bit does
	 * not cause WAIT instructions to trap.  It's important to trap
	 * WAIT instructions - otherwise the "wait" variants of no-wait
	 * control instructions would degenerate to the "no-wait" variants
	 * after FP context switches but work correctly otherwise.  It's
	 * particularly important to trap WAITs when there is no NPX -
	 * otherwise the "wait" variants would always degenerate.
	 *
	 * Try setting CR0_NE to get correct error reporting on 486DX's.
	 * Setting it should fail or do nothing on lesser processors.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
	load_gs(_udatasel);
}

u_long bootdev;		/* not a struct cdev *- encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
    CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)");

static char bootmethod[16] = "BIOS";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");

/*
 * Initialize 386 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

int _default_ldt;

struct mtx dt_lock;			/* lock for GDT and LDT */

union descriptor gdt0[NGDT];	/* initial global descriptor table */
union descriptor *gdt = gdt0;	/* global descriptor table */

union descriptor *ldt;		/* local descriptor table */

static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

/* TSS and stack used to field a double fault on a dedicated task. */
static struct i386tss *dblfault_tss;
static char *dblfault_stack;

static struct i386tss common_tss0;

/* Kernel stack for proc0 (thread0). */
vm_offset_t proc0kstack;

/*
 * software prototypes -- in more palatable form.
 *
 * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret
 * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it)
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 0,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GPRIV_SEL	1 SMP Per-Processor Private Data Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUFS_SEL	2 %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS_SEL	3 %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	6 Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
{	.ssd_base = 0x400,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{
	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct i386tss)-1,
	.ssd_type = SDT_SYS386TSS,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GLDT_SEL	10 LDT Descriptor */
{	.ssd_base = 0,
	.ssd_limit = sizeof(union descriptor) * NLDT - 1,
	.ssd_type = SDT_SYSLDT,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 User LDT Descriptor per process */
{	.ssd_base = 0,
	.ssd_limit = (512 * sizeof(union descriptor)-1),
	.ssd_type = SDT_SYSLDT,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GPANIC_SEL	12 Panic Tss Descriptor */
{	.ssd_base = 0,
	.ssd_limit = sizeof(struct i386tss)-1,
	.ssd_type = SDT_SYS386TSS,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */
{	.ssd_base = 0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */
{	.ssd_base = 0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */
{	.ssd_base = 0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */
{	.ssd_base = 0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */
{	.ssd_base = 0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = 0,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GNDIS_SEL	18 NDIS Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};

static struct soft_segment_descriptor ldt_segs[] = {
	/* Null Descriptor - overwritten by call gate */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
	/* Null Descriptor - overwritten by call gate */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
	/* Null Descriptor - overwritten by call gate */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
	/* Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
	/* Null
 Descriptor - overwritten by call gate */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
	/* Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_xx = 0, .ssd_xx1 = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
};

/* Displacement added to exception-handler addresses (trampoline offset). */
uintptr_t setidt_disp;

/*
 * Install IDT entry 'idx' pointing at handler 'func', applying the
 * trampoline displacement setidt_disp.  A NULL func installs offset 0
 * (used for task gates, which ignore the offset).
 */
void
setidt(int idx, inthand_t *func, int typ, int dpl, int selec)
{
	uintptr_t off;

	off = func != NULL ? (uintptr_t)func + setidt_disp : 0;
	setidt_nodisp(idx, off, typ, dpl, selec);
}

/*
 * Install IDT entry 'idx' with a raw offset, no displacement applied.
 * Splits the 32-bit offset across the gate's lo/hi offset fields.
 */
void
setidt_nodisp(int idx, uintptr_t off, int typ, int dpl, int selec)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = off;
	ip->gd_selector = selec;
	ip->gd_stkcpy = 0;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((u_int)off) >> 16 ;
}

/* Exception and syscall entry points, defined in exception.s. */
extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall),
#endif
	IDTVEC(int0x80_syscall);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func, func_trm;
	bool trm;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		if (ip->gd_type == SDT_SYSTASKGT) {
			/* Task gates carry no meaningful offset. */
			db_printf("%3d\t<TASK>\n", idx);
		} else {
			func = (ip->gd_hioffset << 16 | ip->gd_looffset);
			if (func >= PMAP_TRM_MIN_ADDRESS) {
				/*
				 * Handler lives in the trampoline region;
				 * undo setidt_disp to get the symbol address.
				 */
				func_trm = func;
				func -= setidt_disp;
				trm = true;
			} else
				trm = false;
			if (func != (uintptr_t)&IDTVEC(rsvd)) {
				db_printf("%3d\t", idx);
				db_printsym(func, DB_STGY_PROC);
				if (trm)
					db_printf(" (trampoline %#x)",
					    func_trm);
				db_printf("\n");
			}
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	uint64_t idtr, gdtr;

	idtr = ridt();
	db_printf("idtr\t0x%08x/%04x\n",
	    (u_int)(idtr >> 16), (u_int)idtr & 0xffff);
	gdtr = rgdt();
	db_printf("gdtr\t0x%08x/%04x\n",
	    (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff);
	db_printf("ldtr\t0x%04x\n", rldt());
	db_printf("tr\t0x%04x\n", rtr());
	db_printf("cr0\t0x%08x\n", rcr0());
	db_printf("cr2\t0x%08x\n", rcr2());
	db_printf("cr3\t0x%08x\n", rcr3());
	db_printf("cr4\t0x%08x\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016llx\n", rxcr(0));
	if (amd_feature & (AMDID_NX | AMDID_LM))
		db_printf("EFER\t0x%016llx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t0x%016llx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	if ((cpu_vendor_id == CPU_VENDOR_INTEL ||
	    cpu_vendor_id == CPU_VENDOR_AMD) && CPUID_TO_FAMILY(cpu_id) >= 6)
		db_printf("DEBUG_CTL\t0x%016llx\n", rdmsr(MSR_DEBUGCTLMSR));
	if (cpu_feature & CPUID_PAT)
		db_printf("PAT\t0x%016llx\n", rdmsr(MSR_PAT));
}

/* Show the hardware debug registers. */
DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%08x\n", rdr0());
	db_printf("dr1\t0x%08x\n", rdr1());
	db_printf("dr2\t0x%08x\n", rdr2());
	db_printf("dr3\t0x%08x\n", rdr3());
	db_printf("dr6\t0x%08x\n", rdr6());
	db_printf("dr7\t0x%08x\n", rdr7());
}

/* Show the trap frame at 'addr', or the current thread's frame. */
DB_SHOW_COMMAND(frame, db_show_frame)
{
	struct trapframe *frame;

	frame = have_addr ? (struct trapframe *)addr : curthread->td_frame;
	printf("ss %#x esp %#x efl %#x cs %#x eip %#x\n",
	    frame->tf_ss, frame->tf_esp, frame->tf_eflags, frame->tf_cs,
	    frame->tf_eip);
	printf("err %#x trapno %d\n", frame->tf_err, frame->tf_trapno);
	printf("ds %#x es %#x fs %#x\n",
	    frame->tf_ds, frame->tf_es, frame->tf_fs);
	printf("eax %#x ecx %#x edx %#x ebx %#x\n",
	    frame->tf_eax, frame->tf_ecx, frame->tf_edx, frame->tf_ebx);
	printf("ebp %#x esi %#x edi %#x\n",
	    frame->tf_ebp, frame->tf_esi, frame->tf_edi);

}
#endif

/*
 * Convert a hardware segment descriptor into its "soft" (unpacked)
 * representation, reassembling the split base and limit fields.
 */
void
sdtossd(sd, ssd)
	struct segment_descriptor *sd;
	struct soft_segment_descriptor *ssd;
{

	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

/*
 * Insert the region [base, base + length) into the physmap array of
 * base/bound pairs, merging with adjacent entries where possible.
 * Returns 1 when the entry was added, merged, or deliberately ignored
 * (zero length, above 4GB without PAE, or overlapping an existing
 * region); returns 0 only when the physmap table is full.
 */
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

#ifndef PAE
	/* Without PAE, physical addresses above 4GB are unreachable. */
	if (base > 0xffffffff) {
		printf("%uK of memory above 4GB ignored\n",
		    (u_int)(length / 1024));
		return (1);
	}
#endif

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 */
	insert_idx = physmap_idx + 2;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYSMAP_SIZE) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = physmap_idx; i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

/*
 * Add one BIOS SMAP (INT 15h E820) record to physmap.  Records that are
 * not plain memory are skipped.  Returns the add_physmap_entry() result
 * (0 terminates the caller's scan).
 */
static int
add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp)
{

	if (boothowto & RB_VERBOSE)
		printf("SMAP type=%02x base=%016llx len=%016llx\n",
		    smap->type, smap->base, smap->length);

	if (smap->type != SMAP_TYPE_MEMORY)
		return (1);

	return (add_physmap_entry(smap->base, smap->length, physmap,
	    physmap_idxp));
}

/*
 * Walk a loader-supplied array of SMAP records and add each one to
 * physmap, stopping early if the physmap table fills up.
 */
static void
add_smap_entries(struct bios_smap *smapbase, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	struct bios_smap *smap, *smapend;
	u_int32_t smapsize;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes SMAP.
	 */
	smapsize = *((u_int32_t *)smapbase - 1);
	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++)
		if (!add_smap_entry(smap, physmap, physmap_idxp))
			break;
}

/*
 * Sanity-check the BIOS-reported base memory size (clamped to 640K) and
 * map the base-memory pages into the vm86 page table.
 */
static void
basemem_setup(void)
{
	pt_entry_t *pte;
	int i;

	if (basemem > 640) {
		printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
			basemem);
		basemem = 640;
	}

	/*
	 * Map pages between basemem and ISA_HOLE_START, if any, r/w into
	 * the vm86 page table so that vm86 can scribble on them using
	 * the vm86 map too.  XXX: why 2 ways for this and only 1 way for
	 * page 0, at least as initialized here?
	 */
	pte = (pt_entry_t *)vm86paddr;
	for (i = basemem / 4; i < 160; i++)
		pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
}

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * If we cannot accurately determine the physical memory map, then use
 * value from the 0xE801 call, and failing that, the RTC.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
1814 */ 1815 static void 1816 getmemsize(int first) 1817 { 1818 int has_smap, off, physmap_idx, pa_indx, da_indx; 1819 u_long memtest; 1820 vm_paddr_t physmap[PHYSMAP_SIZE]; 1821 pt_entry_t *pte; 1822 quad_t dcons_addr, dcons_size, physmem_tunable; 1823 int hasbrokenint12, i, res; 1824 u_int extmem; 1825 struct vm86frame vmf; 1826 struct vm86context vmc; 1827 vm_paddr_t pa; 1828 struct bios_smap *smap, *smapbase; 1829 caddr_t kmdp; 1830 1831 has_smap = 0; 1832 bzero(&vmf, sizeof(vmf)); 1833 bzero(physmap, sizeof(physmap)); 1834 basemem = 0; 1835 1836 /* 1837 * Check if the loader supplied an SMAP memory map. If so, 1838 * use that and do not make any VM86 calls. 1839 */ 1840 physmap_idx = 0; 1841 kmdp = preload_search_by_type("elf kernel"); 1842 if (kmdp == NULL) 1843 kmdp = preload_search_by_type("elf32 kernel"); 1844 smapbase = (struct bios_smap *)preload_search_info(kmdp, 1845 MODINFO_METADATA | MODINFOMD_SMAP); 1846 if (smapbase != NULL) { 1847 add_smap_entries(smapbase, physmap, &physmap_idx); 1848 has_smap = 1; 1849 goto have_smap; 1850 } 1851 1852 /* 1853 * Some newer BIOSes have a broken INT 12H implementation 1854 * which causes a kernel panic immediately. In this case, we 1855 * need use the SMAP to determine the base memory size. 1856 */ 1857 hasbrokenint12 = 0; 1858 TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12); 1859 if (hasbrokenint12 == 0) { 1860 /* Use INT12 to determine base memory size. */ 1861 vm86_intcall(0x12, &vmf); 1862 basemem = vmf.vmf_ax; 1863 basemem_setup(); 1864 } 1865 1866 /* 1867 * Fetch the memory map with INT 15:E820. Map page 1 R/W into 1868 * the kernel page table so we can use it as a buffer. The 1869 * kernel will unmap this page later. 
1870 */ 1871 vmc.npages = 0; 1872 smap = (void *)vm86_addpage(&vmc, 1, PMAP_MAP_LOW + ptoa(1)); 1873 res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); 1874 KASSERT(res != 0, ("vm86_getptr() failed: address not found")); 1875 1876 vmf.vmf_ebx = 0; 1877 do { 1878 vmf.vmf_eax = 0xE820; 1879 vmf.vmf_edx = SMAP_SIG; 1880 vmf.vmf_ecx = sizeof(struct bios_smap); 1881 i = vm86_datacall(0x15, &vmf, &vmc); 1882 if (i || vmf.vmf_eax != SMAP_SIG) 1883 break; 1884 has_smap = 1; 1885 if (!add_smap_entry(smap, physmap, &physmap_idx)) 1886 break; 1887 } while (vmf.vmf_ebx != 0); 1888 1889 have_smap: 1890 /* 1891 * If we didn't fetch the "base memory" size from INT12, 1892 * figure it out from the SMAP (or just guess). 1893 */ 1894 if (basemem == 0) { 1895 for (i = 0; i <= physmap_idx; i += 2) { 1896 if (physmap[i] == 0x00000000) { 1897 basemem = physmap[i + 1] / 1024; 1898 break; 1899 } 1900 } 1901 1902 /* XXX: If we couldn't find basemem from SMAP, just guess. */ 1903 if (basemem == 0) 1904 basemem = 640; 1905 basemem_setup(); 1906 } 1907 1908 if (physmap[1] != 0) 1909 goto physmap_done; 1910 1911 /* 1912 * If we failed to find an SMAP, figure out the extended 1913 * memory size. We will then build a simple memory map with 1914 * two segments, one for "base memory" and the second for 1915 * "extended memory". Note that "extended memory" starts at a 1916 * physical address of 1MB and that both basemem and extmem 1917 * are in units of 1KB. 1918 * 1919 * First, try to fetch the extended memory size via INT 15:E801. 1920 */ 1921 vmf.vmf_ax = 0xE801; 1922 if (vm86_intcall(0x15, &vmf) == 0) { 1923 extmem = vmf.vmf_cx + vmf.vmf_dx * 64; 1924 } else { 1925 /* 1926 * If INT15:E801 fails, this is our last ditch effort 1927 * to determine the extended memory size. Currently 1928 * we prefer the RTC value over INT15:88. 
1929 */ 1930 #if 0 1931 vmf.vmf_ah = 0x88; 1932 vm86_intcall(0x15, &vmf); 1933 extmem = vmf.vmf_ax; 1934 #else 1935 extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); 1936 #endif 1937 } 1938 1939 /* 1940 * Special hack for chipsets that still remap the 384k hole when 1941 * there's 16MB of memory - this really confuses people that 1942 * are trying to use bus mastering ISA controllers with the 1943 * "16MB limit"; they only have 16MB, but the remapping puts 1944 * them beyond the limit. 1945 * 1946 * If extended memory is between 15-16MB (16-17MB phys address range), 1947 * chop it to 15MB. 1948 */ 1949 if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) 1950 extmem = 15 * 1024; 1951 1952 physmap[0] = 0; 1953 physmap[1] = basemem * 1024; 1954 physmap_idx = 2; 1955 physmap[physmap_idx] = 0x100000; 1956 physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; 1957 1958 physmap_done: 1959 /* 1960 * Now, physmap contains a map of physical memory. 1961 */ 1962 1963 #ifdef SMP 1964 /* make hole for AP bootstrap code */ 1965 alloc_ap_trampoline(physmap, &physmap_idx); 1966 #endif 1967 1968 /* 1969 * Maxmem isn't the "maximum memory", it's one larger than the 1970 * highest page of the physical address space. It should be 1971 * called something like "Maxphyspage". We may adjust this 1972 * based on ``hw.physmem'' and the results of the memory test. 1973 * 1974 * This is especially confusing when it is much larger than the 1975 * memory size and is displayed as "realmem". 1976 */ 1977 Maxmem = atop(physmap[physmap_idx + 1]); 1978 1979 #ifdef MAXMEM 1980 Maxmem = MAXMEM / 4; 1981 #endif 1982 1983 if (TUNABLE_QUAD_FETCH("hw.physmem", &physmem_tunable)) 1984 Maxmem = atop(physmem_tunable); 1985 1986 /* 1987 * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend 1988 * the amount of memory in the system. 
1989 */ 1990 if (has_smap && Maxmem > atop(physmap[physmap_idx + 1])) 1991 Maxmem = atop(physmap[physmap_idx + 1]); 1992 1993 /* 1994 * By default enable the memory test on real hardware, and disable 1995 * it if we appear to be running in a VM. This avoids touching all 1996 * pages unnecessarily, which doesn't matter on real hardware but is 1997 * bad for shared VM hosts. Use a general name so that 1998 * one could eventually do more with the code than just disable it. 1999 */ 2000 memtest = (vm_guest > VM_GUEST_NO) ? 0 : 1; 2001 TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); 2002 2003 if (atop(physmap[physmap_idx + 1]) != Maxmem && 2004 (boothowto & RB_VERBOSE)) 2005 printf("Physical memory use set to %ldK\n", Maxmem * 4); 2006 2007 /* 2008 * If Maxmem has been increased beyond what the system has detected, 2009 * extend the last memory segment to the new limit. 2010 */ 2011 if (atop(physmap[physmap_idx + 1]) < Maxmem) 2012 physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem); 2013 2014 /* call pmap initialization to make new kernel address space */ 2015 pmap_bootstrap(first); 2016 2017 /* 2018 * Size up each available chunk of physical memory. 2019 */ 2020 physmap[0] = PAGE_SIZE; /* mask off page 0 */ 2021 pa_indx = 0; 2022 da_indx = 1; 2023 phys_avail[pa_indx++] = physmap[0]; 2024 phys_avail[pa_indx] = physmap[0]; 2025 dump_avail[da_indx] = physmap[0]; 2026 pte = CMAP3; 2027 2028 /* 2029 * Get dcons buffer address 2030 */ 2031 if (getenv_quad("dcons.addr", &dcons_addr) == 0 || 2032 getenv_quad("dcons.size", &dcons_size) == 0) 2033 dcons_addr = 0; 2034 2035 /* 2036 * physmap is in bytes, so when converting to page boundaries, 2037 * round up the start address and round down the end address. 
2038 */ 2039 for (i = 0; i <= physmap_idx; i += 2) { 2040 vm_paddr_t end; 2041 2042 end = ptoa((vm_paddr_t)Maxmem); 2043 if (physmap[i + 1] < end) 2044 end = trunc_page(physmap[i + 1]); 2045 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { 2046 int tmp, page_bad, full; 2047 int *ptr = (int *)CADDR3; 2048 2049 full = FALSE; 2050 /* 2051 * block out kernel memory as not available. 2052 */ 2053 if (pa >= KERNLOAD && pa < first) 2054 goto do_dump_avail; 2055 2056 /* 2057 * block out dcons buffer 2058 */ 2059 if (dcons_addr > 0 2060 && pa >= trunc_page(dcons_addr) 2061 && pa < dcons_addr + dcons_size) 2062 goto do_dump_avail; 2063 2064 page_bad = FALSE; 2065 if (memtest == 0) 2066 goto skip_memtest; 2067 2068 /* 2069 * map page into kernel: valid, read/write,non-cacheable 2070 */ 2071 *pte = pa | PG_V | PG_RW | PG_N; 2072 invltlb(); 2073 2074 tmp = *(int *)ptr; 2075 /* 2076 * Test for alternating 1's and 0's 2077 */ 2078 *(volatile int *)ptr = 0xaaaaaaaa; 2079 if (*(volatile int *)ptr != 0xaaaaaaaa) 2080 page_bad = TRUE; 2081 /* 2082 * Test for alternating 0's and 1's 2083 */ 2084 *(volatile int *)ptr = 0x55555555; 2085 if (*(volatile int *)ptr != 0x55555555) 2086 page_bad = TRUE; 2087 /* 2088 * Test for all 1's 2089 */ 2090 *(volatile int *)ptr = 0xffffffff; 2091 if (*(volatile int *)ptr != 0xffffffff) 2092 page_bad = TRUE; 2093 /* 2094 * Test for all 0's 2095 */ 2096 *(volatile int *)ptr = 0x0; 2097 if (*(volatile int *)ptr != 0x0) 2098 page_bad = TRUE; 2099 /* 2100 * Restore original value. 2101 */ 2102 *(int *)ptr = tmp; 2103 2104 skip_memtest: 2105 /* 2106 * Adjust array of valid/good pages. 2107 */ 2108 if (page_bad == TRUE) 2109 continue; 2110 /* 2111 * If this good page is a continuation of the 2112 * previous set of good pages, then just increase 2113 * the end pointer. Otherwise start a new chunk. 2114 * Note that "end" points one higher than end, 2115 * making the range >= start and < end. 
2116 * If we're also doing a speculative memory 2117 * test and we at or past the end, bump up Maxmem 2118 * so that we keep going. The first bad page 2119 * will terminate the loop. 2120 */ 2121 if (phys_avail[pa_indx] == pa) { 2122 phys_avail[pa_indx] += PAGE_SIZE; 2123 } else { 2124 pa_indx++; 2125 if (pa_indx == PHYS_AVAIL_ARRAY_END) { 2126 printf( 2127 "Too many holes in the physical address space, giving up\n"); 2128 pa_indx--; 2129 full = TRUE; 2130 goto do_dump_avail; 2131 } 2132 phys_avail[pa_indx++] = pa; /* start */ 2133 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ 2134 } 2135 physmem++; 2136 do_dump_avail: 2137 if (dump_avail[da_indx] == pa) { 2138 dump_avail[da_indx] += PAGE_SIZE; 2139 } else { 2140 da_indx++; 2141 if (da_indx == DUMP_AVAIL_ARRAY_END) { 2142 da_indx--; 2143 goto do_next; 2144 } 2145 dump_avail[da_indx++] = pa; /* start */ 2146 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ 2147 } 2148 do_next: 2149 if (full) 2150 break; 2151 } 2152 } 2153 *pte = 0; 2154 invltlb(); 2155 2156 /* 2157 * XXX 2158 * The last chunk must contain at least one page plus the message 2159 * buffer to avoid complicating other code (message buffer address 2160 * calculation, etc.). 2161 */ 2162 while (phys_avail[pa_indx - 1] + PAGE_SIZE + 2163 round_page(msgbufsize) >= phys_avail[pa_indx]) { 2164 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); 2165 phys_avail[pa_indx--] = 0; 2166 phys_avail[pa_indx--] = 0; 2167 } 2168 2169 Maxmem = atop(phys_avail[pa_indx]); 2170 2171 /* Trim off space for the message buffer. */ 2172 phys_avail[pa_indx] -= round_page(msgbufsize); 2173 2174 /* Map the message buffer. 
*/
	/* Wire the message buffer pages just below the trimmed phys_avail end. */
	for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
		    off);
}

/*
 * Bring up the kernel debugger machinery: load the symbol table (DDB)
 * and drop into the debugger at boot when RB_KDB was requested.
 */
static void
i386_kdb_init(void)
{
#ifdef DDB
	db_fetch_ksymtab(bootinfo.bi_symtab, bootinfo.bi_esymtab);
#endif
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/*
 * Relocate every interrupt/trap gate in the IDT by setidt_disp, the
 * displacement of the copied trampoline handlers from their linked
 * addresses (see machdep_init_trampoline()).  Task gates and unused
 * slots are left untouched.
 */
static void
fixup_idt(void)
{
	struct gate_descriptor *ip;
	uintptr_t off;
	int x;

	for (x = 0; x < NIDT; x++) {
		ip = &idt[x];
		if (ip->gd_type != SDT_SYS386IGT &&
		    ip->gd_type != SDT_SYS386TGT)
			continue;
		/* Reassemble the split 16+16 bit handler offset. */
		off = ip->gd_looffset + (((u_int)ip->gd_hioffset) << 16);
		KASSERT(off >= (uintptr_t)start_exceptions &&
		    off < (uintptr_t)end_exceptions,
		    ("IDT[%d] type %d off %#x", x, ip->gd_type, off));
		off += setidt_disp;
		MPASS(off >= PMAP_TRM_MIN_ADDRESS &&
		    off < PMAP_TRM_MAX_ADDRESS);
		ip->gd_looffset = off;
		ip->gd_hioffset = off >> 16;
	}
}

/*
 * First-stage IDT population: point every vector at the "reserved"
 * handler, then install the architectural exception handlers.  User
 * gates (DPL 3): breakpoint, overflow, the int 0x80 syscall gate and,
 * if configured, the DTrace return vector.
 */
static void
i386_setidt1(void)
{
	int x;

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt(x, &IDTVEC(rsvd), SDT_SYS386IGT, SEL_KPL,
		    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_DE, &IDTVEC(div), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386IGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	/* Double fault goes through a task gate with its own TSS. */
	setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL,
	    SEL_KPL));
	setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386IGT,
	    SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	/* NOTE(review): IDT_MF is a trap gate (TGT), unlike its peers. */
	setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_AC, &IDTVEC(align), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall),
	    SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret),
	    SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall),
	    SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
#endif
}

/*
 * Second-stage IDT fixups, applied after finishidentcpu() (see
 * init386()): re-install the #UD and #GP handlers.
 */
static void
i386_setidt2(void)
{

	setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
}

#if defined(DEV_ISA) && !defined(DEV_ATPIC)
/*
 * With the legacy ATPIC compiled out, point the two ICU spurious
 * interrupt vectors (IRQ7/IRQ15) at the spurious-interrupt handler.
 */
static void
i386_setidt3(void)
{

	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint),
	    SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint),
	    SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
}

#endif

/*
 * i386 machine-dependent early initialization, called from locore
 * before mi_startup().  Sets up thread0/proc0, the boot GDT/IDT/TSS,
 * per-CPU data, the console, memory sizing and the FPU, in an order
 * that the hardware requires (descriptor tables must be loaded before
 * anything that takes a trap or touches per-CPU data).
 *
 * first: physical address of the first free page after the kernel.
 * Returns the address of thread0's PCB, used by locore as the initial
 * kernel stack location.
 */
register_t
init386(int first)
{
	struct region_descriptor r_gdt, r_idt;	/* table descriptors */
	int gsel_tss, metadata_missing, x, pa;
	struct pcpu *pc;
	struct xstate_hdr *xhdr;
	caddr_t kmdp;
	vm_offset_t addend;
	int late_console;

	thread0.td_kstack = proc0kstack;
	thread0.td_kstack_pages = TD0_KSTACK_PAGES;

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	if (bootinfo.bi_modulep) {
		metadata_missing = 0;
		/* Loader may pass a physical address; relocate below KERNBASE. */
		addend = (vm_paddr_t)bootinfo.bi_modulep < KERNBASE ?
		    PMAP_MAP_LOW : 0;
		preload_metadata = (caddr_t)bootinfo.bi_modulep + addend;
		preload_bootstrap_relocate(addend);
	} else {
		metadata_missing = 1;
	}

	if (bootinfo.bi_envp != 0) {
		addend = (vm_paddr_t)bootinfo.bi_envp < KERNBASE ?
		    PMAP_MAP_LOW : 0;
		init_static_kenv((char *)bootinfo.bi_envp + addend, 0);
	} else {
		init_static_kenv(NULL, 0);
	}

	identify_hypervisor();

	/* Init basic tunables, hz etc */
	init_param1();

	/*
	 * Make gdt memory segments. All segments cover the full 4GB
	 * of address space and permissions are enforced at page level.
	 */
	gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1);

	pc = &__pcpu[0];
	gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GPRIV_SEL].ssd_base = (int)pc;
	gdt_segs[GPROC0_SEL].ssd_base = (int)&common_tss0;

	for (x = 0; x < NGDT; x++)
		ssdtosd(&gdt_segs[x], &gdt0[x].sd);

	r_gdt.rd_limit = NGDT * sizeof(gdt0[0]) - 1;
	r_gdt.rd_base = (int)gdt0;
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
	lgdt(&r_gdt);

	/* Per-CPU area: identity-map the pages at 'first' for DPCPU. */
	pcpu_init(pc, 0, sizeof(struct pcpu));
	for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
		pmap_kenter(pa, pa);
	dpcpu_init((void *)first, 0);
	first += DPCPU_SIZE;
	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);
	/* Non-late cninit() and printf() can be moved up to here. */

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);

	i386_setidt1();

	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (int) idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	finishidentcpu();	/* Final stage of CPU initialization */
	i386_setidt2();
	initializecpu();	/* Initialize CPU registers */
	initializecpucache();

	/* pointer to selector slot for %fs/%gs */
	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);

	/* Initialize the tss (except for the final esp0) early for vm86. */
	common_tss0.tss_esp0 = thread0.td_kstack + thread0.td_kstack_pages *
	    PAGE_SIZE - VM86_STACK_SPACE;
	common_tss0.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
	common_tss0.tss_ioopt = sizeof(struct i386tss) << 16;
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
	ltr(gsel_tss);

	/* Initialize the PIC early for vm86 calls. */
#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	i386_setidt3();
#endif
#endif

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		i386_kdb_init();
	}

	kmdp = preload_search_by_type("elf kernel");
	link_elf_ireloc(kmdp);

	vm86_initialize();
	getmemsize(first);
	init_param2(physmem);

	/* now running on new page tables, configured,and u/iom is accessible */

	if (late_console)
		cninit();

	if (metadata_missing)
		printf("WARNING: loader(8) metadata is missing!\n");

	if (late_console)
		i386_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	npxinit(true);
	/*
	 * Set up thread0 pcb after npxinit calculated pcb + fpu save
	 * area size.  Zero out the extended state header in fpu save
	 * area.
	 */
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
	if (use_xsave) {
		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
		    1);
		xhdr->xstate_bv = xsave_mask;
	}
	PCPU_SET(curpcb, thread0.td_pcb);
	/* Move esp0 in the tss to its final place. */
	/* Note: -16 is so we can grow the trapframe if we came from vm86 */
	common_tss0.tss_esp0 = (vm_offset_t)thread0.td_pcb - VM86_STACK_SPACE;
	PCPU_SET(kesp0, common_tss0.tss_esp0);
	gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;	/* clear busy bit */
	ltr(gsel_tss);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
#if defined(PAE) || defined(PAE_TABLES)
	thread0.td_pcb->pcb_cr3 = (int)IdlePDPT;
#else
	thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
#endif
	thread0.td_pcb->pcb_ext = 0;
	thread0.td_frame = &proc0_tf;

	cpu_probe_amdc1e();

#ifdef FDT
	x86_init_fdt();
#endif

	/* Location of kernel stack for locore */
	return ((register_t)thread0.td_pcb);
}

extern u_int tramp_idleptd;

/*
 * Once the VM system is up (SI_SUB_VM), move the descriptor tables,
 * exception trampolines, double-fault TSS and LDT out of the boot
 * image into trampoline-mapped memory (pmap_trm_alloc), then reload
 * GDT/IDT/TR/LDT to point at the copies.  IDT gate offsets are
 * rewritten by fixup_idt() using setidt_disp, the distance the
 * handlers moved.
 */
static void
machdep_init_trampoline(void)
{
	struct region_descriptor r_gdt, r_idt;
	struct i386tss *tss;
	char *copyout_buf, *trampoline, *tramp_stack_base;
	u_int *tramp_idleptd_reloced;
	int x;

	/* Relocate the GDT (one slice per CPU) and reload it. */
	gdt = pmap_trm_alloc(sizeof(union descriptor) * NGDT * mp_ncpus,
	    M_NOWAIT | M_ZERO);
	bcopy(gdt0, gdt, sizeof(union descriptor) * NGDT);
	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (int)gdt;
	lgdt(&r_gdt);

	/* Relocate the TSS array and retarget the GPROC0 descriptor. */
	tss = pmap_trm_alloc(sizeof(struct i386tss) * mp_ncpus,
	    M_NOWAIT | M_ZERO);
	bcopy(&common_tss0, tss, sizeof(struct i386tss));
	gdt[GPROC0_SEL].sd.sd_lobase = (int)tss;
	gdt[GPROC0_SEL].sd.sd_hibase = (u_int)tss >> 24;
	gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
	ltr(GSEL(GPROC0_SEL, SEL_KPL));

	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
	PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
	PCPU_SET(common_tssp, tss);

	/* Copy the exception handlers into the trampoline region. */
	trampoline = pmap_trm_alloc(end_exceptions - start_exceptions,
	    M_NOWAIT);
	bcopy(start_exceptions, trampoline, end_exceptions - start_exceptions);
	tramp_stack_base = pmap_trm_alloc(TRAMP_STACK_SZ, M_NOWAIT);
	PCPU_SET(trampstk, (uintptr_t)tramp_stack_base + TRAMP_STACK_SZ -
	    VM86_STACK_SPACE);
	tss[0].tss_esp0 = PCPU_GET(trampstk);

	idt = pmap_trm_alloc(sizeof(idt0), M_NOWAIT | M_ZERO);
	bcopy(idt0, idt, sizeof(idt0));

	/* Re-initialize new IDT since the handlers were relocated */
	setidt_disp = trampoline - start_exceptions;
	fixup_idt();

	/* Patch the relocated copy of tramp_idleptd with the idle %cr3. */
	tramp_idleptd_reloced = (u_int *)((uintptr_t)&tramp_idleptd +
	    setidt_disp);
#if defined(PAE) || defined(PAE_TABLES)
	*tramp_idleptd_reloced = (u_int)IdlePDPT;
#else
	*tramp_idleptd_reloced = (u_int)IdlePTD;
#endif

	r_idt.rd_limit = sizeof(struct gate_descriptor) * NIDT - 1;
	r_idt.rd_base = (int)idt;
	lidt(&r_idt);

	/* dblfault TSS */
	dblfault_tss = pmap_trm_alloc(sizeof(struct i386tss), M_NOWAIT | M_ZERO);
	dblfault_stack = pmap_trm_alloc(PAGE_SIZE, M_NOWAIT);
	dblfault_tss->tss_esp = dblfault_tss->tss_esp0 =
	    dblfault_tss->tss_esp1 = dblfault_tss->tss_esp2 =
	    (int)dblfault_stack + PAGE_SIZE;
	dblfault_tss->tss_ss = dblfault_tss->tss_ss0 = dblfault_tss->tss_ss1 =
	    dblfault_tss->tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
#if defined(PAE) || defined(PAE_TABLES)
	dblfault_tss->tss_cr3 = (int)IdlePDPT;
#else
	dblfault_tss->tss_cr3 = (int)IdlePTD;
#endif
	dblfault_tss->tss_eip = (int)dblfault_handler;
	dblfault_tss->tss_eflags = PSL_KERNEL;
	dblfault_tss->tss_ds = dblfault_tss->tss_es =
	    dblfault_tss->tss_gs = GSEL(GDATA_SEL, SEL_KPL);
	dblfault_tss->tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
	dblfault_tss->tss_cs = GSEL(GCODE_SEL, SEL_KPL);
	dblfault_tss->tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
	gdt[GPANIC_SEL].sd.sd_lobase = (int)dblfault_tss;
	gdt[GPANIC_SEL].sd.sd_hibase = (u_int)dblfault_tss >> 24;

	/* make ldt memory segments */
	ldt = pmap_trm_alloc(sizeof(union descriptor) * NLDT,
	    M_NOWAIT | M_ZERO);
	gdt[GLDT_SEL].sd.sd_lobase = (int)ldt;
	gdt[GLDT_SEL].sd.sd_hibase = (u_int)ldt >> 24;
	ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
	ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
	for (x = 0; x < nitems(ldt_segs); x++)
		ssdtosd(&ldt_segs[x], &ldt[x].sd);

	_default_ldt = GSEL(GLDT_SEL, SEL_KPL);
	lldt(_default_ldt);
	PCPU_SET(currentldt, _default_ldt);

	/* Per-CPU bounce buffer used by the copyout trampoline. */
	copyout_buf = pmap_trm_alloc(TRAMP_COPYOUT_SZ, M_NOWAIT);
	PCPU_SET(copyout_buf, copyout_buf);
	copyout_init_tramp();
}
SYSINIT(vm_mem, SI_SUB_VM, SI_ORDER_SECOND, machdep_init_trampoline, NULL);

#ifdef COMPAT_43
/*
 * Install the 4.3BSD-compatibility lcall $7,$0 syscall gate in the
 * default LDT, pointing just below the process ps_strings area where
 * the lcall trampoline is copied out.
 */
static void
i386_setup_lcall_gate(void)
{
	struct sysentvec *sv;
	struct user_segment_descriptor desc;
	u_int lcall_addr;

	sv = &elf32_freebsd_sysvec;
	lcall_addr = (uintptr_t)sv->sv_psstrings - sz_lcall_tramp;

	bzero(&desc, sizeof(desc));
	desc.sd_type = SDT_MEMERA;
	desc.sd_dpl = SEL_UPL;
	desc.sd_p = 1;
	desc.sd_def32 = 1;
	desc.sd_gran = 1;
	desc.sd_lolimit = 0xffff;
	desc.sd_hilimit = 0xf;
	desc.sd_lobase = lcall_addr;
	desc.sd_hibase = lcall_addr >> 24;
	bcopy(&desc, &ldt[LSYS5CALLS_SEL], sizeof(desc));
}
SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_ANY, i386_setup_lcall_gate, NULL);
#endif

/*
 * MD per-CPU structure initialization hook; only the ACPI id needs a
 * sentinel here (cpuid and size are unused on i386).
 */
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

/*
 * sysctl handler exporting the raw BIOS SMAP (with extended
 * attributes, when present) from the loader-provided metadata.
 */
static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader.
*/
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf32 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	/*
	 * NOTE(review): relies on the loader storing the metadata byte
	 * length in the u_int32_t immediately preceding the array.
	 */
	count = *((u_int32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");

/*
 * Enter a spin-lock section: disable interrupts on first entry (saving
 * the flags for the matching outermost spinlock_exit()) and bump the
 * per-thread nesting count.
 */
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
	} else
		td->td_md.md_spinlock_count++;
	critical_enter();
}

/*
 * Leave a spin-lock section; restore the saved interrupt state when
 * the outermost section is exited.
 */
void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	critical_exit();
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0)
		intr_restore(flags);
}

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
static void f00f_hack(void *unused);
SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);

/*
 * Work around the Pentium F00F erratum: place the IDT so that the
 * problematic entry sits at the end of a read-only page, forcing the
 * lockup-inducing access to fault instead.
 */
static void
f00f_hack(void *unused)
{
	struct region_descriptor r_idt;
	struct gate_descriptor *new_idt;
	vm_offset_t tmp;

	if (!has_f00f_bug)
		return;

	GIANT_REQUIRED;

	printf("Intel Pentium detected, installing workaround for F00F bug\n");

	tmp = (vm_offset_t)pmap_trm_alloc(PAGE_SIZE * 3, M_NOWAIT | M_ZERO);
	if (tmp == 0)
		panic("kmem_malloc returned 0");
	tmp = round_page(tmp);

	/* Put the problematic entry (#6) at the end of the lower page. */
	new_idt = (struct gate_descriptor *)
	    (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor));
	bcopy(idt, new_idt, sizeof(idt0));
	r_idt.rd_base = (u_int)new_idt;
	r_idt.rd_limit = sizeof(idt0) - 1;
	lidt(&r_idt);
	/* SMP machines do not need the F00F hack. */
	idt = new_idt;
	pmap_protect(kernel_pmap, tmp, tmp + PAGE_SIZE, VM_PROT_READ);
}
#endif /* defined(I586_CPU) && !NO_F00F_HACK */

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_edi = tf->tf_edi;
	pcb->pcb_esi = tf->tf_esi;
	pcb->pcb_ebp = tf->tf_ebp;
	pcb->pcb_ebx = tf->tf_ebx;
	pcb->pcb_eip = tf->tf_eip;
	/*
	 * A kernel-mode trapframe has no esp/ss pushed; derive the stack
	 * pointer from the frame's end instead (user mode: use tf_esp).
	 */
	pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8;
	pcb->pcb_gs = rgs();
}

/* Set the instruction pointer in td's trapframe (ptrace PT_SETREGS helper). */
int
ptrace_set_pc(struct thread *td, u_long addr)
{

	td->td_frame->tf_eip = addr;
	return (0);
}

/* Arm the CPU single-step trap flag for td. */
int
ptrace_single_step(struct thread *td)
{
	td->td_frame->tf_eflags |= PSL_T;
	return (0);
}

/* Disarm the CPU single-step trap flag for td. */
int
ptrace_clear_single_step(struct thread *td)
{
	td->td_frame->tf_eflags &= ~PSL_T;
	return (0);
}

/* Copy td's register state (trapframe + pcb %gs) into *regs. */
int
fill_regs(struct thread *td, struct reg *regs)
{
	struct pcb *pcb;
	struct trapframe *tp;

	tp = td->td_frame;
	pcb = td->td_pcb;
	regs->r_gs = pcb->pcb_gs;
	return (fill_frame_regs(tp, regs));
}

/* Copy a trapframe's register values into *regs (no %gs — see fill_regs). */
int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{
	regs->r_fs = tp->tf_fs;
	regs->r_es = tp->tf_es;
	regs->r_ds = tp->tf_ds;
	regs->r_edi = tp->tf_edi;
	regs->r_esi = tp->tf_esi;
	regs->r_ebp = tp->tf_ebp;
	regs->r_ebx = tp->tf_ebx;
	regs->r_edx = tp->tf_edx;
	regs->r_ecx = tp->tf_ecx;
	regs->r_eax = tp->tf_eax;
	regs->r_eip = tp->tf_eip;
	regs->r_cs = tp->tf_cs;
	regs->r_eflags = tp->tf_eflags;
	regs->r_esp = tp->tf_esp;
	regs->r_ss = tp->tf_ss;
	return (0);
}

/*
 * Install *regs into td's trapframe, rejecting eflags/%cs values that
 * would grant the debuggee extra privilege (EFL_SECURE/CS_SECURE).
 */
int
set_regs(struct thread *td, struct reg *regs)
{
	struct pcb *pcb;
	struct trapframe *tp;

	tp = td->td_frame;
	if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	pcb = td->td_pcb;
	tp->tf_fs = regs->r_fs;
	tp->tf_es = regs->r_es;
	tp->tf_ds = regs->r_ds;
	tp->tf_edi = regs->r_edi;
	tp->tf_esi = regs->r_esi;
	tp->tf_ebp = regs->r_ebp;
	tp->tf_ebx = regs->r_ebx;
	tp->tf_edx = regs->r_edx;
	tp->tf_ecx = regs->r_ecx;
	tp->tf_eax = regs->r_eax;
	tp->tf_eip = regs->r_eip;
	tp->tf_cs = regs->r_cs;
	tp->tf_eflags = regs->r_eflags;
	tp->tf_esp = regs->r_esp;
	tp->tf_ss = regs->r_ss;
	pcb->pcb_gs = regs->r_gs;
	return (0);
}

/*
 * Copy td's FPU state into *fpregs, converting from the FXSAVE (XMM)
 * layout to the legacy save87 layout when the CPU uses fxsr.
 */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
	    P_SHOULDSTOP(td->td_proc),
	    ("not suspended thread %p", td));
	npxgetregs(td);
	if (cpu_fxsr)
		npx_fill_fpregs_xmm(&get_pcb_user_save_td(td)->sv_xmm,
		    (struct save87 *)fpregs);
	else
		bcopy(&get_pcb_user_save_td(td)->sv_87, fpregs,
		    sizeof(*fpregs));
	return (0);
}

/* Inverse of fill_fpregs(): install *fpregs as td's user FPU state. */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

	if (cpu_fxsr)
		npx_set_fpregs_xmm((struct save87 *)fpregs,
		    &get_pcb_user_save_td(td)->sv_xmm);
	else
		bcopy(fpregs, &get_pcb_user_save_td(td)->sv_87,
		    sizeof(*fpregs));
	npxuserinited(td);
	return (0);
}

/*
 * Get machine context.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct trapframe *tp;
	struct segment_descriptor *sdp;

	tp = td->td_frame;

	/*
	 * NOTE(review): locks curthread's proc, not td's — fine when
	 * td == curthread; confirm callers never pass another thread.
	 */
	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_esp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_gs = td->td_pcb->pcb_gs;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_es = tp->tf_es;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_edi = tp->tf_edi;
	mcp->mc_esi = tp->tf_esi;
	mcp->mc_ebp = tp->tf_ebp;
	mcp->mc_isp = tp->tf_isp;
	mcp->mc_eflags = tp->tf_eflags;
	if (flags & GET_MC_CLEAR_RET) {
		/* Hide the syscall return values (and carry = error flag). */
		mcp->mc_eax = 0;
		mcp->mc_edx = 0;
		mcp->mc_eflags &= ~PSL_C;
	} else {
		mcp->mc_eax = tp->tf_eax;
		mcp->mc_edx = tp->tf_edx;
	}
	mcp->mc_ebx = tp->tf_ebx;
	mcp->mc_ecx = tp->tf_ecx;
	mcp->mc_eip = tp->tf_eip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_esp = tp->tf_esp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_len = sizeof(*mcp);
	get_fpcontext(td, mcp, NULL, 0);
	sdp = &td->td_pcb->pcb_fsd;
	mcp->mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase;
	sdp = &td->td_pcb->pcb_gsd;
	mcp->mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase;
	mcp->mc_flags = 0;
	mcp->mc_xfpustate = 0;
	mcp->mc_xfpustate_len = 0;
	bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2));
	return (0);
}

/*
 * Set machine context.
 *
 * However, we don't set any but the user modifiable flags, and we won't
 * touch the cs selector.
 */
int
set_mcontext(struct thread *td, mcontext_t *mcp)
{
	struct trapframe *tp;
	char *xfpustate;
	int eflags, ret;

	tp = td->td_frame;
	if (mcp->mc_len != sizeof(*mcp) ||
	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
		return (EINVAL);
	/* Accept only the user-modifiable eflags bits from the context. */
	eflags = (mcp->mc_eflags & PSL_USERCHANGE) |
	    (tp->tf_eflags & ~PSL_USERCHANGE);
	if (mcp->mc_flags & _MC_HASFPXSTATE) {
		/* Length is bounded above before the stack allocation. */
		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
		    sizeof(union savefpu))
			return (EINVAL);
		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
		    mcp->mc_xfpustate_len);
		if (ret != 0)
			return (ret);
	} else
		xfpustate = NULL;
	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
	if (ret != 0)
		return (ret);
	tp->tf_fs = mcp->mc_fs;
	tp->tf_es = mcp->mc_es;
	tp->tf_ds = mcp->mc_ds;
	tp->tf_edi = mcp->mc_edi;
	tp->tf_esi = mcp->mc_esi;
	tp->tf_ebp = mcp->mc_ebp;
	tp->tf_ebx = mcp->mc_ebx;
	tp->tf_edx = mcp->mc_edx;
	tp->tf_ecx = mcp->mc_ecx;
	tp->tf_eax = mcp->mc_eax;
	tp->tf_eip = mcp->mc_eip;
	tp->tf_eflags = eflags;
	tp->tf_esp = mcp->mc_esp;
	tp->tf_ss = mcp->mc_ss;
	td->td_pcb->pcb_gs = mcp->mc_gs;
	return (0);
}

/*
 * Fill mcp with td's FPU state; optionally copy the extended (xsave)
 * state into xfpusave, up to xfpusave_len bytes.
 */
static void
get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
    size_t xfpusave_len)
{
	size_t max_len, len;

	mcp->mc_ownedfp = npxgetregs(td);
	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
	    sizeof(mcp->mc_fpstate));
	mcp->mc_fpformat = npxformat();
	if (!use_xsave || xfpusave_len == 0)
		return;
	max_len = cpu_max_ext_state_size - sizeof(union savefpu);
	len = xfpusave_len;
	if (len > max_len) {
		len = max_len;
		/*
		 * NOTE(review): after the clamp above, len - max_len is 0,
		 * so this bzero is a no-op — possibly meant to clear the
		 * caller's excess (xfpusave_len - max_len) bytes; verify
		 * against other ports.
		 */
		bzero(xfpusave + max_len, len - max_len);
	}
	mcp->mc_flags |= _MC_HASFPXSTATE;
	mcp->mc_xfpustate_len = len;
	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
}

/*
 * Install the FPU state from mcp (plus optional extended state in
 * xfpustate) into td, validating format and ownership tags first.
 */
static int
set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
    size_t xfpustate_len)
{
	int error;

	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
		return (0);
	else if (mcp->mc_fpformat != _MC_FPFMT_387 &&
	    mcp->mc_fpformat != _MC_FPFMT_XMM)
		return (EINVAL);
	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
		/* We don't care what state is left in the FPU or PCB. */
		fpstate_drop(td);
		error = 0;
	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
		error = npxsetregs(td, (union savefpu *)&mcp->mc_fpstate,
		    xfpustate, xfpustate_len);
	} else
		return (EINVAL);
	return (error);
}

/*
 * Discard td's pending FPU state so the next FPU use reinitializes it.
 */
static void
fpstate_drop(struct thread *td)
{

	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
	critical_enter();
	if (PCPU_GET(fpcurthread) == td)
		npxdrop();
	/*
	 * XXX force a full drop of the npx.  The above only drops it if we
	 * owned it.  npxgetregs() has the same bug in the !cpu_fxsr case.
	 *
	 * XXX I don't much like npxgetregs()'s semantics of doing a full
	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
	 * We only need to drop to !PCB_INITDONE in sendsig().  But
	 * sendsig() is the only caller of npxgetregs()... perhaps we just
	 * have too many layers.
	 */
	curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE |
	    PCB_NPXUSERINITDONE);
	critical_exit();
}

/*
 * Read the debug registers into *dbregs: live hardware values when
 * td == NULL, otherwise the values saved in td's PCB.  dr4/dr5 are
 * reserved on x86 and reported as zero.
 */
int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (td == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
	} else {
		pcb = td->td_pcb;
		dbregs->dr[0] = pcb->pcb_dr0;
		dbregs->dr[1] = pcb->pcb_dr1;
		dbregs->dr[2] = pcb->pcb_dr2;
		dbregs->dr[3] = pcb->pcb_dr3;
		dbregs->dr[6] = pcb->pcb_dr6;
		dbregs->dr[7] = pcb->pcb_dr7;
	}
	dbregs->dr[4] = 0;
	dbregs->dr[5] = 0;
	return (0);
}

/*
 * Write *dbregs to the hardware (td == NULL) or to td's PCB after
 * validating dr7 encodings and that every enabled breakpoint address
 * lies in user space.
 */
int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;
	int i;

	if (td == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * result in undefined behaviour and can lead to an unexpected
		 * TRCTRAP.
		 */
		for (i = 0; i < 4; i++) {
			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
				return (EINVAL);
			if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02)
				return (EINVAL);
		}

		pcb = td->td_pcb;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * process's address space.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */

		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
			/* dr0 is enabled */
			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
			/* dr1 is enabled */
			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
			/* dr2 is enabled */
			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
			/* dr3 is enabled */
			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		pcb->pcb_flags |= PCB_DBREGS;
	}

	return (0);
}

/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
3163 */ 3164 int 3165 user_dbreg_trap(void) 3166 { 3167 u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */ 3168 u_int32_t bp; /* breakpoint bits extracted from dr6 */ 3169 int nbp; /* number of breakpoints that triggered */ 3170 caddr_t addr[4]; /* breakpoint addresses */ 3171 int i; 3172 3173 dr7 = rdr7(); 3174 if ((dr7 & 0x000000ff) == 0) { 3175 /* 3176 * all GE and LE bits in the dr7 register are zero, 3177 * thus the trap couldn't have been caused by the 3178 * hardware debug registers 3179 */ 3180 return 0; 3181 } 3182 3183 nbp = 0; 3184 dr6 = rdr6(); 3185 bp = dr6 & 0x0000000f; 3186 3187 if (!bp) { 3188 /* 3189 * None of the breakpoint bits are set meaning this 3190 * trap was not caused by any of the debug registers 3191 */ 3192 return 0; 3193 } 3194 3195 /* 3196 * at least one of the breakpoints were hit, check to see 3197 * which ones and if any of them are user space addresses 3198 */ 3199 3200 if (bp & 0x01) { 3201 addr[nbp++] = (caddr_t)rdr0(); 3202 } 3203 if (bp & 0x02) { 3204 addr[nbp++] = (caddr_t)rdr1(); 3205 } 3206 if (bp & 0x04) { 3207 addr[nbp++] = (caddr_t)rdr2(); 3208 } 3209 if (bp & 0x08) { 3210 addr[nbp++] = (caddr_t)rdr3(); 3211 } 3212 3213 for (i = 0; i < nbp; i++) { 3214 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { 3215 /* 3216 * addr[i] is in user space 3217 */ 3218 return nbp; 3219 } 3220 } 3221 3222 /* 3223 * None of the breakpoints are in user space. 3224 */ 3225 return 0; 3226 } 3227 3228 #ifdef KDB 3229 3230 /* 3231 * Provide inb() and outb() as functions. They are normally only available as 3232 * inline functions, thus cannot be called from the debugger. 3233 */ 3234 3235 /* silence compiler warnings */ 3236 u_char inb_(u_short); 3237 void outb_(u_short, u_char); 3238 3239 u_char 3240 inb_(u_short port) 3241 { 3242 return inb(port); 3243 } 3244 3245 void 3246 outb_(u_short port, u_char data) 3247 { 3248 outb(port, data); 3249 } 3250 3251 #endif /* KDB */ 3252