1 /*- 2 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 3 * Copyright (c) 1992 Terrence R. Lambert. 4 * Copyright (c) 2003 Peter Wemm. 5 * Copyright (c) 2008 The DragonFly Project. 6 * All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * William Jolitz. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the University of 22 * California, Berkeley and its contributors. 23 * 4. Neither the name of the University nor the names of its contributors 24 * may be used to endorse or promote products derived from this software 25 * without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 37 * SUCH DAMAGE. 38 * 39 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 40 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $ 41 */ 42 43 //#include "use_npx.h" 44 #include "use_isa.h" 45 #include "opt_atalk.h" 46 #include "opt_compat.h" 47 #include "opt_cpu.h" 48 #include "opt_ddb.h" 49 #include "opt_directio.h" 50 #include "opt_inet.h" 51 #include "opt_ipx.h" 52 #include "opt_msgbuf.h" 53 #include "opt_swap.h" 54 #include "opt_apic.h" 55 56 #include <sys/param.h> 57 #include <sys/systm.h> 58 #include <sys/sysproto.h> 59 #include <sys/signalvar.h> 60 #include <sys/kernel.h> 61 #include <sys/linker.h> 62 #include <sys/malloc.h> 63 #include <sys/proc.h> 64 #include <sys/priv.h> 65 #include <sys/buf.h> 66 #include <sys/reboot.h> 67 #include <sys/mbuf.h> 68 #include <sys/msgbuf.h> 69 #include <sys/sysent.h> 70 #include <sys/sysctl.h> 71 #include <sys/vmmeter.h> 72 #include <sys/bus.h> 73 #include <sys/upcall.h> 74 #include <sys/usched.h> 75 #include <sys/reg.h> 76 77 #include <vm/vm.h> 78 #include <vm/vm_param.h> 79 #include <sys/lock.h> 80 #include <vm/vm_kern.h> 81 #include <vm/vm_object.h> 82 #include <vm/vm_page.h> 83 #include <vm/vm_map.h> 84 #include <vm/vm_pager.h> 85 #include <vm/vm_extern.h> 86 87 #include <sys/thread2.h> 88 #include <sys/mplock2.h> 89 90 #include <sys/user.h> 91 #include <sys/exec.h> 92 #include <sys/cons.h> 93 94 #include <ddb/ddb.h> 95 96 #include <machine/cpu.h> 97 #include <machine/clock.h> 98 #include <machine/specialreg.h> 99 #if JG 100 #include <machine/bootinfo.h> 101 #endif 102 #include <machine/md_var.h> 103 #include <machine/metadata.h> 104 #include <machine/pc/bios.h> 105 #include <machine/pcb_ext.h> /* pcb.h included via sys/user.h */ 106 #include <machine/globaldata.h> /* CPU_prvspace */ 107 #include <machine/smp.h> 108 #ifdef PERFMON 109 #include <machine/perfmon.h> 110 #endif 111 #include <machine/cputypes.h> 112 113 #ifdef OLD_BUS_ARCH 114 #include <bus/isa/isa_device.h> 115 #endif 116 #include <machine_base/isa/intr_machdep.h> 117 #include <bus/isa/rtc.h> 118 #include <sys/random.h> 119 #include <sys/ptrace.h> 120 #include <machine/sigframe.h> 121 122 #include <sys/machintr.h> 123 124 #define PHYSMAP_ENTRIES 10 125 126 extern void init386(int first); 127 extern void dblfault_handler(void); 128 extern u_int64_t hammer_time(u_int64_t, u_int64_t); 129 130 extern void printcpuinfo(void); /* XXX header file */ 131 extern void identify_cpu(void); 132 #if JG 133 extern void finishidentcpu(void); 134 #endif 135 extern void panicifcpuunsupported(void); 136 137 static void cpu_startup(void *); 138 #ifndef CPU_DISABLE_SSE 139 static void set_fpregs_xmm(struct save87 *, struct savexmm *); 140 static void fill_fpregs_xmm(struct savexmm *, struct save87 *); 141 #endif /* CPU_DISABLE_SSE */ 142 #ifdef DIRECTIO 143 extern void ffs_rawread_setup(void); 144 #endif /* DIRECTIO */ 145 static void init_locks(void); 146 147 SYSINIT(cpu, SI_BOOT2_SMP, SI_ORDER_FIRST, cpu_startup, NULL) 148 149 #ifdef DDB 150 extern vm_offset_t ksym_start, ksym_end; 151 #endif 152 153 struct privatespace CPU_prvspace[MAXCPU] __aligned(4096); /* XXX */ 154 155 int _udatasel, _ucodesel, _ucode32sel; 156 u_long atdevbase; 157 #ifdef SMP 158 int64_t tsc_offsets[MAXCPU]; 159 #else 160 int64_t tsc_offsets[1]; 161 #endif 162 163 #if defined(SWTCH_OPTIM_STATS) 164 extern int swtch_optim_stats; 165 SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats, 166 CTLFLAG_RD, &swtch_optim_stats, 0, ""); 167 SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, 168 CTLFLAG_RD, &tlb_flush_count, 0, ""); 169 #endif 170 171 int physmem = 0; 172 173 u_long ebda_addr = 0; 174 175 static int 176 sysctl_hw_physmem(SYSCTL_HANDLER_ARGS) 177 { 178 int error = sysctl_handle_int(oidp, 0, ctob(physmem), req); 179 return (error); 180 } 181 182 SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD, 183 0, 0, sysctl_hw_physmem, "IU", ""); 184 185 static int 186 sysctl_hw_usermem(SYSCTL_HANDLER_ARGS) 187 { 188 int error = sysctl_handle_int(oidp, 0, 189 ctob(physmem - vmstats.v_wire_count), req); 190 return (error); 191 } 192 193 SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD, 194 0, 0, sysctl_hw_usermem, "IU", ""); 195 196 static int 197 sysctl_hw_availpages(SYSCTL_HANDLER_ARGS) 198 { 199 int error = sysctl_handle_int(oidp, 0, 200 x86_64_btop(avail_end - avail_start), req); 201 return (error); 202 } 203 204 SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD, 205 0, 0, sysctl_hw_availpages, "I", ""); 206 207 vm_paddr_t Maxmem; 208 vm_paddr_t Realmem; 209 210 /* 211 * The number of PHYSMAP entries must be one less than the number of 212 * PHYSSEG entries because the PHYSMAP entry that spans the largest 213 * physical address that is accessible by ISA DMA is split into two 214 * PHYSSEG entries. 215 */ 216 #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) 217 218 vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; 219 vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; 220 221 /* must be 2 less so 0 0 can signal end of chunks */ 222 #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2) 223 #define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2) 224 225 static vm_offset_t buffer_sva, buffer_eva; 226 vm_offset_t clean_sva, clean_eva; 227 static vm_offset_t pager_sva, pager_eva; 228 static struct trapframe proc0_tf; 229 230 static void 231 cpu_startup(void *dummy) 232 { 233 caddr_t v; 234 vm_size_t size = 0; 235 vm_offset_t firstaddr; 236 237 if (boothowto & RB_VERBOSE) 238 bootverbose++; 239 240 /* 241 * Good {morning,afternoon,evening,night}. 242 */ 243 kprintf("%s", version); 244 startrtclock(); 245 printcpuinfo(); 246 panicifcpuunsupported(); 247 #ifdef PERFMON 248 perfmon_init(); 249 #endif 250 kprintf("real memory = %ju (%ju MB)\n", 251 (intmax_t)Realmem, 252 (intmax_t)Realmem / 1024 / 1024); 253 /* 254 * Display any holes after the first chunk of extended memory. 255 */ 256 if (bootverbose) { 257 int indx; 258 259 kprintf("Physical memory chunk(s):\n"); 260 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { 261 vm_paddr_t size1 = phys_avail[indx + 1] - phys_avail[indx]; 262 263 kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n", 264 (intmax_t)phys_avail[indx], 265 (intmax_t)phys_avail[indx + 1] - 1, 266 (intmax_t)size1, 267 (intmax_t)(size1 / PAGE_SIZE)); 268 } 269 } 270 271 /* 272 * Allocate space for system data structures. 273 * The first available kernel virtual address is in "v". 274 * As pages of kernel virtual memory are allocated, "v" is incremented. 275 * As pages of memory are allocated and cleared, 276 * "firstaddr" is incremented. 277 * An index into the kernel page table corresponding to the 278 * virtual memory address maintained in "v" is kept in "mapaddr". 279 */ 280 281 /* 282 * Make two passes. The first pass calculates how much memory is 283 * needed and allocates it. The second pass assigns virtual 284 * addresses to the various data structures. 285 */ 286 firstaddr = 0; 287 again: 288 v = (caddr_t)firstaddr; 289 290 #define valloc(name, type, num) \ 291 (name) = (type *)v; v = (caddr_t)((name)+(num)) 292 #define valloclim(name, type, num, lim) \ 293 (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) 294 295 /* 296 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. 297 * For the first 64MB of ram nominally allocate sufficient buffers to 298 * cover 1/4 of our ram. Beyond the first 64MB allocate additional 299 * buffers to cover 1/20 of our ram over 64MB. When auto-sizing 300 * the buffer cache we limit the eventual kva reservation to 301 * maxbcache bytes. 302 * 303 * factor represents the 1/4 x ram conversion. 304 */ 305 if (nbuf == 0) { 306 int factor = 4 * BKVASIZE / 1024; 307 int kbytes = physmem * (PAGE_SIZE / 1024); 308 309 nbuf = 50; 310 if (kbytes > 4096) 311 nbuf += min((kbytes - 4096) / factor, 65536 / factor); 312 if (kbytes > 65536) 313 nbuf += (kbytes - 65536) * 2 / (factor * 5); 314 if (maxbcache && nbuf > maxbcache / BKVASIZE) 315 nbuf = maxbcache / BKVASIZE; 316 } 317 318 /* 319 * Do not allow the buffer_map to be more then 1/2 the size of the 320 * kernel_map. 321 */ 322 if (nbuf > (virtual_end - virtual_start) / (BKVASIZE * 2)) { 323 nbuf = (virtual_end - virtual_start) / (BKVASIZE * 2); 324 kprintf("Warning: nbufs capped at %d\n", nbuf); 325 } 326 327 nswbuf = max(min(nbuf/4, 256), 16); 328 #ifdef NSWBUF_MIN 329 if (nswbuf < NSWBUF_MIN) 330 nswbuf = NSWBUF_MIN; 331 #endif 332 #ifdef DIRECTIO 333 ffs_rawread_setup(); 334 #endif 335 336 valloc(swbuf, struct buf, nswbuf); 337 valloc(buf, struct buf, nbuf); 338 339 /* 340 * End of first pass, size has been calculated so allocate memory 341 */ 342 if (firstaddr == 0) { 343 size = (vm_size_t)(v - firstaddr); 344 firstaddr = kmem_alloc(&kernel_map, round_page(size)); 345 if (firstaddr == 0) 346 panic("startup: no room for tables"); 347 goto again; 348 } 349 350 /* 351 * End of second pass, addresses have been assigned 352 */ 353 if ((vm_size_t)(v - firstaddr) != size) 354 panic("startup: table size inconsistency"); 355 356 kmem_suballoc(&kernel_map, &clean_map, &clean_sva, &clean_eva, 357 (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size); 358 kmem_suballoc(&clean_map, &buffer_map, &buffer_sva, &buffer_eva, 359 (nbuf*BKVASIZE)); 360 buffer_map.system_map = 1; 361 kmem_suballoc(&clean_map, &pager_map, &pager_sva, &pager_eva, 362 (nswbuf*MAXPHYS) + pager_map_size); 363 pager_map.system_map = 1; 364 365 #if defined(USERCONFIG) 366 userconfig(); 367 cninit(); /* the preferred console may have changed */ 368 #endif 369 370 kprintf("avail memory = %ju (%ju MB)\n", 371 (uintmax_t)ptoa(vmstats.v_free_count), 372 (uintmax_t)ptoa(vmstats.v_free_count) / 1024 / 1024); 373 374 /* 375 * Set up buffers, so they can be used to read disk labels. 376 */ 377 bufinit(); 378 vm_pager_bufferinit(); 379 380 #ifdef SMP 381 /* 382 * OK, enough kmem_alloc/malloc state should be up, lets get on with it! 383 */ 384 mp_start(); /* fire up the APs and APICs */ 385 mp_announce(); 386 #endif /* SMP */ 387 cpu_setregs(); 388 } 389 390 /* 391 * Send an interrupt to process. 392 * 393 * Stack is set up to allow sigcode stored 394 * at top to call routine, followed by kcall 395 * to sigreturn routine below. After sigreturn 396 * resets the signal mask, the stack, and the 397 * frame pointer, it returns to the user 398 * specified pc, psl. 399 */ 400 void 401 sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code) 402 { 403 struct lwp *lp = curthread->td_lwp; 404 struct proc *p = lp->lwp_proc; 405 struct trapframe *regs; 406 struct sigacts *psp = p->p_sigacts; 407 struct sigframe sf, *sfp; 408 int oonstack; 409 char *sp; 410 411 regs = lp->lwp_md.md_regs; 412 oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0; 413 414 /* Save user context */ 415 bzero(&sf, sizeof(struct sigframe)); 416 sf.sf_uc.uc_sigmask = *mask; 417 sf.sf_uc.uc_stack = lp->lwp_sigstk; 418 sf.sf_uc.uc_mcontext.mc_onstack = oonstack; 419 KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0); 420 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe)); 421 422 /* Make the size of the saved context visible to userland */ 423 sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); 424 425 /* Save mailbox pending state for syscall interlock semantics */ 426 if (p->p_flag & P_MAILBOX) 427 sf.sf_uc.uc_mcontext.mc_xflags |= PGEX_MAILBOX; 428 429 /* Allocate and validate space for the signal handler context. */ 430 if ((lp->lwp_flag & LWP_ALTSTACK) != 0 && !oonstack && 431 SIGISMEMBER(psp->ps_sigonstack, sig)) { 432 sp = (char *)(lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size - 433 sizeof(struct sigframe)); 434 lp->lwp_sigstk.ss_flags |= SS_ONSTACK; 435 } else { 436 /* We take red zone into account */ 437 sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128; 438 } 439 440 /* Align to 16 bytes */ 441 sfp = (struct sigframe *)((intptr_t)sp & ~(intptr_t)0xF); 442 443 /* Translate the signal is appropriate */ 444 if (p->p_sysent->sv_sigtbl) { 445 if (sig <= p->p_sysent->sv_sigsize) 446 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 447 } 448 449 /* 450 * Build the argument list for the signal handler. 451 * 452 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx) 453 */ 454 regs->tf_rdi = sig; /* argument 1 */ 455 regs->tf_rdx = (register_t)&sfp->sf_uc; /* argument 3 */ 456 457 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 458 /* 459 * Signal handler installed with SA_SIGINFO. 460 * 461 * action(signo, siginfo, ucontext) 462 */ 463 regs->tf_rsi = (register_t)&sfp->sf_si; /* argument 2 */ 464 regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */ 465 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 466 467 /* fill siginfo structure */ 468 sf.sf_si.si_signo = sig; 469 sf.sf_si.si_code = code; 470 sf.sf_si.si_addr = (void *)regs->tf_addr; 471 } else { 472 /* 473 * Old FreeBSD-style arguments. 474 * 475 * handler (signo, code, [uc], addr) 476 */ 477 regs->tf_rsi = (register_t)code; /* argument 2 */ 478 regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */ 479 sf.sf_ahu.sf_handler = catcher; 480 } 481 482 /* 483 * If we're a vm86 process, we want to save the segment registers. 484 * We also change eflags to be our emulated eflags, not the actual 485 * eflags. 486 */ 487 #if JG 488 if (regs->tf_eflags & PSL_VM) { 489 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 490 struct vm86_kernel *vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86; 491 492 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; 493 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; 494 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; 495 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; 496 497 if (vm86->vm86_has_vme == 0) 498 sf.sf_uc.uc_mcontext.mc_eflags = 499 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 500 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 501 502 /* 503 * Clear PSL_NT to inhibit T_TSSFLT faults on return from 504 * syscalls made by the signal handler. This just avoids 505 * wasting time for our lazy fixup of such faults. PSL_NT 506 * does nothing in vm86 mode, but vm86 programs can set it 507 * almost legitimately in probes for old cpu types. 508 */ 509 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 510 } 511 #endif 512 513 /* 514 * Save the FPU state and reinit the FP unit 515 */ 516 npxpush(&sf.sf_uc.uc_mcontext); 517 518 /* 519 * Copy the sigframe out to the user's stack. 520 */ 521 if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) { 522 /* 523 * Something is wrong with the stack pointer. 524 * ...Kill the process. 525 */ 526 sigexit(lp, SIGILL); 527 } 528 529 regs->tf_rsp = (register_t)sfp; 530 regs->tf_rip = PS_STRINGS - *(p->p_sysent->sv_szsigcode); 531 532 /* 533 * i386 abi specifies that the direction flag must be cleared 534 * on function entry 535 */ 536 regs->tf_rflags &= ~(PSL_T|PSL_D); 537 538 /* 539 * 64 bit mode has a code and stack selector but 540 * no data or extra selector. %fs and %gs are not 541 * stored in-context. 542 */ 543 regs->tf_cs = _ucodesel; 544 regs->tf_ss = _udatasel; 545 } 546 547 /* 548 * Sanitize the trapframe for a virtual kernel passing control to a custom 549 * VM context. Remove any items that would otherwise create a privilage 550 * issue. 551 * 552 * XXX at the moment we allow userland to set the resume flag. Is this a 553 * bad idea? 554 */ 555 int 556 cpu_sanitize_frame(struct trapframe *frame) 557 { 558 frame->tf_cs = _ucodesel; 559 frame->tf_ss = _udatasel; 560 /* XXX VM (8086) mode not supported? */ 561 frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP); 562 frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I; 563 564 return(0); 565 } 566 567 /* 568 * Sanitize the tls so loading the descriptor does not blow up 569 * on us. For x86_64 we don't have to do anything. 570 */ 571 int 572 cpu_sanitize_tls(struct savetls *tls) 573 { 574 return(0); 575 } 576 577 /* 578 * sigreturn(ucontext_t *sigcntxp) 579 * 580 * System call to cleanup state after a signal 581 * has been taken. Reset signal mask and 582 * stack state from context left by sendsig (above). 583 * Return to previous pc and psl as specified by 584 * context left by sendsig. Check carefully to 585 * make sure that the user has not modified the 586 * state to gain improper privileges. 587 * 588 * MPSAFE 589 */ 590 #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) 591 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) 592 593 int 594 sys_sigreturn(struct sigreturn_args *uap) 595 { 596 struct lwp *lp = curthread->td_lwp; 597 struct proc *p = lp->lwp_proc; 598 struct trapframe *regs; 599 ucontext_t uc; 600 ucontext_t *ucp; 601 register_t rflags; 602 int cs; 603 int error; 604 605 /* 606 * We have to copy the information into kernel space so userland 607 * can't modify it while we are sniffing it. 608 */ 609 regs = lp->lwp_md.md_regs; 610 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 611 if (error) 612 return (error); 613 ucp = &uc; 614 rflags = ucp->uc_mcontext.mc_rflags; 615 616 /* VM (8086) mode not supported */ 617 rflags &= ~PSL_VM_UNSUPP; 618 619 #if JG 620 if (eflags & PSL_VM) { 621 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 622 struct vm86_kernel *vm86; 623 624 /* 625 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 626 * set up the vm86 area, and we can't enter vm86 mode. 627 */ 628 if (lp->lwp_thread->td_pcb->pcb_ext == 0) 629 return (EINVAL); 630 vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86; 631 if (vm86->vm86_inited == 0) 632 return (EINVAL); 633 634 /* go back to user mode if both flags are set */ 635 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) 636 trapsignal(lp, SIGBUS, 0); 637 638 if (vm86->vm86_has_vme) { 639 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 640 (eflags & VME_USERCHANGE) | PSL_VM; 641 } else { 642 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 643 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 644 (eflags & VM_USERCHANGE) | PSL_VM; 645 } 646 bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe)); 647 tf->tf_eflags = eflags; 648 tf->tf_vm86_ds = tf->tf_ds; 649 tf->tf_vm86_es = tf->tf_es; 650 tf->tf_vm86_fs = tf->tf_fs; 651 tf->tf_vm86_gs = tf->tf_gs; 652 tf->tf_ds = _udatasel; 653 tf->tf_es = _udatasel; 654 tf->tf_fs = _udatasel; 655 tf->tf_gs = _udatasel; 656 } else 657 #endif 658 { 659 /* 660 * Don't allow users to change privileged or reserved flags. 661 */ 662 /* 663 * XXX do allow users to change the privileged flag PSL_RF. 664 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers 665 * should sometimes set it there too. tf_eflags is kept in 666 * the signal context during signal handling and there is no 667 * other place to remember it, so the PSL_RF bit may be 668 * corrupted by the signal handler without us knowing. 669 * Corruption of the PSL_RF bit at worst causes one more or 670 * one less debugger trap, so allowing it is fairly harmless. 671 */ 672 if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) { 673 kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags); 674 return(EINVAL); 675 } 676 677 /* 678 * Don't allow users to load a valid privileged %cs. Let the 679 * hardware check for invalid selectors, excess privilege in 680 * other selectors, invalid %eip's and invalid %esp's. 681 */ 682 cs = ucp->uc_mcontext.mc_cs; 683 if (!CS_SECURE(cs)) { 684 kprintf("sigreturn: cs = 0x%x\n", cs); 685 trapsignal(lp, SIGBUS, T_PROTFLT); 686 return(EINVAL); 687 } 688 bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(struct trapframe)); 689 } 690 691 /* 692 * Restore the FPU state from the frame 693 */ 694 crit_enter(); 695 npxpop(&ucp->uc_mcontext); 696 697 /* 698 * Merge saved signal mailbox pending flag to maintain interlock 699 * semantics against system calls. 700 */ 701 if (ucp->uc_mcontext.mc_xflags & PGEX_MAILBOX) 702 p->p_flag |= P_MAILBOX; 703 704 if (ucp->uc_mcontext.mc_onstack & 1) 705 lp->lwp_sigstk.ss_flags |= SS_ONSTACK; 706 else 707 lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK; 708 709 lp->lwp_sigmask = ucp->uc_sigmask; 710 SIG_CANTMASK(lp->lwp_sigmask); 711 crit_exit(); 712 return(EJUSTRETURN); 713 } 714 715 /* 716 * Stack frame on entry to function. %rax will contain the function vector, 717 * %rcx will contain the function data. flags, rcx, and rax will have 718 * already been pushed on the stack. 719 */ 720 struct upc_frame { 721 register_t rax; 722 register_t rcx; 723 register_t rdx; 724 register_t flags; 725 register_t oldip; 726 }; 727 728 void 729 sendupcall(struct vmupcall *vu, int morepending) 730 { 731 struct lwp *lp = curthread->td_lwp; 732 struct trapframe *regs; 733 struct upcall upcall; 734 struct upc_frame upc_frame; 735 int crit_count = 0; 736 737 /* 738 * If we are a virtual kernel running an emulated user process 739 * context, switch back to the virtual kernel context before 740 * trying to post the signal. 741 */ 742 if (lp->lwp_vkernel && lp->lwp_vkernel->ve) { 743 lp->lwp_md.md_regs->tf_trapno = 0; 744 vkernel_trap(lp, lp->lwp_md.md_regs); 745 } 746 747 /* 748 * Get the upcall data structure 749 */ 750 if (copyin(lp->lwp_upcall, &upcall, sizeof(upcall)) || 751 copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int)) 752 ) { 753 vu->vu_pending = 0; 754 kprintf("bad upcall address\n"); 755 return; 756 } 757 758 /* 759 * If the data structure is already marked pending or has a critical 760 * section count, mark the data structure as pending and return 761 * without doing an upcall. vu_pending is left set. 762 */ 763 if (upcall.upc_pending || crit_count >= vu->vu_pending) { 764 if (upcall.upc_pending < vu->vu_pending) { 765 upcall.upc_pending = vu->vu_pending; 766 copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending, 767 sizeof(upcall.upc_pending)); 768 } 769 return; 770 } 771 772 /* 773 * We can run this upcall now, clear vu_pending. 774 * 775 * Bump our critical section count and set or clear the 776 * user pending flag depending on whether more upcalls are 777 * pending. The user will be responsible for calling 778 * upc_dispatch(-1) to process remaining upcalls. 779 */ 780 vu->vu_pending = 0; 781 upcall.upc_pending = morepending; 782 ++crit_count; 783 copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending, 784 sizeof(upcall.upc_pending)); 785 copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, 786 sizeof(int)); 787 788 /* 789 * Construct a stack frame and issue the upcall 790 */ 791 regs = lp->lwp_md.md_regs; 792 upc_frame.rax = regs->tf_rax; 793 upc_frame.rcx = regs->tf_rcx; 794 upc_frame.rdx = regs->tf_rdx; 795 upc_frame.flags = regs->tf_rflags; 796 upc_frame.oldip = regs->tf_rip; 797 if (copyout(&upc_frame, (void *)(regs->tf_rsp - sizeof(upc_frame)), 798 sizeof(upc_frame)) != 0) { 799 kprintf("bad stack on upcall\n"); 800 } else { 801 regs->tf_rax = (register_t)vu->vu_func; 802 regs->tf_rcx = (register_t)vu->vu_data; 803 regs->tf_rdx = (register_t)lp->lwp_upcall; 804 regs->tf_rip = (register_t)vu->vu_ctx; 805 regs->tf_rsp -= sizeof(upc_frame); 806 } 807 } 808 809 /* 810 * fetchupcall occurs in the context of a system call, which means that 811 * we have to return EJUSTRETURN in order to prevent eax and edx from 812 * being overwritten by the syscall return value. 813 * 814 * if vu is not NULL we return the new context in %edx, the new data in %ecx, 815 * and the function pointer in %eax. 816 */ 817 int 818 fetchupcall(struct vmupcall *vu, int morepending, void *rsp) 819 { 820 struct upc_frame upc_frame; 821 struct lwp *lp = curthread->td_lwp; 822 struct trapframe *regs; 823 int error; 824 struct upcall upcall; 825 int crit_count; 826 827 regs = lp->lwp_md.md_regs; 828 829 error = copyout(&morepending, &lp->lwp_upcall->upc_pending, sizeof(int)); 830 if (error == 0) { 831 if (vu) { 832 /* 833 * This jumps us to the next ready context. 834 */ 835 vu->vu_pending = 0; 836 error = copyin(lp->lwp_upcall, &upcall, sizeof(upcall)); 837 crit_count = 0; 838 if (error == 0) 839 error = copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int)); 840 ++crit_count; 841 if (error == 0) 842 error = copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, sizeof(int)); 843 regs->tf_rax = (register_t)vu->vu_func; 844 regs->tf_rcx = (register_t)vu->vu_data; 845 regs->tf_rdx = (register_t)lp->lwp_upcall; 846 regs->tf_rip = (register_t)vu->vu_ctx; 847 regs->tf_rsp = (register_t)rsp; 848 } else { 849 /* 850 * This returns us to the originally interrupted code. 851 */ 852 error = copyin(rsp, &upc_frame, sizeof(upc_frame)); 853 regs->tf_rax = upc_frame.rax; 854 regs->tf_rcx = upc_frame.rcx; 855 regs->tf_rdx = upc_frame.rdx; 856 regs->tf_rflags = (regs->tf_rflags & ~PSL_USERCHANGE) | 857 (upc_frame.flags & PSL_USERCHANGE); 858 regs->tf_rip = upc_frame.oldip; 859 regs->tf_rsp = (register_t)((char *)rsp + sizeof(upc_frame)); 860 } 861 } 862 if (error == 0) 863 error = EJUSTRETURN; 864 return(error); 865 } 866 867 /* 868 * Machine dependent boot() routine 869 * 870 * I haven't seen anything to put here yet 871 * Possibly some stuff might be grafted back here from boot() 872 */ 873 void 874 cpu_boot(int howto) 875 { 876 } 877 878 /* 879 * Shutdown the CPU as much as possible 880 */ 881 void 882 cpu_halt(void) 883 { 884 for (;;) 885 __asm__ __volatile("hlt"); 886 } 887 888 /* 889 * cpu_idle() represents the idle LWKT. You cannot return from this function 890 * (unless you want to blow things up!). Instead we look for runnable threads 891 * and loop or halt as appropriate. Giant is not held on entry to the thread. 892 * 893 * The main loop is entered with a critical section held, we must release 894 * the critical section before doing anything else. lwkt_switch() will 895 * check for pending interrupts due to entering and exiting its own 896 * critical section. 897 * 898 * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up. 899 * However, there are cases where the idlethread will be entered with 900 * the possibility that no IPI will occur and in such cases 901 * lwkt_switch() sets TDF_IDLE_NOHLT. 902 * 903 * NOTE: cpu_idle_hlt again defaults to 2 (use ACPI sleep states). Set to 904 * 1 to just use hlt and for debugging purposes. 905 */ 906 static int cpu_idle_hlt = 2; 907 static int cpu_idle_hltcnt; 908 static int cpu_idle_spincnt; 909 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW, 910 &cpu_idle_hlt, 0, "Idle loop HLT enable"); 911 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hltcnt, CTLFLAG_RW, 912 &cpu_idle_hltcnt, 0, "Idle loop entry halts"); 913 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_spincnt, CTLFLAG_RW, 914 &cpu_idle_spincnt, 0, "Idle loop entry spins"); 915 916 static void 917 cpu_idle_default_hook(void) 918 { 919 /* 920 * We must guarentee that hlt is exactly the instruction 921 * following the sti. 922 */ 923 __asm __volatile("sti; hlt"); 924 } 925 926 /* Other subsystems (e.g., ACPI) can hook this later. */ 927 void (*cpu_idle_hook)(void) = cpu_idle_default_hook; 928 929 void 930 cpu_idle(void) 931 { 932 struct thread *td = curthread; 933 934 crit_exit(); 935 KKASSERT(td->td_critcount == 0); 936 for (;;) { 937 /* 938 * See if there are any LWKTs ready to go. 939 */ 940 lwkt_switch(); 941 942 /* 943 * If we are going to halt call splz unconditionally after 944 * CLIing to catch any interrupt races. Note that we are 945 * at SPL0 and interrupts are enabled. 946 */ 947 if (cpu_idle_hlt && 948 (td->td_gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) { 949 __asm __volatile("cli"); 950 splz(); 951 if ((td->td_gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) { 952 if (cpu_idle_hlt == 1) 953 cpu_idle_default_hook(); 954 else 955 cpu_idle_hook(); 956 } 957 __asm __volatile("sti"); 958 ++cpu_idle_hltcnt; 959 } else { 960 splz(); 961 __asm __volatile("sti"); 962 ++cpu_idle_spincnt; 963 } 964 } 965 } 966 967 #ifdef SMP 968 969 /* 970 * This routine is called if a spinlock has been held through the 971 * exponential backoff period and is seriously contested. On a real cpu 972 * we let it spin. 973 */ 974 void 975 cpu_spinlock_contested(void) 976 { 977 cpu_pause(); 978 } 979 980 #endif 981 982 /* 983 * Clear registers on exec 984 */ 985 void 986 exec_setregs(u_long entry, u_long stack, u_long ps_strings) 987 { 988 struct thread *td = curthread; 989 struct lwp *lp = td->td_lwp; 990 struct pcb *pcb = td->td_pcb; 991 struct trapframe *regs = lp->lwp_md.md_regs; 992 993 /* was i386_user_cleanup() in NetBSD */ 994 user_ldt_free(pcb); 995 996 bzero((char *)regs, sizeof(struct trapframe)); 997 regs->tf_rip = entry; 998 regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; /* align the stack */ 999 regs->tf_rdi = stack; /* argv */ 1000 regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); 1001 regs->tf_ss = _udatasel; 1002 regs->tf_cs = _ucodesel; 1003 regs->tf_rbx = ps_strings; 1004 1005 /* 1006 * Reset the hardware debug registers if they were in use. 1007 * They won't have any meaning for the newly exec'd process. 1008 */ 1009 if (pcb->pcb_flags & PCB_DBREGS) { 1010 pcb->pcb_dr0 = 0; 1011 pcb->pcb_dr1 = 0; 1012 pcb->pcb_dr2 = 0; 1013 pcb->pcb_dr3 = 0; 1014 pcb->pcb_dr6 = 0; 1015 pcb->pcb_dr7 = 0; /* JG set bit 10? */ 1016 if (pcb == td->td_pcb) { 1017 /* 1018 * Clear the debug registers on the running 1019 * CPU, otherwise they will end up affecting 1020 * the next process we switch to. 1021 */ 1022 reset_dbregs(); 1023 } 1024 pcb->pcb_flags &= ~PCB_DBREGS; 1025 } 1026 1027 /* 1028 * Initialize the math emulator (if any) for the current process. 1029 * Actually, just clear the bit that says that the emulator has 1030 * been initialized. Initialization is delayed until the process 1031 * traps to the emulator (if it is done at all) mainly because 1032 * emulators don't provide an entry point for initialization. 1033 */ 1034 pcb->pcb_flags &= ~FP_SOFTFP; 1035 1036 /* 1037 * NOTE: do not set CR0_TS here. npxinit() must do it after clearing 1038 * gd_npxthread. Otherwise a preemptive interrupt thread 1039 * may panic in npxdna(). 1040 */ 1041 crit_enter(); 1042 load_cr0(rcr0() | CR0_MP); 1043 1044 /* 1045 * NOTE: The MSR values must be correct so we can return to 1046 * userland. gd_user_fs/gs must be correct so the switch 1047 * code knows what the current MSR values are. 1048 */ 1049 pcb->pcb_fsbase = 0; /* Values loaded from PCB on switch */ 1050 pcb->pcb_gsbase = 0; 1051 mdcpu->gd_user_fs = 0; /* Cache of current MSR values */ 1052 mdcpu->gd_user_gs = 0; 1053 wrmsr(MSR_FSBASE, 0); /* Set MSR values for return to userland */ 1054 wrmsr(MSR_KGSBASE, 0); 1055 1056 /* Initialize the npx (if any) for the current process. */ 1057 npxinit(__INITIAL_NPXCW__); 1058 crit_exit(); 1059 1060 pcb->pcb_ds = _udatasel; 1061 pcb->pcb_es = _udatasel; 1062 pcb->pcb_fs = _udatasel; 1063 pcb->pcb_gs = _udatasel; 1064 } 1065 1066 void 1067 cpu_setregs(void) 1068 { 1069 register_t cr0; 1070 1071 cr0 = rcr0(); 1072 cr0 |= CR0_NE; /* Done by npxinit() */ 1073 cr0 |= CR0_MP | CR0_TS; /* Done at every execve() too. */ 1074 cr0 |= CR0_WP | CR0_AM; 1075 load_cr0(cr0); 1076 load_gs(_udatasel); 1077 } 1078 1079 static int 1080 sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS) 1081 { 1082 int error; 1083 error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, 1084 req); 1085 if (!error && req->newptr) 1086 resettodr(); 1087 return (error); 1088 } 1089 1090 SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, 1091 &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); 1092 1093 SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set, 1094 CTLFLAG_RW, &disable_rtc_set, 0, ""); 1095 1096 #if JG 1097 SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo, 1098 CTLFLAG_RD, &bootinfo, bootinfo, ""); 1099 #endif 1100 1101 SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock, 1102 CTLFLAG_RW, &wall_cmos_clock, 0, ""); 1103 1104 extern u_long bootdev; /* not a cdev_t - encoding is different */ 1105 SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, 1106 CTLFLAG_RD, &bootdev, 0, "Boot device (not in cdev_t format)"); 1107 1108 /* 1109 * Initialize 386 and configure to run kernel 1110 */ 1111 1112 /* 1113 * Initialize segments & interrupt table 1114 */ 1115 1116 int _default_ldt; 1117 struct user_segment_descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ 1118 static struct gate_descriptor idt0[NIDT]; 1119 struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ 1120 #if JG 1121 union descriptor ldt[NLDT]; /* local descriptor table */ 1122 #endif 1123 1124 /* table descriptors - used to load tables by cpu */ 1125 struct region_descriptor r_gdt, r_idt; 1126 1127 #if defined(I586_CPU) && !defined(NO_F00F_HACK) 1128 extern int has_f00f_bug; 1129 #endif 1130 1131 /* JG proc0paddr is a virtual address */ 1132 void *proc0paddr; 1133 /* JG alignment? */ 1134 char proc0paddr_buff[LWKT_THREAD_STACK]; 1135 1136 1137 /* software prototypes -- in more palatable form */ 1138 struct soft_segment_descriptor gdt_segs[] = { 1139 /* GNULL_SEL 0 Null Descriptor */ 1140 { 0x0, /* segment base address */ 1141 0x0, /* length */ 1142 0, /* segment type */ 1143 0, /* segment descriptor priority level */ 1144 0, /* segment descriptor present */ 1145 0, /* long */ 1146 0, /* default 32 vs 16 bit size */ 1147 0 /* limit granularity (byte/page units)*/ }, 1148 /* GCODE_SEL 1 Code Descriptor for kernel */ 1149 { 0x0, /* segment base address */ 1150 0xfffff, /* length - all address space */ 1151 SDT_MEMERA, /* segment type */ 1152 SEL_KPL, /* segment descriptor priority level */ 1153 1, /* segment descriptor present */ 1154 1, /* long */ 1155 0, /* default 32 vs 16 bit size */ 1156 1 /* limit granularity (byte/page units)*/ }, 1157 /* GDATA_SEL 2 Data Descriptor for kernel */ 1158 { 0x0, /* segment base address */ 1159 0xfffff, /* length - all address space */ 1160 SDT_MEMRWA, /* segment type */ 1161 SEL_KPL, /* segment descriptor priority level */ 1162 1, /* segment descriptor present */ 1163 1, /* long */ 1164 0, /* default 32 vs 16 bit size */ 1165 1 /* limit granularity (byte/page units)*/ }, 1166 /* GUCODE32_SEL 3 32 bit Code Descriptor for user */ 1167 { 0x0, /* segment base address */ 1168 0xfffff, /* length - all address space */ 1169 SDT_MEMERA, /* segment type */ 1170 SEL_UPL, /* segment descriptor priority level */ 1171 1, /* segment descriptor present */ 1172 0, /* long */ 1173 1, /* default 32 vs 16 bit size */ 1174 1 /* limit granularity (byte/page units)*/ }, 1175 /* GUDATA_SEL 4 32/64 bit Data Descriptor for user */ 1176 { 0x0, /* segment base address */ 1177 0xfffff, /* length - all address space */ 1178 SDT_MEMRWA, /* segment type */ 1179 SEL_UPL, /* segment descriptor priority level */ 1180 1, /* segment descriptor present */ 1181 0, /* long */ 1182 1, /* default 32 vs 16 bit size */ 1183 1 /* limit granularity (byte/page units)*/ }, 1184 /* GUCODE_SEL 5 64 bit Code Descriptor for user */ 1185 { 0x0, /* segment base address */ 1186 0xfffff, /* length - all address space */ 1187 SDT_MEMERA, /* segment type */ 1188 SEL_UPL, /* segment descriptor priority level */ 1189 1, /* segment descriptor present */ 1190 1, /* long */ 1191 0, /* default 32 vs 16 bit size */ 1192 1 /* limit granularity (byte/page units)*/ }, 1193 /* GPROC0_SEL 6 Proc 0 Tss Descriptor */ 1194 { 1195 0x0, /* segment base address */ 1196 sizeof(struct x86_64tss)-1,/* length - all address space */ 1197 SDT_SYSTSS, /* segment type */ 1198 SEL_KPL, /* segment descriptor priority level */ 1199 1, /* segment descriptor present */ 1200 0, /* long */ 1201 0, /* unused - default 32 vs 16 bit size */ 1202 0 /* limit granularity (byte/page units)*/ }, 1203 /* Actually, the TSS is a system descriptor which is double size */ 1204 { 0x0, /* segment base address */ 1205 0x0, /* length */ 1206 0, /* segment type */ 1207 0, /* segment descriptor priority level */ 1208 0, /* segment descriptor present */ 1209 0, /* long */ 1210 0, /* default 32 vs 16 bit size */ 1211 0 /* limit granularity (byte/page units)*/ }, 1212 /* GUGS32_SEL 8 32 bit GS Descriptor for user */ 1213 { 0x0, /* segment base address */ 1214 0xfffff, /* length - all address space */ 1215 SDT_MEMRWA, /* segment type */ 1216 SEL_UPL, /* segment descriptor priority level */ 1217 1, /* segment descriptor present */ 1218 0, /* long */ 1219 1, /* default 32 vs 16 bit size */ 1220 1 /* limit granularity (byte/page units)*/ }, 1221 }; 1222 1223 void 1224 setidt(int idx, inthand_t *func, int typ, int dpl, int ist) 1225 { 1226 struct gate_descriptor *ip; 1227 1228 ip = idt + idx; 1229 ip->gd_looffset = (uintptr_t)func; 1230 ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL); 1231 ip->gd_ist = ist; 1232 ip->gd_xx = 0; 1233 ip->gd_type = typ; 1234 ip->gd_dpl = dpl; 1235 ip->gd_p = 1; 1236 ip->gd_hioffset = ((uintptr_t)func)>>16 ; 1237 } 1238 1239 #define IDTVEC(name) __CONCAT(X,name) 1240 1241 extern inthand_t 1242 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), 1243 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), 1244 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), 1245 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), 1246 IDTVEC(xmm), IDTVEC(dblfault), 1247 IDTVEC(fast_syscall), IDTVEC(fast_syscall32); 1248 1249 #ifdef DEBUG_INTERRUPTS 1250 extern inthand_t *Xrsvdary[256]; 1251 #endif 1252 1253 void 1254 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd) 1255 { 1256 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; 1257 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; 1258 ssd->ssd_type = sd->sd_type; 1259 ssd->ssd_dpl = sd->sd_dpl; 1260 ssd->ssd_p = sd->sd_p; 1261 ssd->ssd_def32 = sd->sd_def32; 1262 ssd->ssd_gran = sd->sd_gran; 1263 } 1264 1265 void 1266 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd) 1267 { 1268 1269 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 1270 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff; 1271 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 1272 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 1273 sd->sd_type = ssd->ssd_type; 1274 sd->sd_dpl = ssd->ssd_dpl; 1275 sd->sd_p = ssd->ssd_p; 1276 sd->sd_long = ssd->ssd_long; 1277 sd->sd_def32 = ssd->ssd_def32; 1278 sd->sd_gran = ssd->ssd_gran; 1279 } 1280 1281 void 1282 ssdtosyssd(struct soft_segment_descriptor *ssd, 1283 struct system_segment_descriptor *sd) 1284 { 1285 1286 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 1287 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful; 1288 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 1289 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 1290 sd->sd_type = ssd->ssd_type; 1291 sd->sd_dpl = ssd->ssd_dpl; 1292 sd->sd_p = ssd->ssd_p; 1293 sd->sd_gran = ssd->ssd_gran; 1294 } 1295 1296 u_int basemem; 1297 1298 /* 1299 * Populate the (physmap) array with base/bound pairs describing the 1300 * available physical memory in the system, then test this memory and 1301 * build the phys_avail array describing the actually-available memory. 1302 * 1303 * If we cannot accurately determine the physical memory map, then use 1304 * value from the 0xE801 call, and failing that, the RTC. 1305 * 1306 * Total memory size may be set by the kernel environment variable 1307 * hw.physmem or the compile-time define MAXMEM. 1308 * 1309 * XXX first should be vm_paddr_t. 1310 */ 1311 static void 1312 getmemsize(caddr_t kmdp, u_int64_t first) 1313 { 1314 int i, off, physmap_idx, pa_indx, da_indx; 1315 vm_paddr_t pa, physmap[PHYSMAP_SIZE]; 1316 u_long physmem_tunable; 1317 pt_entry_t *pte; 1318 struct bios_smap *smapbase, *smap, *smapend; 1319 u_int32_t smapsize; 1320 quad_t dcons_addr, dcons_size; 1321 1322 bzero(physmap, sizeof(physmap)); 1323 basemem = 0; 1324 physmap_idx = 0; 1325 1326 /* 1327 * get memory map from INT 15:E820, kindly supplied by the loader. 1328 * 1329 * subr_module.c says: 1330 * "Consumer may safely assume that size value precedes data." 1331 * ie: an int32_t immediately precedes smap. 1332 */ 1333 smapbase = (struct bios_smap *)preload_search_info(kmdp, 1334 MODINFO_METADATA | MODINFOMD_SMAP); 1335 if (smapbase == NULL) 1336 panic("No BIOS smap info from loader!"); 1337 1338 smapsize = *((u_int32_t *)smapbase - 1); 1339 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); 1340 1341 for (smap = smapbase; smap < smapend; smap++) { 1342 if (boothowto & RB_VERBOSE) 1343 kprintf("SMAP type=%02x base=%016lx len=%016lx\n", 1344 smap->type, smap->base, smap->length); 1345 1346 if (smap->type != SMAP_TYPE_MEMORY) 1347 continue; 1348 1349 if (smap->length == 0) 1350 continue; 1351 1352 for (i = 0; i <= physmap_idx; i += 2) { 1353 if (smap->base < physmap[i + 1]) { 1354 if (boothowto & RB_VERBOSE) { 1355 kprintf("Overlapping or non-monotonic " 1356 "memory region, ignoring " 1357 "second region\n"); 1358 } 1359 continue; 1360 } 1361 } 1362 Realmem += smap->length; 1363 1364 if (smap->base == physmap[physmap_idx + 1]) { 1365 physmap[physmap_idx + 1] += smap->length; 1366 continue; 1367 } 1368 1369 physmap_idx += 2; 1370 if (physmap_idx == PHYSMAP_SIZE) { 1371 kprintf("Too many segments in the physical " 1372 "address map, giving up\n"); 1373 break; 1374 } 1375 physmap[physmap_idx] = smap->base; 1376 physmap[physmap_idx + 1] = smap->base + smap->length; 1377 } 1378 1379 /* 1380 * Find the 'base memory' segment for SMP 1381 */ 1382 basemem = 0; 1383 for (i = 0; i <= physmap_idx; i += 2) { 1384 if (physmap[i] == 0x00000000) { 1385 basemem = physmap[i + 1] / 1024; 1386 break; 1387 } 1388 } 1389 if (basemem == 0) 1390 panic("BIOS smap did not include a basemem segment!"); 1391 1392 #ifdef SMP 1393 /* make hole for AP bootstrap code */ 1394 physmap[1] = mp_bootaddress(physmap[1] / 1024); 1395 1396 /* Save EBDA address, if any */ 1397 ebda_addr = (u_long)(*(u_short *)(KERNBASE + 0x40e)); 1398 ebda_addr <<= 4; 1399 #endif 1400 1401 /* 1402 * Maxmem isn't the "maximum memory", it's one larger than the 1403 * highest page of the physical address space. It should be 1404 * called something like "Maxphyspage". We may adjust this 1405 * based on ``hw.physmem'' and the results of the memory test. 1406 */ 1407 Maxmem = atop(physmap[physmap_idx + 1]); 1408 1409 #ifdef MAXMEM 1410 Maxmem = MAXMEM / 4; 1411 #endif 1412 1413 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) 1414 Maxmem = atop(physmem_tunable); 1415 1416 /* 1417 * Don't allow MAXMEM or hw.physmem to extend the amount of memory 1418 * in the system. 1419 */ 1420 if (Maxmem > atop(physmap[physmap_idx + 1])) 1421 Maxmem = atop(physmap[physmap_idx + 1]); 1422 1423 /* 1424 * 1425 */ 1426 if (Maxmem > atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS)) { 1427 kprintf("Limiting Maxmem due to DMAP size\n"); 1428 Maxmem = atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS); 1429 } 1430 1431 if (atop(physmap[physmap_idx + 1]) != Maxmem && 1432 (boothowto & RB_VERBOSE)) 1433 kprintf("Physical memory use set to %ldK\n", Maxmem * 4); 1434 1435 /* call pmap initialization to make new kernel address space */ 1436 pmap_bootstrap(&first); 1437 1438 /* 1439 * Size up each available chunk of physical memory. 1440 */ 1441 physmap[0] = PAGE_SIZE; /* mask off page 0 */ 1442 pa_indx = 0; 1443 da_indx = 1; 1444 phys_avail[pa_indx++] = physmap[0]; 1445 phys_avail[pa_indx] = physmap[0]; 1446 dump_avail[da_indx] = physmap[0]; 1447 pte = CMAP1; 1448 1449 /* 1450 * Get dcons buffer address 1451 */ 1452 if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 || 1453 kgetenv_quad("dcons.size", &dcons_size) == 0) 1454 dcons_addr = 0; 1455 1456 /* 1457 * physmap is in bytes, so when converting to page boundaries, 1458 * round up the start address and round down the end address. 1459 */ 1460 for (i = 0; i <= physmap_idx; i += 2) { 1461 vm_paddr_t end; 1462 1463 end = ptoa((vm_paddr_t)Maxmem); 1464 if (physmap[i + 1] < end) 1465 end = trunc_page(physmap[i + 1]); 1466 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { 1467 int tmp, page_bad, full; 1468 int *ptr = (int *)CADDR1; 1469 1470 full = FALSE; 1471 /* 1472 * block out kernel memory as not available. 1473 */ 1474 if (pa >= 0x100000 && pa < first) 1475 goto do_dump_avail; 1476 1477 /* 1478 * block out dcons buffer 1479 */ 1480 if (dcons_addr > 0 1481 && pa >= trunc_page(dcons_addr) 1482 && pa < dcons_addr + dcons_size) 1483 goto do_dump_avail; 1484 1485 page_bad = FALSE; 1486 1487 /* 1488 * map page into kernel: valid, read/write,non-cacheable 1489 */ 1490 *pte = pa | PG_V | PG_RW | PG_N; 1491 cpu_invltlb(); 1492 1493 tmp = *(int *)ptr; 1494 /* 1495 * Test for alternating 1's and 0's 1496 */ 1497 *(volatile int *)ptr = 0xaaaaaaaa; 1498 if (*(volatile int *)ptr != 0xaaaaaaaa) 1499 page_bad = TRUE; 1500 /* 1501 * Test for alternating 0's and 1's 1502 */ 1503 *(volatile int *)ptr = 0x55555555; 1504 if (*(volatile int *)ptr != 0x55555555) 1505 page_bad = TRUE; 1506 /* 1507 * Test for all 1's 1508 */ 1509 *(volatile int *)ptr = 0xffffffff; 1510 if (*(volatile int *)ptr != 0xffffffff) 1511 page_bad = TRUE; 1512 /* 1513 * Test for all 0's 1514 */ 1515 *(volatile int *)ptr = 0x0; 1516 if (*(volatile int *)ptr != 0x0) 1517 page_bad = TRUE; 1518 /* 1519 * Restore original value. 1520 */ 1521 *(int *)ptr = tmp; 1522 1523 /* 1524 * Adjust array of valid/good pages. 1525 */ 1526 if (page_bad == TRUE) 1527 continue; 1528 /* 1529 * If this good page is a continuation of the 1530 * previous set of good pages, then just increase 1531 * the end pointer. Otherwise start a new chunk. 1532 * Note that "end" points one higher than end, 1533 * making the range >= start and < end. 1534 * If we're also doing a speculative memory 1535 * test and we at or past the end, bump up Maxmem 1536 * so that we keep going. The first bad page 1537 * will terminate the loop. 1538 */ 1539 if (phys_avail[pa_indx] == pa) { 1540 phys_avail[pa_indx] += PAGE_SIZE; 1541 } else { 1542 pa_indx++; 1543 if (pa_indx == PHYS_AVAIL_ARRAY_END) { 1544 kprintf( 1545 "Too many holes in the physical address space, giving up\n"); 1546 pa_indx--; 1547 full = TRUE; 1548 goto do_dump_avail; 1549 } 1550 phys_avail[pa_indx++] = pa; /* start */ 1551 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ 1552 } 1553 physmem++; 1554 do_dump_avail: 1555 if (dump_avail[da_indx] == pa) { 1556 dump_avail[da_indx] += PAGE_SIZE; 1557 } else { 1558 da_indx++; 1559 if (da_indx == DUMP_AVAIL_ARRAY_END) { 1560 da_indx--; 1561 goto do_next; 1562 } 1563 dump_avail[da_indx++] = pa; /* start */ 1564 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ 1565 } 1566 do_next: 1567 if (full) 1568 break; 1569 } 1570 } 1571 *pte = 0; 1572 cpu_invltlb(); 1573 1574 /* 1575 * XXX 1576 * The last chunk must contain at least one page plus the message 1577 * buffer to avoid complicating other code (message buffer address 1578 * calculation, etc.). 1579 */ 1580 while (phys_avail[pa_indx - 1] + PAGE_SIZE + 1581 round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) { 1582 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); 1583 phys_avail[pa_indx--] = 0; 1584 phys_avail[pa_indx--] = 0; 1585 } 1586 1587 Maxmem = atop(phys_avail[pa_indx]); 1588 1589 /* Trim off space for the message buffer. */ 1590 phys_avail[pa_indx] -= round_page(MSGBUF_SIZE); 1591 1592 avail_end = phys_avail[pa_indx]; 1593 1594 /* Map the message buffer. */ 1595 for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) 1596 pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + 1597 off); 1598 } 1599 1600 #ifdef SMP 1601 #ifdef APIC_IO 1602 int apic_io_enable = 1; /* Enabled by default for kernels compiled w/APIC_IO */ 1603 #else 1604 int apic_io_enable = 0; /* Disabled by default for kernels compiled without */ 1605 #endif 1606 TUNABLE_INT("hw.apic_io_enable", &apic_io_enable); 1607 extern struct machintr_abi MachIntrABI_APIC; 1608 #endif 1609 1610 extern struct machintr_abi MachIntrABI_ICU; 1611 struct machintr_abi MachIntrABI; 1612 1613 /* 1614 * IDT VECTORS: 1615 * 0 Divide by zero 1616 * 1 Debug 1617 * 2 NMI 1618 * 3 BreakPoint 1619 * 4 OverFlow 1620 * 5 Bound-Range 1621 * 6 Invalid OpCode 1622 * 7 Device Not Available (x87) 1623 * 8 Double-Fault 1624 * 9 Coprocessor Segment overrun (unsupported, reserved) 1625 * 10 Invalid-TSS 1626 * 11 Segment not present 1627 * 12 Stack 1628 * 13 General Protection 1629 * 14 Page Fault 1630 * 15 Reserved 1631 * 16 x87 FP Exception pending 1632 * 17 Alignment Check 1633 * 18 Machine Check 1634 * 19 SIMD floating point 1635 * 20-31 reserved 1636 * 32-255 INTn/external sources 1637 */ 1638 u_int64_t 1639 hammer_time(u_int64_t modulep, u_int64_t physfree) 1640 { 1641 caddr_t kmdp; 1642 int gsel_tss, x; 1643 #if JG 1644 int metadata_missing, off; 1645 #endif 1646 struct mdglobaldata *gd; 1647 u_int64_t msr; 1648 1649 /* 1650 * Prevent lowering of the ipl if we call tsleep() early. 1651 */ 1652 gd = &CPU_prvspace[0].mdglobaldata; 1653 bzero(gd, sizeof(*gd)); 1654 1655 /* 1656 * Note: on both UP and SMP curthread must be set non-NULL 1657 * early in the boot sequence because the system assumes 1658 * that 'curthread' is never NULL. 1659 */ 1660 1661 gd->mi.gd_curthread = &thread0; 1662 thread0.td_gd = &gd->mi; 1663 1664 atdevbase = ISA_HOLE_START + PTOV_OFFSET; 1665 1666 #if JG 1667 metadata_missing = 0; 1668 if (bootinfo.bi_modulep) { 1669 preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; 1670 preload_bootstrap_relocate(KERNBASE); 1671 } else { 1672 metadata_missing = 1; 1673 } 1674 if (bootinfo.bi_envp) 1675 kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; 1676 #endif 1677 1678 preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET); 1679 preload_bootstrap_relocate(PTOV_OFFSET); 1680 kmdp = preload_search_by_type("elf kernel"); 1681 if (kmdp == NULL) 1682 kmdp = preload_search_by_type("elf64 kernel"); 1683 boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); 1684 kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET; 1685 #ifdef DDB 1686 ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); 1687 ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); 1688 #endif 1689 1690 /* 1691 * Setup MachIntrABI 1692 * XXX: Where is the correct place for it? 1693 */ 1694 MachIntrABI = MachIntrABI_ICU; 1695 #ifdef SMP 1696 TUNABLE_INT_FETCH("hw.apic_io_enable", &apic_io_enable); 1697 if (apic_io_enable) 1698 MachIntrABI = MachIntrABI_APIC; 1699 #endif 1700 1701 /* 1702 * start with one cpu. Note: with one cpu, ncpus2_shift, ncpus2_mask, 1703 * and ncpus_fit_mask remain 0. 1704 */ 1705 ncpus = 1; 1706 ncpus2 = 1; 1707 ncpus_fit = 1; 1708 /* Init basic tunables, hz etc */ 1709 init_param1(); 1710 1711 /* 1712 * make gdt memory segments 1713 */ 1714 gdt_segs[GPROC0_SEL].ssd_base = 1715 (uintptr_t) &CPU_prvspace[0].mdglobaldata.gd_common_tss; 1716 1717 gd->mi.gd_prvspace = &CPU_prvspace[0]; 1718 1719 for (x = 0; x < NGDT; x++) { 1720 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1)) 1721 ssdtosd(&gdt_segs[x], &gdt[x]); 1722 } 1723 ssdtosyssd(&gdt_segs[GPROC0_SEL], 1724 (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); 1725 1726 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; 1727 r_gdt.rd_base = (long) gdt; 1728 lgdt(&r_gdt); 1729 1730 wrmsr(MSR_FSBASE, 0); /* User value */ 1731 wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi); 1732 wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */ 1733 1734 mi_gdinit(&gd->mi, 0); 1735 cpu_gdinit(gd, 0); 1736 proc0paddr = proc0paddr_buff; 1737 mi_proc0init(&gd->mi, proc0paddr); 1738 safepri = TDPRI_MAX; 1739 1740 /* spinlocks and the BGL */ 1741 init_locks(); 1742 1743 /* exceptions */ 1744 for (x = 0; x < NIDT; x++) 1745 setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); 1746 setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0); 1747 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0); 1748 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1); 1749 setidt(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0); 1750 setidt(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0); 1751 setidt(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0); 1752 setidt(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0); 1753 setidt(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0); 1754 setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1); 1755 setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0); 1756 setidt(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0); 1757 setidt(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0); 1758 setidt(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0); 1759 setidt(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0); 1760 setidt(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0); 1761 setidt(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0); 1762 setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0); 1763 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0); 1764 setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0); 1765 1766 r_idt.rd_limit = sizeof(idt0) - 1; 1767 r_idt.rd_base = (long) idt; 1768 lidt(&r_idt); 1769 1770 /* 1771 * Initialize the console before we print anything out. 1772 */ 1773 cninit(); 1774 1775 #if JG 1776 if (metadata_missing) 1777 kprintf("WARNING: loader(8) metadata is missing!\n"); 1778 #endif 1779 1780 #if NISA >0 1781 isa_defaultirq(); 1782 #endif 1783 rand_initialize(); 1784 1785 #ifdef DDB 1786 kdb_init(); 1787 if (boothowto & RB_KDB) 1788 Debugger("Boot flags requested debugger"); 1789 #endif 1790 1791 #if JG 1792 finishidentcpu(); /* Final stage of CPU initialization */ 1793 setidt(6, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 1794 setidt(13, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 1795 #endif 1796 identify_cpu(); /* Final stage of CPU initialization */ 1797 initializecpu(); /* Initialize CPU registers */ 1798 1799 /* make an initial tss so cpu can get interrupt stack on syscall! */ 1800 gd->gd_common_tss.tss_rsp0 = 1801 (register_t)(thread0.td_kstack + 1802 KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb)); 1803 /* Ensure the stack is aligned to 16 bytes */ 1804 gd->gd_common_tss.tss_rsp0 &= ~(register_t)0xF; 1805 1806 /* double fault stack */ 1807 gd->gd_common_tss.tss_ist1 = 1808 (long)&gd->mi.gd_prvspace->idlestack[ 1809 sizeof(gd->mi.gd_prvspace->idlestack)]; 1810 1811 /* Set the IO permission bitmap (empty due to tss seg limit) */ 1812 gd->gd_common_tss.tss_iobase = sizeof(struct x86_64tss); 1813 1814 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 1815 gd->gd_tss_gdt = &gdt[GPROC0_SEL]; 1816 gd->gd_common_tssd = *gd->gd_tss_gdt; 1817 ltr(gsel_tss); 1818 1819 /* Set up the fast syscall stuff */ 1820 msr = rdmsr(MSR_EFER) | EFER_SCE; 1821 wrmsr(MSR_EFER, msr); 1822 wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall)); 1823 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); 1824 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | 1825 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); 1826 wrmsr(MSR_STAR, msr); 1827 wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D); 1828 1829 getmemsize(kmdp, physfree); 1830 init_param2(physmem); 1831 1832 /* now running on new page tables, configured,and u/iom is accessible */ 1833 1834 /* Map the message buffer. */ 1835 #if JG 1836 for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) 1837 pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off); 1838 #endif 1839 1840 msgbufinit(msgbufp, MSGBUF_SIZE); 1841 1842 1843 /* transfer to user mode */ 1844 1845 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); 1846 _udatasel = GSEL(GUDATA_SEL, SEL_UPL); 1847 _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL); 1848 1849 load_ds(_udatasel); 1850 load_es(_udatasel); 1851 load_fs(_udatasel); 1852 1853 /* setup proc 0's pcb */ 1854 thread0.td_pcb->pcb_flags = 0; 1855 thread0.td_pcb->pcb_cr3 = KPML4phys; 1856 thread0.td_pcb->pcb_ext = 0; 1857 lwp0.lwp_md.md_regs = &proc0_tf; /* XXX needed? */ 1858 1859 /* Location of kernel stack for locore */ 1860 return ((u_int64_t)thread0.td_pcb); 1861 } 1862 1863 /* 1864 * Initialize machine-dependant portions of the global data structure. 1865 * Note that the global data area and cpu0's idlestack in the private 1866 * data space were allocated in locore. 1867 * 1868 * Note: the idlethread's cpl is 0 1869 * 1870 * WARNING! Called from early boot, 'mycpu' may not work yet. 1871 */ 1872 void 1873 cpu_gdinit(struct mdglobaldata *gd, int cpu) 1874 { 1875 if (cpu) 1876 gd->mi.gd_curthread = &gd->mi.gd_idlethread; 1877 1878 lwkt_init_thread(&gd->mi.gd_idlethread, 1879 gd->mi.gd_prvspace->idlestack, 1880 sizeof(gd->mi.gd_prvspace->idlestack), 1881 0, &gd->mi); 1882 lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu); 1883 gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch; 1884 gd->mi.gd_idlethread.td_sp -= sizeof(void *); 1885 *(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore; 1886 } 1887 1888 int 1889 is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr) 1890 { 1891 if (saddr >= (vm_offset_t)&CPU_prvspace[0] && 1892 eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) { 1893 return (TRUE); 1894 } 1895 return (FALSE); 1896 } 1897 1898 struct globaldata * 1899 globaldata_find(int cpu) 1900 { 1901 KKASSERT(cpu >= 0 && cpu < ncpus); 1902 return(&CPU_prvspace[cpu].mdglobaldata.mi); 1903 } 1904 1905 #if defined(I586_CPU) && !defined(NO_F00F_HACK) 1906 static void f00f_hack(void *unused); 1907 SYSINIT(f00f_hack, SI_BOOT2_BIOS, SI_ORDER_ANY, f00f_hack, NULL); 1908 1909 static void 1910 f00f_hack(void *unused) 1911 { 1912 struct gate_descriptor *new_idt; 1913 vm_offset_t tmp; 1914 1915 if (!has_f00f_bug) 1916 return; 1917 1918 kprintf("Intel Pentium detected, installing workaround for F00F bug\n"); 1919 1920 r_idt.rd_limit = sizeof(idt0) - 1; 1921 1922 tmp = kmem_alloc(&kernel_map, PAGE_SIZE * 2); 1923 if (tmp == 0) 1924 panic("kmem_alloc returned 0"); 1925 if (((unsigned int)tmp & (PAGE_SIZE-1)) != 0) 1926 panic("kmem_alloc returned non-page-aligned memory"); 1927 /* Put the first seven entries in the lower page */ 1928 new_idt = (struct gate_descriptor*)(tmp + PAGE_SIZE - (7*8)); 1929 bcopy(idt, new_idt, sizeof(idt0)); 1930 r_idt.rd_base = (int)new_idt; 1931 lidt(&r_idt); 1932 idt = new_idt; 1933 if (vm_map_protect(&kernel_map, tmp, tmp + PAGE_SIZE, 1934 VM_PROT_READ, FALSE) != KERN_SUCCESS) 1935 panic("vm_map_protect failed"); 1936 return; 1937 } 1938 #endif /* defined(I586_CPU) && !NO_F00F_HACK */ 1939 1940 int 1941 ptrace_set_pc(struct lwp *lp, unsigned long addr) 1942 { 1943 lp->lwp_md.md_regs->tf_rip = addr; 1944 return (0); 1945 } 1946 1947 int 1948 ptrace_single_step(struct lwp *lp) 1949 { 1950 lp->lwp_md.md_regs->tf_rflags |= PSL_T; 1951 return (0); 1952 } 1953 1954 int 1955 fill_regs(struct lwp *lp, struct reg *regs) 1956 { 1957 struct trapframe *tp; 1958 1959 tp = lp->lwp_md.md_regs; 1960 bcopy(&tp->tf_rdi, ®s->r_rdi, sizeof(*regs)); 1961 return (0); 1962 } 1963 1964 int 1965 set_regs(struct lwp *lp, struct reg *regs) 1966 { 1967 struct trapframe *tp; 1968 1969 tp = lp->lwp_md.md_regs; 1970 if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) || 1971 !CS_SECURE(regs->r_cs)) 1972 return (EINVAL); 1973 bcopy(®s->r_rdi, &tp->tf_rdi, sizeof(*regs)); 1974 return (0); 1975 } 1976 1977 #ifndef CPU_DISABLE_SSE 1978 static void 1979 fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87) 1980 { 1981 struct env87 *penv_87 = &sv_87->sv_env; 1982 struct envxmm *penv_xmm = &sv_xmm->sv_env; 1983 int i; 1984 1985 /* FPU control/status */ 1986 penv_87->en_cw = penv_xmm->en_cw; 1987 penv_87->en_sw = penv_xmm->en_sw; 1988 penv_87->en_tw = penv_xmm->en_tw; 1989 penv_87->en_fip = penv_xmm->en_fip; 1990 penv_87->en_fcs = penv_xmm->en_fcs; 1991 penv_87->en_opcode = penv_xmm->en_opcode; 1992 penv_87->en_foo = penv_xmm->en_foo; 1993 penv_87->en_fos = penv_xmm->en_fos; 1994 1995 /* FPU registers */ 1996 for (i = 0; i < 8; ++i) 1997 sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc; 1998 1999 sv_87->sv_ex_sw = sv_xmm->sv_ex_sw; 2000 } 2001 2002 static void 2003 set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm) 2004 { 2005 struct env87 *penv_87 = &sv_87->sv_env; 2006 struct envxmm *penv_xmm = &sv_xmm->sv_env; 2007 int i; 2008 2009 /* FPU control/status */ 2010 penv_xmm->en_cw = penv_87->en_cw; 2011 penv_xmm->en_sw = penv_87->en_sw; 2012 penv_xmm->en_tw = penv_87->en_tw; 2013 penv_xmm->en_fip = penv_87->en_fip; 2014 penv_xmm->en_fcs = penv_87->en_fcs; 2015 penv_xmm->en_opcode = penv_87->en_opcode; 2016 penv_xmm->en_foo = penv_87->en_foo; 2017 penv_xmm->en_fos = penv_87->en_fos; 2018 2019 /* FPU registers */ 2020 for (i = 0; i < 8; ++i) 2021 sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i]; 2022 2023 sv_xmm->sv_ex_sw = sv_87->sv_ex_sw; 2024 } 2025 #endif /* CPU_DISABLE_SSE */ 2026 2027 int 2028 fill_fpregs(struct lwp *lp, struct fpreg *fpregs) 2029 { 2030 #ifndef CPU_DISABLE_SSE 2031 if (cpu_fxsr) { 2032 fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm, 2033 (struct save87 *)fpregs); 2034 return (0); 2035 } 2036 #endif /* CPU_DISABLE_SSE */ 2037 bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs); 2038 return (0); 2039 } 2040 2041 int 2042 set_fpregs(struct lwp *lp, struct fpreg *fpregs) 2043 { 2044 #ifndef CPU_DISABLE_SSE 2045 if (cpu_fxsr) { 2046 set_fpregs_xmm((struct save87 *)fpregs, 2047 &lp->lwp_thread->td_pcb->pcb_save.sv_xmm); 2048 return (0); 2049 } 2050 #endif /* CPU_DISABLE_SSE */ 2051 bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs); 2052 return (0); 2053 } 2054 2055 int 2056 fill_dbregs(struct lwp *lp, struct dbreg *dbregs) 2057 { 2058 if (lp == NULL) { 2059 dbregs->dr[0] = rdr0(); 2060 dbregs->dr[1] = rdr1(); 2061 dbregs->dr[2] = rdr2(); 2062 dbregs->dr[3] = rdr3(); 2063 dbregs->dr[4] = rdr4(); 2064 dbregs->dr[5] = rdr5(); 2065 dbregs->dr[6] = rdr6(); 2066 dbregs->dr[7] = rdr7(); 2067 } else { 2068 struct pcb *pcb; 2069 2070 pcb = lp->lwp_thread->td_pcb; 2071 dbregs->dr[0] = pcb->pcb_dr0; 2072 dbregs->dr[1] = pcb->pcb_dr1; 2073 dbregs->dr[2] = pcb->pcb_dr2; 2074 dbregs->dr[3] = pcb->pcb_dr3; 2075 dbregs->dr[4] = 0; 2076 dbregs->dr[5] = 0; 2077 dbregs->dr[6] = pcb->pcb_dr6; 2078 dbregs->dr[7] = pcb->pcb_dr7; 2079 } 2080 return (0); 2081 } 2082 2083 int 2084 set_dbregs(struct lwp *lp, struct dbreg *dbregs) 2085 { 2086 if (lp == NULL) { 2087 load_dr0(dbregs->dr[0]); 2088 load_dr1(dbregs->dr[1]); 2089 load_dr2(dbregs->dr[2]); 2090 load_dr3(dbregs->dr[3]); 2091 load_dr4(dbregs->dr[4]); 2092 load_dr5(dbregs->dr[5]); 2093 load_dr6(dbregs->dr[6]); 2094 load_dr7(dbregs->dr[7]); 2095 } else { 2096 struct pcb *pcb; 2097 struct ucred *ucred; 2098 int i; 2099 uint64_t mask1, mask2; 2100 2101 /* 2102 * Don't let an illegal value for dr7 get set. Specifically, 2103 * check for undefined settings. Setting these bit patterns 2104 * result in undefined behaviour and can lead to an unexpected 2105 * TRCTRAP. 2106 */ 2107 /* JG this loop looks unreadable */ 2108 /* Check 4 2-bit fields for invalid patterns. 2109 * These fields are R/Wi, for i = 0..3 2110 */ 2111 /* Is 10 in LENi allowed when running in compatibility mode? */ 2112 /* Pattern 10 in R/Wi might be used to indicate 2113 * breakpoint on I/O. Further analysis should be 2114 * carried to decide if it is safe and useful to 2115 * provide access to that capability 2116 */ 2117 for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4; 2118 i++, mask1 <<= 4, mask2 <<= 4) 2119 if ((dbregs->dr[7] & mask1) == mask2) 2120 return (EINVAL); 2121 2122 pcb = lp->lwp_thread->td_pcb; 2123 ucred = lp->lwp_proc->p_ucred; 2124 2125 /* 2126 * Don't let a process set a breakpoint that is not within the 2127 * process's address space. If a process could do this, it 2128 * could halt the system by setting a breakpoint in the kernel 2129 * (if ddb was enabled). Thus, we need to check to make sure 2130 * that no breakpoints are being enabled for addresses outside 2131 * process's address space, unless, perhaps, we were called by 2132 * uid 0. 2133 * 2134 * XXX - what about when the watched area of the user's 2135 * address space is written into from within the kernel 2136 * ... wouldn't that still cause a breakpoint to be generated 2137 * from within kernel mode? 2138 */ 2139 2140 if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) { 2141 if (dbregs->dr[7] & 0x3) { 2142 /* dr0 is enabled */ 2143 if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS) 2144 return (EINVAL); 2145 } 2146 2147 if (dbregs->dr[7] & (0x3<<2)) { 2148 /* dr1 is enabled */ 2149 if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS) 2150 return (EINVAL); 2151 } 2152 2153 if (dbregs->dr[7] & (0x3<<4)) { 2154 /* dr2 is enabled */ 2155 if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS) 2156 return (EINVAL); 2157 } 2158 2159 if (dbregs->dr[7] & (0x3<<6)) { 2160 /* dr3 is enabled */ 2161 if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS) 2162 return (EINVAL); 2163 } 2164 } 2165 2166 pcb->pcb_dr0 = dbregs->dr[0]; 2167 pcb->pcb_dr1 = dbregs->dr[1]; 2168 pcb->pcb_dr2 = dbregs->dr[2]; 2169 pcb->pcb_dr3 = dbregs->dr[3]; 2170 pcb->pcb_dr6 = dbregs->dr[6]; 2171 pcb->pcb_dr7 = dbregs->dr[7]; 2172 2173 pcb->pcb_flags |= PCB_DBREGS; 2174 } 2175 2176 return (0); 2177 } 2178 2179 /* 2180 * Return > 0 if a hardware breakpoint has been hit, and the 2181 * breakpoint was in user space. Return 0, otherwise. 2182 */ 2183 int 2184 user_dbreg_trap(void) 2185 { 2186 u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */ 2187 u_int64_t bp; /* breakpoint bits extracted from dr6 */ 2188 int nbp; /* number of breakpoints that triggered */ 2189 caddr_t addr[4]; /* breakpoint addresses */ 2190 int i; 2191 2192 dr7 = rdr7(); 2193 if ((dr7 & 0xff) == 0) { 2194 /* 2195 * all GE and LE bits in the dr7 register are zero, 2196 * thus the trap couldn't have been caused by the 2197 * hardware debug registers 2198 */ 2199 return 0; 2200 } 2201 2202 nbp = 0; 2203 dr6 = rdr6(); 2204 bp = dr6 & 0xf; 2205 2206 if (bp == 0) { 2207 /* 2208 * None of the breakpoint bits are set meaning this 2209 * trap was not caused by any of the debug registers 2210 */ 2211 return 0; 2212 } 2213 2214 /* 2215 * at least one of the breakpoints were hit, check to see 2216 * which ones and if any of them are user space addresses 2217 */ 2218 2219 if (bp & 0x01) { 2220 addr[nbp++] = (caddr_t)rdr0(); 2221 } 2222 if (bp & 0x02) { 2223 addr[nbp++] = (caddr_t)rdr1(); 2224 } 2225 if (bp & 0x04) { 2226 addr[nbp++] = (caddr_t)rdr2(); 2227 } 2228 if (bp & 0x08) { 2229 addr[nbp++] = (caddr_t)rdr3(); 2230 } 2231 2232 for (i=0; i<nbp; i++) { 2233 if (addr[i] < 2234 (caddr_t)VM_MAX_USER_ADDRESS) { 2235 /* 2236 * addr[i] is in user space 2237 */ 2238 return nbp; 2239 } 2240 } 2241 2242 /* 2243 * None of the breakpoints are in user space. 2244 */ 2245 return 0; 2246 } 2247 2248 2249 #ifndef DDB 2250 void 2251 Debugger(const char *msg) 2252 { 2253 kprintf("Debugger(\"%s\") called.\n", msg); 2254 } 2255 #endif /* no DDB */ 2256 2257 #ifdef DDB 2258 2259 /* 2260 * Provide inb() and outb() as functions. They are normally only 2261 * available as macros calling inlined functions, thus cannot be 2262 * called inside DDB. 2263 * 2264 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined. 2265 */ 2266 2267 #undef inb 2268 #undef outb 2269 2270 /* silence compiler warnings */ 2271 u_char inb(u_int); 2272 void outb(u_int, u_char); 2273 2274 u_char 2275 inb(u_int port) 2276 { 2277 u_char data; 2278 /* 2279 * We use %%dx and not %1 here because i/o is done at %dx and not at 2280 * %edx, while gcc generates inferior code (movw instead of movl) 2281 * if we tell it to load (u_short) port. 2282 */ 2283 __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); 2284 return (data); 2285 } 2286 2287 void 2288 outb(u_int port, u_char data) 2289 { 2290 u_char al; 2291 /* 2292 * Use an unnecessary assignment to help gcc's register allocator. 2293 * This make a large difference for gcc-1.40 and a tiny difference 2294 * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for 2295 * best results. gcc-2.6.0 can't handle this. 2296 */ 2297 al = data; 2298 __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); 2299 } 2300 2301 #endif /* DDB */ 2302 2303 2304 2305 #include "opt_cpu.h" 2306 2307 2308 /* 2309 * initialize all the SMP locks 2310 */ 2311 2312 /* critical region when masking or unmasking interupts */ 2313 struct spinlock_deprecated imen_spinlock; 2314 2315 /* critical region for old style disable_intr/enable_intr */ 2316 struct spinlock_deprecated mpintr_spinlock; 2317 2318 /* critical region around INTR() routines */ 2319 struct spinlock_deprecated intr_spinlock; 2320 2321 /* lock region used by kernel profiling */ 2322 struct spinlock_deprecated mcount_spinlock; 2323 2324 /* locks com (tty) data/hardware accesses: a FASTINTR() */ 2325 struct spinlock_deprecated com_spinlock; 2326 2327 /* lock regions around the clock hardware */ 2328 struct spinlock_deprecated clock_spinlock; 2329 2330 static void 2331 init_locks(void) 2332 { 2333 #ifdef SMP 2334 /* 2335 * Get the initial mplock with a count of 1 for the BSP. 2336 * This uses a LOGICAL cpu ID, ie BSP == 0. 2337 */ 2338 cpu_get_initial_mplock(); 2339 #endif 2340 /* DEPRECATED */ 2341 spin_lock_init(&mcount_spinlock); 2342 spin_lock_init(&intr_spinlock); 2343 spin_lock_init(&mpintr_spinlock); 2344 spin_lock_init(&imen_spinlock); 2345 spin_lock_init(&com_spinlock); 2346 spin_lock_init(&clock_spinlock); 2347 2348 /* our token pool needs to work early */ 2349 lwkt_token_pool_init(); 2350 } 2351 2352