1 /*- 2 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 3 * Copyright (c) 1992 Terrence R. Lambert. 4 * Copyright (c) 2003 Peter Wemm. 5 * Copyright (c) 2008 The DragonFly Project. 6 * All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * William Jolitz. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the University of 22 * California, Berkeley and its contributors. 23 * 4. Neither the name of the University nor the names of its contributors 24 * may be used to endorse or promote products derived from this software 25 * without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 37 * SUCH DAMAGE. 38 * 39 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 40 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $ 41 */ 42 43 //#include "use_npx.h" 44 #include "use_isa.h" 45 #include "opt_atalk.h" 46 #include "opt_compat.h" 47 #include "opt_cpu.h" 48 #include "opt_ddb.h" 49 #include "opt_directio.h" 50 #include "opt_inet.h" 51 #include "opt_ipx.h" 52 #include "opt_msgbuf.h" 53 #include "opt_swap.h" 54 #include "opt_apic.h" 55 56 #include <sys/param.h> 57 #include <sys/systm.h> 58 #include <sys/sysproto.h> 59 #include <sys/signalvar.h> 60 #include <sys/kernel.h> 61 #include <sys/linker.h> 62 #include <sys/malloc.h> 63 #include <sys/proc.h> 64 #include <sys/priv.h> 65 #include <sys/buf.h> 66 #include <sys/reboot.h> 67 #include <sys/mbuf.h> 68 #include <sys/msgbuf.h> 69 #include <sys/sysent.h> 70 #include <sys/sysctl.h> 71 #include <sys/vmmeter.h> 72 #include <sys/bus.h> 73 #include <sys/upcall.h> 74 #include <sys/usched.h> 75 #include <sys/reg.h> 76 77 #include <vm/vm.h> 78 #include <vm/vm_param.h> 79 #include <sys/lock.h> 80 #include <vm/vm_kern.h> 81 #include <vm/vm_object.h> 82 #include <vm/vm_page.h> 83 #include <vm/vm_map.h> 84 #include <vm/vm_pager.h> 85 #include <vm/vm_extern.h> 86 87 #include <sys/thread2.h> 88 #include <sys/mplock2.h> 89 90 #include <sys/user.h> 91 #include <sys/exec.h> 92 #include <sys/cons.h> 93 94 #include <ddb/ddb.h> 95 96 #include <machine/cpu.h> 97 #include <machine/clock.h> 98 #include <machine/specialreg.h> 99 #if JG 100 #include <machine/bootinfo.h> 101 #endif 102 #include <machine/md_var.h> 103 #include <machine/metadata.h> 104 #include <machine/pc/bios.h> 105 #include <machine/pcb_ext.h> /* pcb.h included via sys/user.h */ 106 #include <machine/globaldata.h> /* CPU_prvspace */ 107 #include <machine/smp.h> 108 #ifdef PERFMON 109 #include <machine/perfmon.h> 110 #endif 111 #include <machine/cputypes.h> 112 113 #ifdef OLD_BUS_ARCH 114 #include <bus/isa/isa_device.h> 115 #endif 116 #include <machine_base/isa/intr_machdep.h> 117 #include <bus/isa/rtc.h> 118 #include <sys/random.h> 119 #include <sys/ptrace.h> 120 #include <machine/sigframe.h> 121 122 #include <sys/machintr.h> 123 124 #define PHYSMAP_ENTRIES 10 125 126 extern void init386(int first); 127 extern void dblfault_handler(void); 128 extern u_int64_t hammer_time(u_int64_t, u_int64_t); 129 130 extern void printcpuinfo(void); /* XXX header file */ 131 extern void identify_cpu(void); 132 #if JG 133 extern void finishidentcpu(void); 134 #endif 135 extern void panicifcpuunsupported(void); 136 137 static void cpu_startup(void *); 138 #ifndef CPU_DISABLE_SSE 139 static void set_fpregs_xmm(struct save87 *, struct savexmm *); 140 static void fill_fpregs_xmm(struct savexmm *, struct save87 *); 141 #endif /* CPU_DISABLE_SSE */ 142 #ifdef DIRECTIO 143 extern void ffs_rawread_setup(void); 144 #endif /* DIRECTIO */ 145 static void init_locks(void); 146 147 SYSINIT(cpu, SI_BOOT2_SMP, SI_ORDER_FIRST, cpu_startup, NULL) 148 149 #ifdef DDB 150 extern vm_offset_t ksym_start, ksym_end; 151 #endif 152 153 struct privatespace CPU_prvspace[MAXCPU] __aligned(4096); /* XXX */ 154 155 int _udatasel, _ucodesel, _ucode32sel; 156 u_long atdevbase; 157 #ifdef SMP 158 int64_t tsc_offsets[MAXCPU]; 159 #else 160 int64_t tsc_offsets[1]; 161 #endif 162 163 #if defined(SWTCH_OPTIM_STATS) 164 extern int swtch_optim_stats; 165 SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats, 166 CTLFLAG_RD, &swtch_optim_stats, 0, ""); 167 SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, 168 CTLFLAG_RD, &tlb_flush_count, 0, ""); 169 #endif 170 171 int physmem = 0; 172 173 u_long ebda_addr = 0; 174 175 static int 176 sysctl_hw_physmem(SYSCTL_HANDLER_ARGS) 177 { 178 int error = sysctl_handle_int(oidp, 0, ctob(physmem), req); 179 return (error); 180 } 181 182 SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD, 183 0, 0, sysctl_hw_physmem, "IU", ""); 184 185 static int 186 sysctl_hw_usermem(SYSCTL_HANDLER_ARGS) 187 { 188 int error = sysctl_handle_int(oidp, 0, 189 ctob(physmem - vmstats.v_wire_count), req); 190 return (error); 191 } 192 193 SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD, 194 0, 0, sysctl_hw_usermem, "IU", ""); 195 196 static int 197 sysctl_hw_availpages(SYSCTL_HANDLER_ARGS) 198 { 199 int error = sysctl_handle_int(oidp, 0, 200 x86_64_btop(avail_end - avail_start), req); 201 return (error); 202 } 203 204 SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD, 205 0, 0, sysctl_hw_availpages, "I", ""); 206 207 vm_paddr_t Maxmem; 208 vm_paddr_t Realmem; 209 210 /* 211 * The number of PHYSMAP entries must be one less than the number of 212 * PHYSSEG entries because the PHYSMAP entry that spans the largest 213 * physical address that is accessible by ISA DMA is split into two 214 * PHYSSEG entries. 215 */ 216 #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) 217 218 vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; 219 vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; 220 221 /* must be 2 less so 0 0 can signal end of chunks */ 222 #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2) 223 #define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2) 224 225 static vm_offset_t buffer_sva, buffer_eva; 226 vm_offset_t clean_sva, clean_eva; 227 static vm_offset_t pager_sva, pager_eva; 228 static struct trapframe proc0_tf; 229 230 static void 231 cpu_startup(void *dummy) 232 { 233 caddr_t v; 234 vm_size_t size = 0; 235 vm_offset_t firstaddr; 236 237 if (boothowto & RB_VERBOSE) 238 bootverbose++; 239 240 /* 241 * Good {morning,afternoon,evening,night}. 242 */ 243 kprintf("%s", version); 244 startrtclock(); 245 printcpuinfo(); 246 panicifcpuunsupported(); 247 #ifdef PERFMON 248 perfmon_init(); 249 #endif 250 kprintf("real memory = %ju (%ju MB)\n", 251 (intmax_t)Realmem, 252 (intmax_t)Realmem / 1024 / 1024); 253 /* 254 * Display any holes after the first chunk of extended memory. 255 */ 256 if (bootverbose) { 257 int indx; 258 259 kprintf("Physical memory chunk(s):\n"); 260 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { 261 vm_paddr_t size1 = phys_avail[indx + 1] - phys_avail[indx]; 262 263 kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n", 264 (intmax_t)phys_avail[indx], 265 (intmax_t)phys_avail[indx + 1] - 1, 266 (intmax_t)size1, 267 (intmax_t)(size1 / PAGE_SIZE)); 268 } 269 } 270 271 /* 272 * Allocate space for system data structures. 273 * The first available kernel virtual address is in "v". 274 * As pages of kernel virtual memory are allocated, "v" is incremented. 275 * As pages of memory are allocated and cleared, 276 * "firstaddr" is incremented. 277 * An index into the kernel page table corresponding to the 278 * virtual memory address maintained in "v" is kept in "mapaddr". 279 */ 280 281 /* 282 * Make two passes. The first pass calculates how much memory is 283 * needed and allocates it. The second pass assigns virtual 284 * addresses to the various data structures. 285 */ 286 firstaddr = 0; 287 again: 288 v = (caddr_t)firstaddr; 289 290 #define valloc(name, type, num) \ 291 (name) = (type *)v; v = (caddr_t)((name)+(num)) 292 #define valloclim(name, type, num, lim) \ 293 (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) 294 295 /* 296 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. 297 * For the first 64MB of ram nominally allocate sufficient buffers to 298 * cover 1/4 of our ram. Beyond the first 64MB allocate additional 299 * buffers to cover 1/20 of our ram over 64MB. When auto-sizing 300 * the buffer cache we limit the eventual kva reservation to 301 * maxbcache bytes. 302 * 303 * factor represents the 1/4 x ram conversion. 304 */ 305 if (nbuf == 0) { 306 int factor = 4 * BKVASIZE / 1024; 307 int kbytes = physmem * (PAGE_SIZE / 1024); 308 309 nbuf = 50; 310 if (kbytes > 4096) 311 nbuf += min((kbytes - 4096) / factor, 65536 / factor); 312 if (kbytes > 65536) 313 nbuf += (kbytes - 65536) * 2 / (factor * 5); 314 if (maxbcache && nbuf > maxbcache / BKVASIZE) 315 nbuf = maxbcache / BKVASIZE; 316 } 317 318 /* 319 * Do not allow the buffer_map to be more then 1/2 the size of the 320 * kernel_map. 321 */ 322 if (nbuf > (virtual_end - virtual_start) / (BKVASIZE * 2)) { 323 nbuf = (virtual_end - virtual_start) / (BKVASIZE * 2); 324 kprintf("Warning: nbufs capped at %d\n", nbuf); 325 } 326 327 nswbuf = max(min(nbuf/4, 256), 16); 328 #ifdef NSWBUF_MIN 329 if (nswbuf < NSWBUF_MIN) 330 nswbuf = NSWBUF_MIN; 331 #endif 332 #ifdef DIRECTIO 333 ffs_rawread_setup(); 334 #endif 335 336 valloc(swbuf, struct buf, nswbuf); 337 valloc(buf, struct buf, nbuf); 338 339 /* 340 * End of first pass, size has been calculated so allocate memory 341 */ 342 if (firstaddr == 0) { 343 size = (vm_size_t)(v - firstaddr); 344 firstaddr = kmem_alloc(&kernel_map, round_page(size)); 345 if (firstaddr == 0) 346 panic("startup: no room for tables"); 347 goto again; 348 } 349 350 /* 351 * End of second pass, addresses have been assigned 352 */ 353 if ((vm_size_t)(v - firstaddr) != size) 354 panic("startup: table size inconsistency"); 355 356 kmem_suballoc(&kernel_map, &clean_map, &clean_sva, &clean_eva, 357 (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size); 358 kmem_suballoc(&clean_map, &buffer_map, &buffer_sva, &buffer_eva, 359 (nbuf*BKVASIZE)); 360 buffer_map.system_map = 1; 361 kmem_suballoc(&clean_map, &pager_map, &pager_sva, &pager_eva, 362 (nswbuf*MAXPHYS) + pager_map_size); 363 pager_map.system_map = 1; 364 365 #if defined(USERCONFIG) 366 userconfig(); 367 cninit(); /* the preferred console may have changed */ 368 #endif 369 370 kprintf("avail memory = %ju (%ju MB)\n", 371 (uintmax_t)ptoa(vmstats.v_free_count), 372 (uintmax_t)ptoa(vmstats.v_free_count) / 1024 / 1024); 373 374 /* 375 * Set up buffers, so they can be used to read disk labels. 376 */ 377 bufinit(); 378 vm_pager_bufferinit(); 379 380 #ifdef SMP 381 /* 382 * OK, enough kmem_alloc/malloc state should be up, lets get on with it! 383 */ 384 mp_start(); /* fire up the APs and APICs */ 385 mp_announce(); 386 #endif /* SMP */ 387 cpu_setregs(); 388 } 389 390 /* 391 * Send an interrupt to process. 392 * 393 * Stack is set up to allow sigcode stored 394 * at top to call routine, followed by kcall 395 * to sigreturn routine below. After sigreturn 396 * resets the signal mask, the stack, and the 397 * frame pointer, it returns to the user 398 * specified pc, psl. 399 */ 400 void 401 sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code) 402 { 403 struct lwp *lp = curthread->td_lwp; 404 struct proc *p = lp->lwp_proc; 405 struct trapframe *regs; 406 struct sigacts *psp = p->p_sigacts; 407 struct sigframe sf, *sfp; 408 int oonstack; 409 char *sp; 410 411 regs = lp->lwp_md.md_regs; 412 oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0; 413 414 /* Save user context */ 415 bzero(&sf, sizeof(struct sigframe)); 416 sf.sf_uc.uc_sigmask = *mask; 417 sf.sf_uc.uc_stack = lp->lwp_sigstk; 418 sf.sf_uc.uc_mcontext.mc_onstack = oonstack; 419 KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0); 420 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe)); 421 422 /* Make the size of the saved context visible to userland */ 423 sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); 424 425 /* Save mailbox pending state for syscall interlock semantics */ 426 if (p->p_flag & P_MAILBOX) 427 sf.sf_uc.uc_mcontext.mc_xflags |= PGEX_MAILBOX; 428 429 /* Allocate and validate space for the signal handler context. */ 430 if ((lp->lwp_flag & LWP_ALTSTACK) != 0 && !oonstack && 431 SIGISMEMBER(psp->ps_sigonstack, sig)) { 432 sp = (char *)(lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size - 433 sizeof(struct sigframe)); 434 lp->lwp_sigstk.ss_flags |= SS_ONSTACK; 435 } else { 436 /* We take red zone into account */ 437 sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128; 438 } 439 440 /* Align to 16 bytes */ 441 sfp = (struct sigframe *)((intptr_t)sp & ~(intptr_t)0xF); 442 443 /* Translate the signal is appropriate */ 444 if (p->p_sysent->sv_sigtbl) { 445 if (sig <= p->p_sysent->sv_sigsize) 446 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 447 } 448 449 /* 450 * Build the argument list for the signal handler. 451 * 452 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx) 453 */ 454 regs->tf_rdi = sig; /* argument 1 */ 455 regs->tf_rdx = (register_t)&sfp->sf_uc; /* argument 3 */ 456 457 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 458 /* 459 * Signal handler installed with SA_SIGINFO. 460 * 461 * action(signo, siginfo, ucontext) 462 */ 463 regs->tf_rsi = (register_t)&sfp->sf_si; /* argument 2 */ 464 regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */ 465 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 466 467 /* fill siginfo structure */ 468 sf.sf_si.si_signo = sig; 469 sf.sf_si.si_code = code; 470 sf.sf_si.si_addr = (void *)regs->tf_addr; 471 } else { 472 /* 473 * Old FreeBSD-style arguments. 474 * 475 * handler (signo, code, [uc], addr) 476 */ 477 regs->tf_rsi = (register_t)code; /* argument 2 */ 478 regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */ 479 sf.sf_ahu.sf_handler = catcher; 480 } 481 482 /* 483 * If we're a vm86 process, we want to save the segment registers. 484 * We also change eflags to be our emulated eflags, not the actual 485 * eflags. 486 */ 487 #if JG 488 if (regs->tf_eflags & PSL_VM) { 489 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 490 struct vm86_kernel *vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86; 491 492 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; 493 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; 494 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; 495 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; 496 497 if (vm86->vm86_has_vme == 0) 498 sf.sf_uc.uc_mcontext.mc_eflags = 499 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 500 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 501 502 /* 503 * Clear PSL_NT to inhibit T_TSSFLT faults on return from 504 * syscalls made by the signal handler. This just avoids 505 * wasting time for our lazy fixup of such faults. PSL_NT 506 * does nothing in vm86 mode, but vm86 programs can set it 507 * almost legitimately in probes for old cpu types. 508 */ 509 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 510 } 511 #endif 512 513 /* 514 * Save the FPU state and reinit the FP unit 515 */ 516 npxpush(&sf.sf_uc.uc_mcontext); 517 518 /* 519 * Copy the sigframe out to the user's stack. 520 */ 521 if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) { 522 /* 523 * Something is wrong with the stack pointer. 524 * ...Kill the process. 525 */ 526 sigexit(lp, SIGILL); 527 } 528 529 regs->tf_rsp = (register_t)sfp; 530 regs->tf_rip = PS_STRINGS - *(p->p_sysent->sv_szsigcode); 531 532 /* 533 * i386 abi specifies that the direction flag must be cleared 534 * on function entry 535 */ 536 regs->tf_rflags &= ~(PSL_T|PSL_D); 537 538 /* 539 * 64 bit mode has a code and stack selector but 540 * no data or extra selector. %fs and %gs are not 541 * stored in-context. 542 */ 543 regs->tf_cs = _ucodesel; 544 regs->tf_ss = _udatasel; 545 } 546 547 /* 548 * Sanitize the trapframe for a virtual kernel passing control to a custom 549 * VM context. Remove any items that would otherwise create a privilage 550 * issue. 551 * 552 * XXX at the moment we allow userland to set the resume flag. Is this a 553 * bad idea? 554 */ 555 int 556 cpu_sanitize_frame(struct trapframe *frame) 557 { 558 frame->tf_cs = _ucodesel; 559 frame->tf_ss = _udatasel; 560 /* XXX VM (8086) mode not supported? */ 561 frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP); 562 frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I; 563 564 return(0); 565 } 566 567 /* 568 * Sanitize the tls so loading the descriptor does not blow up 569 * on us. For x86_64 we don't have to do anything. 570 */ 571 int 572 cpu_sanitize_tls(struct savetls *tls) 573 { 574 return(0); 575 } 576 577 /* 578 * sigreturn(ucontext_t *sigcntxp) 579 * 580 * System call to cleanup state after a signal 581 * has been taken. Reset signal mask and 582 * stack state from context left by sendsig (above). 583 * Return to previous pc and psl as specified by 584 * context left by sendsig. Check carefully to 585 * make sure that the user has not modified the 586 * state to gain improper privileges. 587 * 588 * MPSAFE 589 */ 590 #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) 591 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) 592 593 int 594 sys_sigreturn(struct sigreturn_args *uap) 595 { 596 struct lwp *lp = curthread->td_lwp; 597 struct proc *p = lp->lwp_proc; 598 struct trapframe *regs; 599 ucontext_t uc; 600 ucontext_t *ucp; 601 register_t rflags; 602 int cs; 603 int error; 604 605 /* 606 * We have to copy the information into kernel space so userland 607 * can't modify it while we are sniffing it. 608 */ 609 regs = lp->lwp_md.md_regs; 610 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 611 if (error) 612 return (error); 613 ucp = &uc; 614 rflags = ucp->uc_mcontext.mc_rflags; 615 616 /* VM (8086) mode not supported */ 617 rflags &= ~PSL_VM_UNSUPP; 618 619 #if JG 620 if (eflags & PSL_VM) { 621 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 622 struct vm86_kernel *vm86; 623 624 /* 625 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 626 * set up the vm86 area, and we can't enter vm86 mode. 627 */ 628 if (lp->lwp_thread->td_pcb->pcb_ext == 0) 629 return (EINVAL); 630 vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86; 631 if (vm86->vm86_inited == 0) 632 return (EINVAL); 633 634 /* go back to user mode if both flags are set */ 635 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) 636 trapsignal(lp, SIGBUS, 0); 637 638 if (vm86->vm86_has_vme) { 639 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 640 (eflags & VME_USERCHANGE) | PSL_VM; 641 } else { 642 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 643 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 644 (eflags & VM_USERCHANGE) | PSL_VM; 645 } 646 bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe)); 647 tf->tf_eflags = eflags; 648 tf->tf_vm86_ds = tf->tf_ds; 649 tf->tf_vm86_es = tf->tf_es; 650 tf->tf_vm86_fs = tf->tf_fs; 651 tf->tf_vm86_gs = tf->tf_gs; 652 tf->tf_ds = _udatasel; 653 tf->tf_es = _udatasel; 654 tf->tf_fs = _udatasel; 655 tf->tf_gs = _udatasel; 656 } else 657 #endif 658 { 659 /* 660 * Don't allow users to change privileged or reserved flags. 661 */ 662 /* 663 * XXX do allow users to change the privileged flag PSL_RF. 664 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers 665 * should sometimes set it there too. tf_eflags is kept in 666 * the signal context during signal handling and there is no 667 * other place to remember it, so the PSL_RF bit may be 668 * corrupted by the signal handler without us knowing. 669 * Corruption of the PSL_RF bit at worst causes one more or 670 * one less debugger trap, so allowing it is fairly harmless. 671 */ 672 if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) { 673 kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags); 674 return(EINVAL); 675 } 676 677 /* 678 * Don't allow users to load a valid privileged %cs. Let the 679 * hardware check for invalid selectors, excess privilege in 680 * other selectors, invalid %eip's and invalid %esp's. 681 */ 682 cs = ucp->uc_mcontext.mc_cs; 683 if (!CS_SECURE(cs)) { 684 kprintf("sigreturn: cs = 0x%x\n", cs); 685 trapsignal(lp, SIGBUS, T_PROTFLT); 686 return(EINVAL); 687 } 688 bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(struct trapframe)); 689 } 690 691 /* 692 * Restore the FPU state from the frame 693 */ 694 crit_enter(); 695 npxpop(&ucp->uc_mcontext); 696 697 /* 698 * Merge saved signal mailbox pending flag to maintain interlock 699 * semantics against system calls. 700 */ 701 if (ucp->uc_mcontext.mc_xflags & PGEX_MAILBOX) 702 p->p_flag |= P_MAILBOX; 703 704 if (ucp->uc_mcontext.mc_onstack & 1) 705 lp->lwp_sigstk.ss_flags |= SS_ONSTACK; 706 else 707 lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK; 708 709 lp->lwp_sigmask = ucp->uc_sigmask; 710 SIG_CANTMASK(lp->lwp_sigmask); 711 crit_exit(); 712 return(EJUSTRETURN); 713 } 714 715 /* 716 * Stack frame on entry to function. %rax will contain the function vector, 717 * %rcx will contain the function data. flags, rcx, and rax will have 718 * already been pushed on the stack. 719 */ 720 struct upc_frame { 721 register_t rax; 722 register_t rcx; 723 register_t rdx; 724 register_t flags; 725 register_t oldip; 726 }; 727 728 void 729 sendupcall(struct vmupcall *vu, int morepending) 730 { 731 struct lwp *lp = curthread->td_lwp; 732 struct trapframe *regs; 733 struct upcall upcall; 734 struct upc_frame upc_frame; 735 int crit_count = 0; 736 737 /* 738 * If we are a virtual kernel running an emulated user process 739 * context, switch back to the virtual kernel context before 740 * trying to post the signal. 741 */ 742 if (lp->lwp_vkernel && lp->lwp_vkernel->ve) { 743 lp->lwp_md.md_regs->tf_trapno = 0; 744 vkernel_trap(lp, lp->lwp_md.md_regs); 745 } 746 747 /* 748 * Get the upcall data structure 749 */ 750 if (copyin(lp->lwp_upcall, &upcall, sizeof(upcall)) || 751 copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int)) 752 ) { 753 vu->vu_pending = 0; 754 kprintf("bad upcall address\n"); 755 return; 756 } 757 758 /* 759 * If the data structure is already marked pending or has a critical 760 * section count, mark the data structure as pending and return 761 * without doing an upcall. vu_pending is left set. 762 */ 763 if (upcall.upc_pending || crit_count >= vu->vu_pending) { 764 if (upcall.upc_pending < vu->vu_pending) { 765 upcall.upc_pending = vu->vu_pending; 766 copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending, 767 sizeof(upcall.upc_pending)); 768 } 769 return; 770 } 771 772 /* 773 * We can run this upcall now, clear vu_pending. 774 * 775 * Bump our critical section count and set or clear the 776 * user pending flag depending on whether more upcalls are 777 * pending. The user will be responsible for calling 778 * upc_dispatch(-1) to process remaining upcalls. 779 */ 780 vu->vu_pending = 0; 781 upcall.upc_pending = morepending; 782 ++crit_count; 783 copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending, 784 sizeof(upcall.upc_pending)); 785 copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, 786 sizeof(int)); 787 788 /* 789 * Construct a stack frame and issue the upcall 790 */ 791 regs = lp->lwp_md.md_regs; 792 upc_frame.rax = regs->tf_rax; 793 upc_frame.rcx = regs->tf_rcx; 794 upc_frame.rdx = regs->tf_rdx; 795 upc_frame.flags = regs->tf_rflags; 796 upc_frame.oldip = regs->tf_rip; 797 if (copyout(&upc_frame, (void *)(regs->tf_rsp - sizeof(upc_frame)), 798 sizeof(upc_frame)) != 0) { 799 kprintf("bad stack on upcall\n"); 800 } else { 801 regs->tf_rax = (register_t)vu->vu_func; 802 regs->tf_rcx = (register_t)vu->vu_data; 803 regs->tf_rdx = (register_t)lp->lwp_upcall; 804 regs->tf_rip = (register_t)vu->vu_ctx; 805 regs->tf_rsp -= sizeof(upc_frame); 806 } 807 } 808 809 /* 810 * fetchupcall occurs in the context of a system call, which means that 811 * we have to return EJUSTRETURN in order to prevent eax and edx from 812 * being overwritten by the syscall return value. 813 * 814 * if vu is not NULL we return the new context in %edx, the new data in %ecx, 815 * and the function pointer in %eax. 816 */ 817 int 818 fetchupcall(struct vmupcall *vu, int morepending, void *rsp) 819 { 820 struct upc_frame upc_frame; 821 struct lwp *lp = curthread->td_lwp; 822 struct trapframe *regs; 823 int error; 824 struct upcall upcall; 825 int crit_count; 826 827 regs = lp->lwp_md.md_regs; 828 829 error = copyout(&morepending, &lp->lwp_upcall->upc_pending, sizeof(int)); 830 if (error == 0) { 831 if (vu) { 832 /* 833 * This jumps us to the next ready context. 834 */ 835 vu->vu_pending = 0; 836 error = copyin(lp->lwp_upcall, &upcall, sizeof(upcall)); 837 crit_count = 0; 838 if (error == 0) 839 error = copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int)); 840 ++crit_count; 841 if (error == 0) 842 error = copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, sizeof(int)); 843 regs->tf_rax = (register_t)vu->vu_func; 844 regs->tf_rcx = (register_t)vu->vu_data; 845 regs->tf_rdx = (register_t)lp->lwp_upcall; 846 regs->tf_rip = (register_t)vu->vu_ctx; 847 regs->tf_rsp = (register_t)rsp; 848 } else { 849 /* 850 * This returns us to the originally interrupted code. 851 */ 852 error = copyin(rsp, &upc_frame, sizeof(upc_frame)); 853 regs->tf_rax = upc_frame.rax; 854 regs->tf_rcx = upc_frame.rcx; 855 regs->tf_rdx = upc_frame.rdx; 856 regs->tf_rflags = (regs->tf_rflags & ~PSL_USERCHANGE) | 857 (upc_frame.flags & PSL_USERCHANGE); 858 regs->tf_rip = upc_frame.oldip; 859 regs->tf_rsp = (register_t)((char *)rsp + sizeof(upc_frame)); 860 } 861 } 862 if (error == 0) 863 error = EJUSTRETURN; 864 return(error); 865 } 866 867 /* 868 * Machine dependent boot() routine 869 * 870 * I haven't seen anything to put here yet 871 * Possibly some stuff might be grafted back here from boot() 872 */ 873 void 874 cpu_boot(int howto) 875 { 876 } 877 878 /* 879 * Shutdown the CPU as much as possible 880 */ 881 void 882 cpu_halt(void) 883 { 884 for (;;) 885 __asm__ __volatile("hlt"); 886 } 887 888 /* 889 * cpu_idle() represents the idle LWKT. You cannot return from this function 890 * (unless you want to blow things up!). Instead we look for runnable threads 891 * and loop or halt as appropriate. Giant is not held on entry to the thread. 892 * 893 * The main loop is entered with a critical section held, we must release 894 * the critical section before doing anything else. lwkt_switch() will 895 * check for pending interrupts due to entering and exiting its own 896 * critical section. 897 * 898 * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up. 899 * However, there are cases where the idlethread will be entered with 900 * the possibility that no IPI will occur and in such cases 901 * lwkt_switch() sets TDF_IDLE_NOHLT. 902 * 903 * NOTE: cpu_idle_hlt again defaults to 2 (use ACPI sleep states). Set to 904 * 1 to just use hlt and for debugging purposes. 905 * 906 * NOTE: cpu_idle_repeat determines how many entries into the idle thread 907 * must occur before it starts using ACPI halt. 908 */ 909 static int cpu_idle_hlt = 2; 910 static int cpu_idle_hltcnt; 911 static int cpu_idle_spincnt; 912 static u_int cpu_idle_repeat = 4; 913 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW, 914 &cpu_idle_hlt, 0, "Idle loop HLT enable"); 915 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hltcnt, CTLFLAG_RW, 916 &cpu_idle_hltcnt, 0, "Idle loop entry halts"); 917 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_spincnt, CTLFLAG_RW, 918 &cpu_idle_spincnt, 0, "Idle loop entry spins"); 919 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_repeat, CTLFLAG_RW, 920 &cpu_idle_repeat, 0, "Idle entries before acpi hlt"); 921 922 static void 923 cpu_idle_default_hook(void) 924 { 925 /* 926 * We must guarentee that hlt is exactly the instruction 927 * following the sti. 928 */ 929 __asm __volatile("sti; hlt"); 930 } 931 932 /* Other subsystems (e.g., ACPI) can hook this later. */ 933 void (*cpu_idle_hook)(void) = cpu_idle_default_hook; 934 935 void 936 cpu_idle(void) 937 { 938 globaldata_t gd = mycpu; 939 struct thread *td __debugvar = gd->gd_curthread; 940 int reqflags; 941 int quick; 942 943 crit_exit(); 944 KKASSERT(td->td_critcount == 0); 945 for (;;) { 946 /* 947 * See if there are any LWKTs ready to go. 948 */ 949 lwkt_switch(); 950 951 /* 952 * When halting inside a cli we must check for reqflags 953 * races, particularly [re]schedule requests. Running 954 * splz() does the job. 955 * 956 * cpu_idle_hlt: 957 * 0 Never halt, just spin 958 * 959 * 1 Always use HLT (or MONITOR/MWAIT if avail). 960 * This typically eats more power than the 961 * ACPI halt. 962 * 963 * 2 Use HLT/MONITOR/MWAIT up to a point and then 964 * use the ACPI halt (default). This is a hybrid 965 * approach. See machdep.cpu_idle_repeat. 966 * 967 * 3 Always use the ACPI halt. This typically 968 * eats the least amount of power but the cpu 969 * will be slow waking up. Slows down e.g. 970 * compiles and other pipe/event oriented stuff. 971 * 972 * NOTE: Interrupts are enabled and we are not in a critical 973 * section. 974 * 975 * NOTE: Preemptions do not reset gd_idle_repeat. Also we 976 * don't bother capping gd_idle_repeat, it is ok if 977 * it overflows. 978 */ 979 ++gd->gd_idle_repeat; 980 reqflags = gd->gd_reqflags; 981 quick = (cpu_idle_hlt == 1) || 982 (cpu_idle_hlt < 3 && 983 gd->gd_idle_repeat < cpu_idle_repeat); 984 985 if (quick && (cpu_mi_feature & CPU_MI_MONITOR) && 986 (reqflags & RQF_IDLECHECK_WK_MASK) == 0) { 987 cpu_mmw_pause_int(&gd->gd_reqflags, reqflags); 988 ++cpu_idle_hltcnt; 989 } else if (cpu_idle_hlt) { 990 __asm __volatile("cli"); 991 splz(); 992 if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) { 993 if (quick) 994 cpu_idle_default_hook(); 995 else 996 cpu_idle_hook(); 997 } 998 __asm __volatile("sti"); 999 ++cpu_idle_hltcnt; 1000 } else { 1001 splz(); 1002 __asm __volatile("sti"); 1003 ++cpu_idle_spincnt; 1004 } 1005 } 1006 } 1007 1008 #ifdef SMP 1009 1010 /* 1011 * This routine is called if a spinlock has been held through the 1012 * exponential backoff period and is seriously contested. On a real cpu 1013 * we let it spin. 1014 */ 1015 void 1016 cpu_spinlock_contested(void) 1017 { 1018 cpu_pause(); 1019 } 1020 1021 #endif 1022 1023 /* 1024 * Clear registers on exec 1025 */ 1026 void 1027 exec_setregs(u_long entry, u_long stack, u_long ps_strings) 1028 { 1029 struct thread *td = curthread; 1030 struct lwp *lp = td->td_lwp; 1031 struct pcb *pcb = td->td_pcb; 1032 struct trapframe *regs = lp->lwp_md.md_regs; 1033 1034 /* was i386_user_cleanup() in NetBSD */ 1035 user_ldt_free(pcb); 1036 1037 bzero((char *)regs, sizeof(struct trapframe)); 1038 regs->tf_rip = entry; 1039 regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; /* align the stack */ 1040 regs->tf_rdi = stack; /* argv */ 1041 regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); 1042 regs->tf_ss = _udatasel; 1043 regs->tf_cs = _ucodesel; 1044 regs->tf_rbx = ps_strings; 1045 1046 /* 1047 * Reset the hardware debug registers if they were in use. 1048 * They won't have any meaning for the newly exec'd process. 1049 */ 1050 if (pcb->pcb_flags & PCB_DBREGS) { 1051 pcb->pcb_dr0 = 0; 1052 pcb->pcb_dr1 = 0; 1053 pcb->pcb_dr2 = 0; 1054 pcb->pcb_dr3 = 0; 1055 pcb->pcb_dr6 = 0; 1056 pcb->pcb_dr7 = 0; /* JG set bit 10? */ 1057 if (pcb == td->td_pcb) { 1058 /* 1059 * Clear the debug registers on the running 1060 * CPU, otherwise they will end up affecting 1061 * the next process we switch to. 1062 */ 1063 reset_dbregs(); 1064 } 1065 pcb->pcb_flags &= ~PCB_DBREGS; 1066 } 1067 1068 /* 1069 * Initialize the math emulator (if any) for the current process. 1070 * Actually, just clear the bit that says that the emulator has 1071 * been initialized. Initialization is delayed until the process 1072 * traps to the emulator (if it is done at all) mainly because 1073 * emulators don't provide an entry point for initialization. 1074 */ 1075 pcb->pcb_flags &= ~FP_SOFTFP; 1076 1077 /* 1078 * NOTE: do not set CR0_TS here. npxinit() must do it after clearing 1079 * gd_npxthread. Otherwise a preemptive interrupt thread 1080 * may panic in npxdna(). 1081 */ 1082 crit_enter(); 1083 load_cr0(rcr0() | CR0_MP); 1084 1085 /* 1086 * NOTE: The MSR values must be correct so we can return to 1087 * userland. gd_user_fs/gs must be correct so the switch 1088 * code knows what the current MSR values are. 1089 */ 1090 pcb->pcb_fsbase = 0; /* Values loaded from PCB on switch */ 1091 pcb->pcb_gsbase = 0; 1092 mdcpu->gd_user_fs = 0; /* Cache of current MSR values */ 1093 mdcpu->gd_user_gs = 0; 1094 wrmsr(MSR_FSBASE, 0); /* Set MSR values for return to userland */ 1095 wrmsr(MSR_KGSBASE, 0); 1096 1097 /* Initialize the npx (if any) for the current process. */ 1098 npxinit(__INITIAL_NPXCW__); 1099 crit_exit(); 1100 1101 pcb->pcb_ds = _udatasel; 1102 pcb->pcb_es = _udatasel; 1103 pcb->pcb_fs = _udatasel; 1104 pcb->pcb_gs = _udatasel; 1105 } 1106 1107 void 1108 cpu_setregs(void) 1109 { 1110 register_t cr0; 1111 1112 cr0 = rcr0(); 1113 cr0 |= CR0_NE; /* Done by npxinit() */ 1114 cr0 |= CR0_MP | CR0_TS; /* Done at every execve() too. */ 1115 cr0 |= CR0_WP | CR0_AM; 1116 load_cr0(cr0); 1117 load_gs(_udatasel); 1118 } 1119 1120 static int 1121 sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS) 1122 { 1123 int error; 1124 error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, 1125 req); 1126 if (!error && req->newptr) 1127 resettodr(); 1128 return (error); 1129 } 1130 1131 SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, 1132 &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); 1133 1134 SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set, 1135 CTLFLAG_RW, &disable_rtc_set, 0, ""); 1136 1137 #if JG 1138 SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo, 1139 CTLFLAG_RD, &bootinfo, bootinfo, ""); 1140 #endif 1141 1142 SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock, 1143 CTLFLAG_RW, &wall_cmos_clock, 0, ""); 1144 1145 extern u_long bootdev; /* not a cdev_t - encoding is different */ 1146 SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, 1147 CTLFLAG_RD, &bootdev, 0, "Boot device (not in cdev_t format)"); 1148 1149 /* 1150 * Initialize 386 and configure to run kernel 1151 */ 1152 1153 /* 1154 * Initialize segments & interrupt table 1155 */ 1156 1157 int _default_ldt; 1158 struct user_segment_descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ 1159 static struct gate_descriptor idt0[NIDT]; 1160 struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ 1161 #if JG 1162 union descriptor ldt[NLDT]; /* local descriptor table */ 1163 #endif 1164 1165 /* table descriptors - used to load tables by cpu */ 1166 struct region_descriptor r_gdt, r_idt; 1167 1168 #if defined(I586_CPU) && !defined(NO_F00F_HACK) 1169 extern int has_f00f_bug; 1170 #endif 1171 1172 /* JG proc0paddr is a virtual address */ 1173 void *proc0paddr; 1174 /* JG alignment? */ 1175 char proc0paddr_buff[LWKT_THREAD_STACK]; 1176 1177 1178 /* software prototypes -- in more palatable form */ 1179 struct soft_segment_descriptor gdt_segs[] = { 1180 /* GNULL_SEL 0 Null Descriptor */ 1181 { 0x0, /* segment base address */ 1182 0x0, /* length */ 1183 0, /* segment type */ 1184 0, /* segment descriptor priority level */ 1185 0, /* segment descriptor present */ 1186 0, /* long */ 1187 0, /* default 32 vs 16 bit size */ 1188 0 /* limit granularity (byte/page units)*/ }, 1189 /* GCODE_SEL 1 Code Descriptor for kernel */ 1190 { 0x0, /* segment base address */ 1191 0xfffff, /* length - all address space */ 1192 SDT_MEMERA, /* segment type */ 1193 SEL_KPL, /* segment descriptor priority level */ 1194 1, /* segment descriptor present */ 1195 1, /* long */ 1196 0, /* default 32 vs 16 bit size */ 1197 1 /* limit granularity (byte/page units)*/ }, 1198 /* GDATA_SEL 2 Data Descriptor for kernel */ 1199 { 0x0, /* segment base address */ 1200 0xfffff, /* length - all address space */ 1201 SDT_MEMRWA, /* segment type */ 1202 SEL_KPL, /* segment descriptor priority level */ 1203 1, /* segment descriptor present */ 1204 1, /* long */ 1205 0, /* default 32 vs 16 bit size */ 1206 1 /* limit granularity (byte/page units)*/ }, 1207 /* GUCODE32_SEL 3 32 bit Code Descriptor for user */ 1208 { 0x0, /* segment base address */ 1209 0xfffff, /* length - all address space */ 1210 SDT_MEMERA, /* segment type */ 1211 SEL_UPL, /* segment descriptor priority level */ 1212 1, /* segment descriptor present */ 1213 0, /* long */ 1214 1, /* default 32 vs 16 bit size */ 1215 1 /* limit granularity (byte/page units)*/ }, 1216 /* GUDATA_SEL 4 32/64 bit Data Descriptor for user */ 1217 { 0x0, /* segment base address */ 1218 0xfffff, /* length - all address space */ 1219 SDT_MEMRWA, /* segment type */ 1220 SEL_UPL, /* segment descriptor priority level */ 1221 1, /* segment descriptor present */ 1222 0, /* long */ 1223 1, /* default 32 vs 16 bit size */ 1224 1 /* limit granularity (byte/page units)*/ }, 1225 /* GUCODE_SEL 5 64 bit Code Descriptor for user */ 1226 { 0x0, /* segment base address */ 1227 0xfffff, /* length - all address space */ 1228 SDT_MEMERA, /* segment type */ 1229 SEL_UPL, /* segment descriptor priority level */ 1230 1, /* segment descriptor present */ 1231 1, /* long */ 1232 0, /* default 32 vs 16 bit size */ 1233 1 /* limit granularity (byte/page units)*/ }, 1234 /* GPROC0_SEL 6 Proc 0 Tss Descriptor */ 1235 { 1236 0x0, /* segment base address */ 1237 sizeof(struct x86_64tss)-1,/* length - all address space */ 1238 SDT_SYSTSS, /* segment type */ 1239 SEL_KPL, /* segment descriptor priority level */ 1240 1, /* segment descriptor present */ 1241 0, /* long */ 1242 0, /* unused - default 32 vs 16 bit size */ 1243 0 /* limit granularity (byte/page units)*/ }, 1244 /* Actually, the TSS is a system descriptor which is double size */ 1245 { 0x0, /* segment base address */ 1246 0x0, /* length */ 1247 0, /* segment type */ 1248 0, /* segment descriptor priority level */ 1249 0, /* segment descriptor present */ 1250 0, /* long */ 1251 0, /* default 32 vs 16 bit size */ 1252 0 /* limit granularity (byte/page units)*/ }, 1253 /* GUGS32_SEL 8 32 bit GS Descriptor for user */ 1254 { 0x0, /* segment base address */ 1255 0xfffff, /* length - all address space */ 1256 SDT_MEMRWA, /* segment type */ 1257 SEL_UPL, /* segment descriptor priority level */ 1258 1, /* segment descriptor present */ 1259 0, /* long */ 1260 1, /* default 32 vs 16 bit size */ 1261 1 /* limit granularity (byte/page units)*/ }, 1262 }; 1263 1264 void 1265 setidt(int idx, inthand_t *func, int typ, int dpl, int ist) 1266 { 1267 struct gate_descriptor *ip; 1268 1269 ip = idt + idx; 1270 ip->gd_looffset = (uintptr_t)func; 1271 ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL); 1272 ip->gd_ist = ist; 1273 ip->gd_xx = 0; 1274 ip->gd_type = typ; 1275 ip->gd_dpl = dpl; 1276 ip->gd_p = 1; 1277 ip->gd_hioffset = ((uintptr_t)func)>>16 ; 1278 } 1279 1280 #define IDTVEC(name) __CONCAT(X,name) 1281 1282 extern inthand_t 1283 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), 1284 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), 1285 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), 1286 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), 1287 IDTVEC(xmm), IDTVEC(dblfault), 1288 IDTVEC(fast_syscall), IDTVEC(fast_syscall32); 1289 1290 #ifdef DEBUG_INTERRUPTS 1291 extern inthand_t *Xrsvdary[256]; 1292 #endif 1293 1294 void 1295 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd) 1296 { 1297 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; 1298 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; 1299 ssd->ssd_type = sd->sd_type; 1300 ssd->ssd_dpl = sd->sd_dpl; 1301 ssd->ssd_p = sd->sd_p; 1302 ssd->ssd_def32 = sd->sd_def32; 1303 ssd->ssd_gran = sd->sd_gran; 1304 } 1305 1306 void 1307 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd) 1308 { 1309 1310 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 1311 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff; 1312 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 1313 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 1314 sd->sd_type = ssd->ssd_type; 1315 sd->sd_dpl = ssd->ssd_dpl; 1316 sd->sd_p = ssd->ssd_p; 1317 sd->sd_long = ssd->ssd_long; 1318 sd->sd_def32 = ssd->ssd_def32; 1319 sd->sd_gran = ssd->ssd_gran; 1320 } 1321 1322 void 1323 ssdtosyssd(struct soft_segment_descriptor *ssd, 1324 struct system_segment_descriptor *sd) 1325 { 1326 1327 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 1328 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful; 1329 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 1330 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 1331 sd->sd_type = ssd->ssd_type; 1332 sd->sd_dpl = ssd->ssd_dpl; 1333 sd->sd_p = ssd->ssd_p; 1334 sd->sd_gran = ssd->ssd_gran; 1335 } 1336 1337 u_int basemem; 1338 1339 /* 1340 * Populate the (physmap) array with base/bound pairs describing the 1341 * available physical memory in the system, then test this memory and 1342 * build the phys_avail array describing the actually-available memory. 1343 * 1344 * If we cannot accurately determine the physical memory map, then use 1345 * value from the 0xE801 call, and failing that, the RTC. 1346 * 1347 * Total memory size may be set by the kernel environment variable 1348 * hw.physmem or the compile-time define MAXMEM. 1349 * 1350 * XXX first should be vm_paddr_t. 1351 */ 1352 static void 1353 getmemsize(caddr_t kmdp, u_int64_t first) 1354 { 1355 int i, off, physmap_idx, pa_indx, da_indx; 1356 vm_paddr_t pa, physmap[PHYSMAP_SIZE]; 1357 u_long physmem_tunable; 1358 pt_entry_t *pte; 1359 struct bios_smap *smapbase, *smap, *smapend; 1360 u_int32_t smapsize; 1361 quad_t dcons_addr, dcons_size; 1362 1363 bzero(physmap, sizeof(physmap)); 1364 basemem = 0; 1365 physmap_idx = 0; 1366 1367 /* 1368 * get memory map from INT 15:E820, kindly supplied by the loader. 1369 * 1370 * subr_module.c says: 1371 * "Consumer may safely assume that size value precedes data." 1372 * ie: an int32_t immediately precedes smap. 1373 */ 1374 smapbase = (struct bios_smap *)preload_search_info(kmdp, 1375 MODINFO_METADATA | MODINFOMD_SMAP); 1376 if (smapbase == NULL) 1377 panic("No BIOS smap info from loader!"); 1378 1379 smapsize = *((u_int32_t *)smapbase - 1); 1380 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); 1381 1382 for (smap = smapbase; smap < smapend; smap++) { 1383 if (boothowto & RB_VERBOSE) 1384 kprintf("SMAP type=%02x base=%016lx len=%016lx\n", 1385 smap->type, smap->base, smap->length); 1386 1387 if (smap->type != SMAP_TYPE_MEMORY) 1388 continue; 1389 1390 if (smap->length == 0) 1391 continue; 1392 1393 for (i = 0; i <= physmap_idx; i += 2) { 1394 if (smap->base < physmap[i + 1]) { 1395 if (boothowto & RB_VERBOSE) { 1396 kprintf("Overlapping or non-monotonic " 1397 "memory region, ignoring " 1398 "second region\n"); 1399 } 1400 continue; 1401 } 1402 } 1403 Realmem += smap->length; 1404 1405 if (smap->base == physmap[physmap_idx + 1]) { 1406 physmap[physmap_idx + 1] += smap->length; 1407 continue; 1408 } 1409 1410 physmap_idx += 2; 1411 if (physmap_idx == PHYSMAP_SIZE) { 1412 kprintf("Too many segments in the physical " 1413 "address map, giving up\n"); 1414 break; 1415 } 1416 physmap[physmap_idx] = smap->base; 1417 physmap[physmap_idx + 1] = smap->base + smap->length; 1418 } 1419 1420 /* 1421 * Find the 'base memory' segment for SMP 1422 */ 1423 basemem = 0; 1424 for (i = 0; i <= physmap_idx; i += 2) { 1425 if (physmap[i] == 0x00000000) { 1426 basemem = physmap[i + 1] / 1024; 1427 break; 1428 } 1429 } 1430 if (basemem == 0) 1431 panic("BIOS smap did not include a basemem segment!"); 1432 1433 #ifdef SMP 1434 /* make hole for AP bootstrap code */ 1435 physmap[1] = mp_bootaddress(physmap[1] / 1024); 1436 1437 /* Save EBDA address, if any */ 1438 ebda_addr = (u_long)(*(u_short *)(KERNBASE + 0x40e)); 1439 ebda_addr <<= 4; 1440 #endif 1441 1442 /* 1443 * Maxmem isn't the "maximum memory", it's one larger than the 1444 * highest page of the physical address space. It should be 1445 * called something like "Maxphyspage". We may adjust this 1446 * based on ``hw.physmem'' and the results of the memory test. 1447 */ 1448 Maxmem = atop(physmap[physmap_idx + 1]); 1449 1450 #ifdef MAXMEM 1451 Maxmem = MAXMEM / 4; 1452 #endif 1453 1454 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) 1455 Maxmem = atop(physmem_tunable); 1456 1457 /* 1458 * Don't allow MAXMEM or hw.physmem to extend the amount of memory 1459 * in the system. 1460 */ 1461 if (Maxmem > atop(physmap[physmap_idx + 1])) 1462 Maxmem = atop(physmap[physmap_idx + 1]); 1463 1464 /* 1465 * 1466 */ 1467 if (Maxmem > atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS)) { 1468 kprintf("Limiting Maxmem due to DMAP size\n"); 1469 Maxmem = atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS); 1470 } 1471 1472 if (atop(physmap[physmap_idx + 1]) != Maxmem && 1473 (boothowto & RB_VERBOSE)) 1474 kprintf("Physical memory use set to %ldK\n", Maxmem * 4); 1475 1476 /* call pmap initialization to make new kernel address space */ 1477 pmap_bootstrap(&first); 1478 1479 /* 1480 * Size up each available chunk of physical memory. 1481 */ 1482 physmap[0] = PAGE_SIZE; /* mask off page 0 */ 1483 pa_indx = 0; 1484 da_indx = 1; 1485 phys_avail[pa_indx++] = physmap[0]; 1486 phys_avail[pa_indx] = physmap[0]; 1487 dump_avail[da_indx] = physmap[0]; 1488 pte = CMAP1; 1489 1490 /* 1491 * Get dcons buffer address 1492 */ 1493 if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 || 1494 kgetenv_quad("dcons.size", &dcons_size) == 0) 1495 dcons_addr = 0; 1496 1497 /* 1498 * physmap is in bytes, so when converting to page boundaries, 1499 * round up the start address and round down the end address. 1500 */ 1501 for (i = 0; i <= physmap_idx; i += 2) { 1502 vm_paddr_t end; 1503 1504 end = ptoa((vm_paddr_t)Maxmem); 1505 if (physmap[i + 1] < end) 1506 end = trunc_page(physmap[i + 1]); 1507 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { 1508 int tmp, page_bad, full; 1509 int *ptr = (int *)CADDR1; 1510 1511 full = FALSE; 1512 /* 1513 * block out kernel memory as not available. 1514 */ 1515 if (pa >= 0x100000 && pa < first) 1516 goto do_dump_avail; 1517 1518 /* 1519 * block out dcons buffer 1520 */ 1521 if (dcons_addr > 0 1522 && pa >= trunc_page(dcons_addr) 1523 && pa < dcons_addr + dcons_size) 1524 goto do_dump_avail; 1525 1526 page_bad = FALSE; 1527 1528 /* 1529 * map page into kernel: valid, read/write,non-cacheable 1530 */ 1531 *pte = pa | PG_V | PG_RW | PG_N; 1532 cpu_invltlb(); 1533 1534 tmp = *(int *)ptr; 1535 /* 1536 * Test for alternating 1's and 0's 1537 */ 1538 *(volatile int *)ptr = 0xaaaaaaaa; 1539 if (*(volatile int *)ptr != 0xaaaaaaaa) 1540 page_bad = TRUE; 1541 /* 1542 * Test for alternating 0's and 1's 1543 */ 1544 *(volatile int *)ptr = 0x55555555; 1545 if (*(volatile int *)ptr != 0x55555555) 1546 page_bad = TRUE; 1547 /* 1548 * Test for all 1's 1549 */ 1550 *(volatile int *)ptr = 0xffffffff; 1551 if (*(volatile int *)ptr != 0xffffffff) 1552 page_bad = TRUE; 1553 /* 1554 * Test for all 0's 1555 */ 1556 *(volatile int *)ptr = 0x0; 1557 if (*(volatile int *)ptr != 0x0) 1558 page_bad = TRUE; 1559 /* 1560 * Restore original value. 1561 */ 1562 *(int *)ptr = tmp; 1563 1564 /* 1565 * Adjust array of valid/good pages. 1566 */ 1567 if (page_bad == TRUE) 1568 continue; 1569 /* 1570 * If this good page is a continuation of the 1571 * previous set of good pages, then just increase 1572 * the end pointer. Otherwise start a new chunk. 1573 * Note that "end" points one higher than end, 1574 * making the range >= start and < end. 1575 * If we're also doing a speculative memory 1576 * test and we at or past the end, bump up Maxmem 1577 * so that we keep going. The first bad page 1578 * will terminate the loop. 1579 */ 1580 if (phys_avail[pa_indx] == pa) { 1581 phys_avail[pa_indx] += PAGE_SIZE; 1582 } else { 1583 pa_indx++; 1584 if (pa_indx == PHYS_AVAIL_ARRAY_END) { 1585 kprintf( 1586 "Too many holes in the physical address space, giving up\n"); 1587 pa_indx--; 1588 full = TRUE; 1589 goto do_dump_avail; 1590 } 1591 phys_avail[pa_indx++] = pa; /* start */ 1592 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ 1593 } 1594 physmem++; 1595 do_dump_avail: 1596 if (dump_avail[da_indx] == pa) { 1597 dump_avail[da_indx] += PAGE_SIZE; 1598 } else { 1599 da_indx++; 1600 if (da_indx == DUMP_AVAIL_ARRAY_END) { 1601 da_indx--; 1602 goto do_next; 1603 } 1604 dump_avail[da_indx++] = pa; /* start */ 1605 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ 1606 } 1607 do_next: 1608 if (full) 1609 break; 1610 } 1611 } 1612 *pte = 0; 1613 cpu_invltlb(); 1614 1615 /* 1616 * XXX 1617 * The last chunk must contain at least one page plus the message 1618 * buffer to avoid complicating other code (message buffer address 1619 * calculation, etc.). 1620 */ 1621 while (phys_avail[pa_indx - 1] + PAGE_SIZE + 1622 round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) { 1623 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); 1624 phys_avail[pa_indx--] = 0; 1625 phys_avail[pa_indx--] = 0; 1626 } 1627 1628 Maxmem = atop(phys_avail[pa_indx]); 1629 1630 /* Trim off space for the message buffer. */ 1631 phys_avail[pa_indx] -= round_page(MSGBUF_SIZE); 1632 1633 avail_end = phys_avail[pa_indx]; 1634 1635 /* Map the message buffer. */ 1636 for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) 1637 pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + 1638 off); 1639 } 1640 1641 #ifdef SMP 1642 #ifdef APIC_IO 1643 int apic_io_enable = 1; /* Enabled by default for kernels compiled w/APIC_IO */ 1644 #else 1645 int apic_io_enable = 0; /* Disabled by default for kernels compiled without */ 1646 #endif 1647 TUNABLE_INT("hw.apic_io_enable", &apic_io_enable); 1648 extern struct machintr_abi MachIntrABI_APIC; 1649 #endif 1650 1651 extern struct machintr_abi MachIntrABI_ICU; 1652 struct machintr_abi MachIntrABI; 1653 1654 /* 1655 * IDT VECTORS: 1656 * 0 Divide by zero 1657 * 1 Debug 1658 * 2 NMI 1659 * 3 BreakPoint 1660 * 4 OverFlow 1661 * 5 Bound-Range 1662 * 6 Invalid OpCode 1663 * 7 Device Not Available (x87) 1664 * 8 Double-Fault 1665 * 9 Coprocessor Segment overrun (unsupported, reserved) 1666 * 10 Invalid-TSS 1667 * 11 Segment not present 1668 * 12 Stack 1669 * 13 General Protection 1670 * 14 Page Fault 1671 * 15 Reserved 1672 * 16 x87 FP Exception pending 1673 * 17 Alignment Check 1674 * 18 Machine Check 1675 * 19 SIMD floating point 1676 * 20-31 reserved 1677 * 32-255 INTn/external sources 1678 */ 1679 u_int64_t 1680 hammer_time(u_int64_t modulep, u_int64_t physfree) 1681 { 1682 caddr_t kmdp; 1683 int gsel_tss, x; 1684 #if JG 1685 int metadata_missing, off; 1686 #endif 1687 struct mdglobaldata *gd; 1688 u_int64_t msr; 1689 1690 /* 1691 * Prevent lowering of the ipl if we call tsleep() early. 1692 */ 1693 gd = &CPU_prvspace[0].mdglobaldata; 1694 bzero(gd, sizeof(*gd)); 1695 1696 /* 1697 * Note: on both UP and SMP curthread must be set non-NULL 1698 * early in the boot sequence because the system assumes 1699 * that 'curthread' is never NULL. 1700 */ 1701 1702 gd->mi.gd_curthread = &thread0; 1703 thread0.td_gd = &gd->mi; 1704 1705 atdevbase = ISA_HOLE_START + PTOV_OFFSET; 1706 1707 #if JG 1708 metadata_missing = 0; 1709 if (bootinfo.bi_modulep) { 1710 preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; 1711 preload_bootstrap_relocate(KERNBASE); 1712 } else { 1713 metadata_missing = 1; 1714 } 1715 if (bootinfo.bi_envp) 1716 kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; 1717 #endif 1718 1719 preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET); 1720 preload_bootstrap_relocate(PTOV_OFFSET); 1721 kmdp = preload_search_by_type("elf kernel"); 1722 if (kmdp == NULL) 1723 kmdp = preload_search_by_type("elf64 kernel"); 1724 boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); 1725 kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET; 1726 #ifdef DDB 1727 ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); 1728 ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); 1729 #endif 1730 1731 /* 1732 * Setup MachIntrABI 1733 * XXX: Where is the correct place for it? 1734 */ 1735 MachIntrABI = MachIntrABI_ICU; 1736 #ifdef SMP 1737 TUNABLE_INT_FETCH("hw.apic_io_enable", &apic_io_enable); 1738 if (apic_io_enable) 1739 MachIntrABI = MachIntrABI_APIC; 1740 #endif 1741 1742 /* 1743 * start with one cpu. Note: with one cpu, ncpus2_shift, ncpus2_mask, 1744 * and ncpus_fit_mask remain 0. 1745 */ 1746 ncpus = 1; 1747 ncpus2 = 1; 1748 ncpus_fit = 1; 1749 /* Init basic tunables, hz etc */ 1750 init_param1(); 1751 1752 /* 1753 * make gdt memory segments 1754 */ 1755 gdt_segs[GPROC0_SEL].ssd_base = 1756 (uintptr_t) &CPU_prvspace[0].mdglobaldata.gd_common_tss; 1757 1758 gd->mi.gd_prvspace = &CPU_prvspace[0]; 1759 1760 for (x = 0; x < NGDT; x++) { 1761 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1)) 1762 ssdtosd(&gdt_segs[x], &gdt[x]); 1763 } 1764 ssdtosyssd(&gdt_segs[GPROC0_SEL], 1765 (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); 1766 1767 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; 1768 r_gdt.rd_base = (long) gdt; 1769 lgdt(&r_gdt); 1770 1771 wrmsr(MSR_FSBASE, 0); /* User value */ 1772 wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi); 1773 wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */ 1774 1775 mi_gdinit(&gd->mi, 0); 1776 cpu_gdinit(gd, 0); 1777 proc0paddr = proc0paddr_buff; 1778 mi_proc0init(&gd->mi, proc0paddr); 1779 safepri = TDPRI_MAX; 1780 1781 /* spinlocks and the BGL */ 1782 init_locks(); 1783 1784 /* exceptions */ 1785 for (x = 0; x < NIDT; x++) 1786 setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); 1787 setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0); 1788 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0); 1789 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1); 1790 setidt(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0); 1791 setidt(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0); 1792 setidt(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0); 1793 setidt(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0); 1794 setidt(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0); 1795 setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1); 1796 setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0); 1797 setidt(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0); 1798 setidt(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0); 1799 setidt(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0); 1800 setidt(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0); 1801 setidt(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0); 1802 setidt(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0); 1803 setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0); 1804 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0); 1805 setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0); 1806 1807 r_idt.rd_limit = sizeof(idt0) - 1; 1808 r_idt.rd_base = (long) idt; 1809 lidt(&r_idt); 1810 1811 /* 1812 * Initialize the console before we print anything out. 1813 */ 1814 cninit(); 1815 1816 #if JG 1817 if (metadata_missing) 1818 kprintf("WARNING: loader(8) metadata is missing!\n"); 1819 #endif 1820 1821 #if NISA >0 1822 isa_defaultirq(); 1823 #endif 1824 rand_initialize(); 1825 1826 #ifdef DDB 1827 kdb_init(); 1828 if (boothowto & RB_KDB) 1829 Debugger("Boot flags requested debugger"); 1830 #endif 1831 1832 #if JG 1833 finishidentcpu(); /* Final stage of CPU initialization */ 1834 setidt(6, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 1835 setidt(13, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 1836 #endif 1837 identify_cpu(); /* Final stage of CPU initialization */ 1838 initializecpu(); /* Initialize CPU registers */ 1839 1840 /* make an initial tss so cpu can get interrupt stack on syscall! */ 1841 gd->gd_common_tss.tss_rsp0 = 1842 (register_t)(thread0.td_kstack + 1843 KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb)); 1844 /* Ensure the stack is aligned to 16 bytes */ 1845 gd->gd_common_tss.tss_rsp0 &= ~(register_t)0xF; 1846 1847 /* double fault stack */ 1848 gd->gd_common_tss.tss_ist1 = 1849 (long)&gd->mi.gd_prvspace->idlestack[ 1850 sizeof(gd->mi.gd_prvspace->idlestack)]; 1851 1852 /* Set the IO permission bitmap (empty due to tss seg limit) */ 1853 gd->gd_common_tss.tss_iobase = sizeof(struct x86_64tss); 1854 1855 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 1856 gd->gd_tss_gdt = &gdt[GPROC0_SEL]; 1857 gd->gd_common_tssd = *gd->gd_tss_gdt; 1858 ltr(gsel_tss); 1859 1860 /* Set up the fast syscall stuff */ 1861 msr = rdmsr(MSR_EFER) | EFER_SCE; 1862 wrmsr(MSR_EFER, msr); 1863 wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall)); 1864 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); 1865 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | 1866 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); 1867 wrmsr(MSR_STAR, msr); 1868 wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D); 1869 1870 getmemsize(kmdp, physfree); 1871 init_param2(physmem); 1872 1873 /* now running on new page tables, configured,and u/iom is accessible */ 1874 1875 /* Map the message buffer. */ 1876 #if JG 1877 for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) 1878 pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off); 1879 #endif 1880 1881 msgbufinit(msgbufp, MSGBUF_SIZE); 1882 1883 1884 /* transfer to user mode */ 1885 1886 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); 1887 _udatasel = GSEL(GUDATA_SEL, SEL_UPL); 1888 _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL); 1889 1890 load_ds(_udatasel); 1891 load_es(_udatasel); 1892 load_fs(_udatasel); 1893 1894 /* setup proc 0's pcb */ 1895 thread0.td_pcb->pcb_flags = 0; 1896 thread0.td_pcb->pcb_cr3 = KPML4phys; 1897 thread0.td_pcb->pcb_ext = 0; 1898 lwp0.lwp_md.md_regs = &proc0_tf; /* XXX needed? */ 1899 1900 /* Location of kernel stack for locore */ 1901 return ((u_int64_t)thread0.td_pcb); 1902 } 1903 1904 /* 1905 * Initialize machine-dependant portions of the global data structure. 1906 * Note that the global data area and cpu0's idlestack in the private 1907 * data space were allocated in locore. 1908 * 1909 * Note: the idlethread's cpl is 0 1910 * 1911 * WARNING! Called from early boot, 'mycpu' may not work yet. 1912 */ 1913 void 1914 cpu_gdinit(struct mdglobaldata *gd, int cpu) 1915 { 1916 if (cpu) 1917 gd->mi.gd_curthread = &gd->mi.gd_idlethread; 1918 1919 lwkt_init_thread(&gd->mi.gd_idlethread, 1920 gd->mi.gd_prvspace->idlestack, 1921 sizeof(gd->mi.gd_prvspace->idlestack), 1922 0, &gd->mi); 1923 lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu); 1924 gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch; 1925 gd->mi.gd_idlethread.td_sp -= sizeof(void *); 1926 *(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore; 1927 } 1928 1929 int 1930 is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr) 1931 { 1932 if (saddr >= (vm_offset_t)&CPU_prvspace[0] && 1933 eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) { 1934 return (TRUE); 1935 } 1936 return (FALSE); 1937 } 1938 1939 struct globaldata * 1940 globaldata_find(int cpu) 1941 { 1942 KKASSERT(cpu >= 0 && cpu < ncpus); 1943 return(&CPU_prvspace[cpu].mdglobaldata.mi); 1944 } 1945 1946 #if defined(I586_CPU) && !defined(NO_F00F_HACK) 1947 static void f00f_hack(void *unused); 1948 SYSINIT(f00f_hack, SI_BOOT2_BIOS, SI_ORDER_ANY, f00f_hack, NULL); 1949 1950 static void 1951 f00f_hack(void *unused) 1952 { 1953 struct gate_descriptor *new_idt; 1954 vm_offset_t tmp; 1955 1956 if (!has_f00f_bug) 1957 return; 1958 1959 kprintf("Intel Pentium detected, installing workaround for F00F bug\n"); 1960 1961 r_idt.rd_limit = sizeof(idt0) - 1; 1962 1963 tmp = kmem_alloc(&kernel_map, PAGE_SIZE * 2); 1964 if (tmp == 0) 1965 panic("kmem_alloc returned 0"); 1966 if (((unsigned int)tmp & (PAGE_SIZE-1)) != 0) 1967 panic("kmem_alloc returned non-page-aligned memory"); 1968 /* Put the first seven entries in the lower page */ 1969 new_idt = (struct gate_descriptor*)(tmp + PAGE_SIZE - (7*8)); 1970 bcopy(idt, new_idt, sizeof(idt0)); 1971 r_idt.rd_base = (int)new_idt; 1972 lidt(&r_idt); 1973 idt = new_idt; 1974 if (vm_map_protect(&kernel_map, tmp, tmp + PAGE_SIZE, 1975 VM_PROT_READ, FALSE) != KERN_SUCCESS) 1976 panic("vm_map_protect failed"); 1977 return; 1978 } 1979 #endif /* defined(I586_CPU) && !NO_F00F_HACK */ 1980 1981 int 1982 ptrace_set_pc(struct lwp *lp, unsigned long addr) 1983 { 1984 lp->lwp_md.md_regs->tf_rip = addr; 1985 return (0); 1986 } 1987 1988 int 1989 ptrace_single_step(struct lwp *lp) 1990 { 1991 lp->lwp_md.md_regs->tf_rflags |= PSL_T; 1992 return (0); 1993 } 1994 1995 int 1996 fill_regs(struct lwp *lp, struct reg *regs) 1997 { 1998 struct trapframe *tp; 1999 2000 tp = lp->lwp_md.md_regs; 2001 bcopy(&tp->tf_rdi, ®s->r_rdi, sizeof(*regs)); 2002 return (0); 2003 } 2004 2005 int 2006 set_regs(struct lwp *lp, struct reg *regs) 2007 { 2008 struct trapframe *tp; 2009 2010 tp = lp->lwp_md.md_regs; 2011 if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) || 2012 !CS_SECURE(regs->r_cs)) 2013 return (EINVAL); 2014 bcopy(®s->r_rdi, &tp->tf_rdi, sizeof(*regs)); 2015 return (0); 2016 } 2017 2018 #ifndef CPU_DISABLE_SSE 2019 static void 2020 fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87) 2021 { 2022 struct env87 *penv_87 = &sv_87->sv_env; 2023 struct envxmm *penv_xmm = &sv_xmm->sv_env; 2024 int i; 2025 2026 /* FPU control/status */ 2027 penv_87->en_cw = penv_xmm->en_cw; 2028 penv_87->en_sw = penv_xmm->en_sw; 2029 penv_87->en_tw = penv_xmm->en_tw; 2030 penv_87->en_fip = penv_xmm->en_fip; 2031 penv_87->en_fcs = penv_xmm->en_fcs; 2032 penv_87->en_opcode = penv_xmm->en_opcode; 2033 penv_87->en_foo = penv_xmm->en_foo; 2034 penv_87->en_fos = penv_xmm->en_fos; 2035 2036 /* FPU registers */ 2037 for (i = 0; i < 8; ++i) 2038 sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc; 2039 2040 sv_87->sv_ex_sw = sv_xmm->sv_ex_sw; 2041 } 2042 2043 static void 2044 set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm) 2045 { 2046 struct env87 *penv_87 = &sv_87->sv_env; 2047 struct envxmm *penv_xmm = &sv_xmm->sv_env; 2048 int i; 2049 2050 /* FPU control/status */ 2051 penv_xmm->en_cw = penv_87->en_cw; 2052 penv_xmm->en_sw = penv_87->en_sw; 2053 penv_xmm->en_tw = penv_87->en_tw; 2054 penv_xmm->en_fip = penv_87->en_fip; 2055 penv_xmm->en_fcs = penv_87->en_fcs; 2056 penv_xmm->en_opcode = penv_87->en_opcode; 2057 penv_xmm->en_foo = penv_87->en_foo; 2058 penv_xmm->en_fos = penv_87->en_fos; 2059 2060 /* FPU registers */ 2061 for (i = 0; i < 8; ++i) 2062 sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i]; 2063 2064 sv_xmm->sv_ex_sw = sv_87->sv_ex_sw; 2065 } 2066 #endif /* CPU_DISABLE_SSE */ 2067 2068 int 2069 fill_fpregs(struct lwp *lp, struct fpreg *fpregs) 2070 { 2071 #ifndef CPU_DISABLE_SSE 2072 if (cpu_fxsr) { 2073 fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm, 2074 (struct save87 *)fpregs); 2075 return (0); 2076 } 2077 #endif /* CPU_DISABLE_SSE */ 2078 bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs); 2079 return (0); 2080 } 2081 2082 int 2083 set_fpregs(struct lwp *lp, struct fpreg *fpregs) 2084 { 2085 #ifndef CPU_DISABLE_SSE 2086 if (cpu_fxsr) { 2087 set_fpregs_xmm((struct save87 *)fpregs, 2088 &lp->lwp_thread->td_pcb->pcb_save.sv_xmm); 2089 return (0); 2090 } 2091 #endif /* CPU_DISABLE_SSE */ 2092 bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs); 2093 return (0); 2094 } 2095 2096 int 2097 fill_dbregs(struct lwp *lp, struct dbreg *dbregs) 2098 { 2099 if (lp == NULL) { 2100 dbregs->dr[0] = rdr0(); 2101 dbregs->dr[1] = rdr1(); 2102 dbregs->dr[2] = rdr2(); 2103 dbregs->dr[3] = rdr3(); 2104 dbregs->dr[4] = rdr4(); 2105 dbregs->dr[5] = rdr5(); 2106 dbregs->dr[6] = rdr6(); 2107 dbregs->dr[7] = rdr7(); 2108 } else { 2109 struct pcb *pcb; 2110 2111 pcb = lp->lwp_thread->td_pcb; 2112 dbregs->dr[0] = pcb->pcb_dr0; 2113 dbregs->dr[1] = pcb->pcb_dr1; 2114 dbregs->dr[2] = pcb->pcb_dr2; 2115 dbregs->dr[3] = pcb->pcb_dr3; 2116 dbregs->dr[4] = 0; 2117 dbregs->dr[5] = 0; 2118 dbregs->dr[6] = pcb->pcb_dr6; 2119 dbregs->dr[7] = pcb->pcb_dr7; 2120 } 2121 return (0); 2122 } 2123 2124 int 2125 set_dbregs(struct lwp *lp, struct dbreg *dbregs) 2126 { 2127 if (lp == NULL) { 2128 load_dr0(dbregs->dr[0]); 2129 load_dr1(dbregs->dr[1]); 2130 load_dr2(dbregs->dr[2]); 2131 load_dr3(dbregs->dr[3]); 2132 load_dr4(dbregs->dr[4]); 2133 load_dr5(dbregs->dr[5]); 2134 load_dr6(dbregs->dr[6]); 2135 load_dr7(dbregs->dr[7]); 2136 } else { 2137 struct pcb *pcb; 2138 struct ucred *ucred; 2139 int i; 2140 uint64_t mask1, mask2; 2141 2142 /* 2143 * Don't let an illegal value for dr7 get set. Specifically, 2144 * check for undefined settings. Setting these bit patterns 2145 * result in undefined behaviour and can lead to an unexpected 2146 * TRCTRAP. 2147 */ 2148 /* JG this loop looks unreadable */ 2149 /* Check 4 2-bit fields for invalid patterns. 2150 * These fields are R/Wi, for i = 0..3 2151 */ 2152 /* Is 10 in LENi allowed when running in compatibility mode? */ 2153 /* Pattern 10 in R/Wi might be used to indicate 2154 * breakpoint on I/O. Further analysis should be 2155 * carried to decide if it is safe and useful to 2156 * provide access to that capability 2157 */ 2158 for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4; 2159 i++, mask1 <<= 4, mask2 <<= 4) 2160 if ((dbregs->dr[7] & mask1) == mask2) 2161 return (EINVAL); 2162 2163 pcb = lp->lwp_thread->td_pcb; 2164 ucred = lp->lwp_proc->p_ucred; 2165 2166 /* 2167 * Don't let a process set a breakpoint that is not within the 2168 * process's address space. If a process could do this, it 2169 * could halt the system by setting a breakpoint in the kernel 2170 * (if ddb was enabled). Thus, we need to check to make sure 2171 * that no breakpoints are being enabled for addresses outside 2172 * process's address space, unless, perhaps, we were called by 2173 * uid 0. 2174 * 2175 * XXX - what about when the watched area of the user's 2176 * address space is written into from within the kernel 2177 * ... wouldn't that still cause a breakpoint to be generated 2178 * from within kernel mode? 2179 */ 2180 2181 if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) { 2182 if (dbregs->dr[7] & 0x3) { 2183 /* dr0 is enabled */ 2184 if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS) 2185 return (EINVAL); 2186 } 2187 2188 if (dbregs->dr[7] & (0x3<<2)) { 2189 /* dr1 is enabled */ 2190 if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS) 2191 return (EINVAL); 2192 } 2193 2194 if (dbregs->dr[7] & (0x3<<4)) { 2195 /* dr2 is enabled */ 2196 if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS) 2197 return (EINVAL); 2198 } 2199 2200 if (dbregs->dr[7] & (0x3<<6)) { 2201 /* dr3 is enabled */ 2202 if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS) 2203 return (EINVAL); 2204 } 2205 } 2206 2207 pcb->pcb_dr0 = dbregs->dr[0]; 2208 pcb->pcb_dr1 = dbregs->dr[1]; 2209 pcb->pcb_dr2 = dbregs->dr[2]; 2210 pcb->pcb_dr3 = dbregs->dr[3]; 2211 pcb->pcb_dr6 = dbregs->dr[6]; 2212 pcb->pcb_dr7 = dbregs->dr[7]; 2213 2214 pcb->pcb_flags |= PCB_DBREGS; 2215 } 2216 2217 return (0); 2218 } 2219 2220 /* 2221 * Return > 0 if a hardware breakpoint has been hit, and the 2222 * breakpoint was in user space. Return 0, otherwise. 2223 */ 2224 int 2225 user_dbreg_trap(void) 2226 { 2227 u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */ 2228 u_int64_t bp; /* breakpoint bits extracted from dr6 */ 2229 int nbp; /* number of breakpoints that triggered */ 2230 caddr_t addr[4]; /* breakpoint addresses */ 2231 int i; 2232 2233 dr7 = rdr7(); 2234 if ((dr7 & 0xff) == 0) { 2235 /* 2236 * all GE and LE bits in the dr7 register are zero, 2237 * thus the trap couldn't have been caused by the 2238 * hardware debug registers 2239 */ 2240 return 0; 2241 } 2242 2243 nbp = 0; 2244 dr6 = rdr6(); 2245 bp = dr6 & 0xf; 2246 2247 if (bp == 0) { 2248 /* 2249 * None of the breakpoint bits are set meaning this 2250 * trap was not caused by any of the debug registers 2251 */ 2252 return 0; 2253 } 2254 2255 /* 2256 * at least one of the breakpoints were hit, check to see 2257 * which ones and if any of them are user space addresses 2258 */ 2259 2260 if (bp & 0x01) { 2261 addr[nbp++] = (caddr_t)rdr0(); 2262 } 2263 if (bp & 0x02) { 2264 addr[nbp++] = (caddr_t)rdr1(); 2265 } 2266 if (bp & 0x04) { 2267 addr[nbp++] = (caddr_t)rdr2(); 2268 } 2269 if (bp & 0x08) { 2270 addr[nbp++] = (caddr_t)rdr3(); 2271 } 2272 2273 for (i=0; i<nbp; i++) { 2274 if (addr[i] < 2275 (caddr_t)VM_MAX_USER_ADDRESS) { 2276 /* 2277 * addr[i] is in user space 2278 */ 2279 return nbp; 2280 } 2281 } 2282 2283 /* 2284 * None of the breakpoints are in user space. 2285 */ 2286 return 0; 2287 } 2288 2289 2290 #ifndef DDB 2291 void 2292 Debugger(const char *msg) 2293 { 2294 kprintf("Debugger(\"%s\") called.\n", msg); 2295 } 2296 #endif /* no DDB */ 2297 2298 #ifdef DDB 2299 2300 /* 2301 * Provide inb() and outb() as functions. They are normally only 2302 * available as macros calling inlined functions, thus cannot be 2303 * called inside DDB. 2304 * 2305 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined. 2306 */ 2307 2308 #undef inb 2309 #undef outb 2310 2311 /* silence compiler warnings */ 2312 u_char inb(u_int); 2313 void outb(u_int, u_char); 2314 2315 u_char 2316 inb(u_int port) 2317 { 2318 u_char data; 2319 /* 2320 * We use %%dx and not %1 here because i/o is done at %dx and not at 2321 * %edx, while gcc generates inferior code (movw instead of movl) 2322 * if we tell it to load (u_short) port. 2323 */ 2324 __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); 2325 return (data); 2326 } 2327 2328 void 2329 outb(u_int port, u_char data) 2330 { 2331 u_char al; 2332 /* 2333 * Use an unnecessary assignment to help gcc's register allocator. 2334 * This make a large difference for gcc-1.40 and a tiny difference 2335 * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for 2336 * best results. gcc-2.6.0 can't handle this. 2337 */ 2338 al = data; 2339 __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); 2340 } 2341 2342 #endif /* DDB */ 2343 2344 2345 2346 #include "opt_cpu.h" 2347 2348 2349 /* 2350 * initialize all the SMP locks 2351 */ 2352 2353 /* critical region when masking or unmasking interupts */ 2354 struct spinlock_deprecated imen_spinlock; 2355 2356 /* critical region for old style disable_intr/enable_intr */ 2357 struct spinlock_deprecated mpintr_spinlock; 2358 2359 /* critical region around INTR() routines */ 2360 struct spinlock_deprecated intr_spinlock; 2361 2362 /* lock region used by kernel profiling */ 2363 struct spinlock_deprecated mcount_spinlock; 2364 2365 /* locks com (tty) data/hardware accesses: a FASTINTR() */ 2366 struct spinlock_deprecated com_spinlock; 2367 2368 /* lock regions around the clock hardware */ 2369 struct spinlock_deprecated clock_spinlock; 2370 2371 static void 2372 init_locks(void) 2373 { 2374 #ifdef SMP 2375 /* 2376 * Get the initial mplock with a count of 1 for the BSP. 2377 * This uses a LOGICAL cpu ID, ie BSP == 0. 2378 */ 2379 cpu_get_initial_mplock(); 2380 #endif 2381 /* DEPRECATED */ 2382 spin_lock_init(&mcount_spinlock); 2383 spin_lock_init(&intr_spinlock); 2384 spin_lock_init(&mpintr_spinlock); 2385 spin_lock_init(&imen_spinlock); 2386 spin_lock_init(&com_spinlock); 2387 spin_lock_init(&clock_spinlock); 2388 2389 /* our token pool needs to work early */ 2390 lwkt_token_pool_init(); 2391 } 2392 2393