/*-
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 2008 The DragonFly Project.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
 */

#include "use_ether.h"
//#include "use_npx.h"
#include "use_isa.h"
#include "opt_atalk.h"
#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_directio.h"
#include "opt_inet.h"
#include "opt_ipx.h"
#include "opt_msgbuf.h"
#include "opt_swap.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/priv.h>
#include <sys/buf.h>
#include <sys/reboot.h>
#include <sys/mbuf.h>
#include <sys/msgbuf.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/bus.h>
#include <sys/upcall.h>
#include <sys/usched.h>
#include <sys/reg.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>

#include <sys/user.h>
#include <sys/exec.h>
#include <sys/cons.h>

#include <ddb/ddb.h>

#include <machine/cpu.h>
#include <machine/clock.h>
#include <machine/specialreg.h>
#if JG
#include <machine/bootinfo.h>
#endif
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/pcb_ext.h>		/* pcb.h included via sys/user.h */
#include <machine/globaldata.h>		/* CPU_prvspace */
#include <machine/smp.h>
#ifdef PERFMON
#include <machine/perfmon.h>
#endif
#include <machine/cputypes.h>

#ifdef OLD_BUS_ARCH
#include <bus/isa/isa_device.h>
#endif
#include <machine_base/isa/intr_machdep.h>
#include <bus/isa/rtc.h>
#include <sys/random.h>
#include <sys/ptrace.h>
#include <machine/sigframe.h>

#define PHYSMAP_ENTRIES		10

extern void init386(int first);
extern void dblfault_handler(void);
extern u_int64_t hammer_time(u_int64_t, u_int64_t);

extern void printcpuinfo(void);	/* XXX header file */
extern void identify_cpu(void);
#if JG
extern void finishidentcpu(void);
#endif
extern void panicifcpuunsupported(void);

static void cpu_startup(void *);
#ifndef CPU_DISABLE_SSE
static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
#endif /* CPU_DISABLE_SSE */
#ifdef DIRECTIO
extern void ffs_rawread_setup(void);
#endif /* DIRECTIO */
static void init_locks(void);

SYSINIT(cpu, SI_BOOT2_SMP, SI_ORDER_FIRST, cpu_startup, NULL)

#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif

uint64_t KPTphys;
uint64_t SMPptpa;
pt_entry_t *SMPpt;


struct privatespace CPU_prvspace[MAXCPU];

int	_udatasel, _ucodesel, _ucode32sel;
u_long	atdevbase;
#ifdef SMP
int64_t tsc_offsets[MAXCPU];
#else
int64_t tsc_offsets[1];
#endif

#if defined(SWTCH_OPTIM_STATS)
extern int swtch_optim_stats;
SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
	CTLFLAG_RD, &swtch_optim_stats, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
	CTLFLAG_RD,
	&tlb_flush_count, 0, "");
#endif

int physmem = 0;

static int
sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
{
	int error = sysctl_handle_int(oidp, 0, ctob(physmem), req);
	return (error);
}

SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD,
	0, 0, sysctl_hw_physmem, "IU", "");

static int
sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
{
	int error = sysctl_handle_int(oidp, 0,
		ctob(physmem - vmstats.v_wire_count), req);
	return (error);
}

SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD,
	0, 0, sysctl_hw_usermem, "IU", "");

static int
sysctl_hw_availpages(SYSCTL_HANDLER_ARGS)
{
	int error = sysctl_handle_int(oidp, 0,
		x86_64_btop(avail_end - avail_start), req);
	return (error);
}

SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD,
	0, 0, sysctl_hw_availpages, "I", "");

vm_paddr_t Maxmem = 0;

/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
#define PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))

vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* must be 2 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
#define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)

static vm_offset_t buffer_sva, buffer_eva;
vm_offset_t clean_sva, clean_eva;
static vm_offset_t pager_sva, pager_eva;
static struct trapframe proc0_tf;

static void
cpu_startup(void *dummy)
{
	caddr_t v;
	vm_size_t size = 0;
	vm_offset_t firstaddr;

	if (boothowto & RB_VERBOSE)
		bootverbose++;

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	kprintf("%s", version);
	startrtclock();
	printcpuinfo();
	panicifcpuunsupported();
#ifdef PERFMON
	perfmon_init();
#endif
	kprintf("real memory = %ju (%juK bytes)\n",
		(intmax_t)ptoa(Maxmem),
		(intmax_t)ptoa(Maxmem) / 1024);
	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		kprintf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size1 = phys_avail[indx + 1] - phys_avail[indx];

			kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
				(intmax_t)phys_avail[indx],
				(intmax_t)phys_avail[indx + 1] - 1,
				(intmax_t)size1,
				(intmax_t)(size1 / PAGE_SIZE));
		}
	}

	/*
	 * Allocate space for system data structures.
	 * The first available kernel virtual address is in "v".
	 * As pages of kernel virtual memory are allocated, "v" is incremented.
	 * As pages of memory are allocated and cleared,
	 * "firstaddr" is incremented.
	 * An index into the kernel page table corresponding to the
	 * virtual memory address maintained in "v" is kept in "mapaddr".
	 */

	/*
	 * Make two passes.  The first pass calculates how much memory is
	 * needed and allocates it.  The second pass assigns virtual
	 * addresses to the various data structures.
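	 *
	 * (Pass 1 runs with firstaddr == 0, so the valloc() macros below
	 * only advance "v" to measure the required size; pass 2 replays
	 * the same valloc() calls with "v" rebased to the kmem_alloc()'d
	 * block, assigning the final addresses.)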
285 */ 286 firstaddr = 0; 287 again: 288 v = (caddr_t)firstaddr; 289 290 #define valloc(name, type, num) \ 291 (name) = (type *)v; v = (caddr_t)((name)+(num)) 292 #define valloclim(name, type, num, lim) \ 293 (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) 294 295 /* 296 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. 297 * For the first 64MB of ram nominally allocate sufficient buffers to 298 * cover 1/4 of our ram. Beyond the first 64MB allocate additional 299 * buffers to cover 1/20 of our ram over 64MB. When auto-sizing 300 * the buffer cache we limit the eventual kva reservation to 301 * maxbcache bytes. 302 * 303 * factor represents the 1/4 x ram conversion. 304 */ 305 if (nbuf == 0) { 306 int factor = 4 * BKVASIZE / 1024; 307 int kbytes = physmem * (PAGE_SIZE / 1024); 308 309 nbuf = 50; 310 if (kbytes > 4096) 311 nbuf += min((kbytes - 4096) / factor, 65536 / factor); 312 if (kbytes > 65536) 313 nbuf += (kbytes - 65536) * 2 / (factor * 5); 314 if (maxbcache && nbuf > maxbcache / BKVASIZE) 315 nbuf = maxbcache / BKVASIZE; 316 } 317 318 /* 319 * Do not allow the buffer_map to be more then 1/2 the size of the 320 * kernel_map. 321 */ 322 if (nbuf > (virtual_end - virtual_start) / (BKVASIZE * 2)) { 323 nbuf = (virtual_end - virtual_start) / (BKVASIZE * 2); 324 kprintf("Warning: nbufs capped at %d\n", nbuf); 325 } 326 327 nswbuf = max(min(nbuf/4, 256), 16); 328 #ifdef NSWBUF_MIN 329 if (nswbuf < NSWBUF_MIN) 330 nswbuf = NSWBUF_MIN; 331 #endif 332 #ifdef DIRECTIO 333 ffs_rawread_setup(); 334 #endif 335 336 valloc(swbuf, struct buf, nswbuf); 337 valloc(buf, struct buf, nbuf); 338 339 /* 340 * End of first pass, size has been calculated so allocate memory 341 */ 342 if (firstaddr == 0) { 343 size = (vm_size_t)(v - firstaddr); 344 firstaddr = kmem_alloc(&kernel_map, round_page(size)); 345 if (firstaddr == 0) 346 panic("startup: no room for tables"); 347 goto again; 348 } 349 350 /* 351 * End of second pass, addresses have been assigned 352 */ 353 if ((vm_size_t)(v - firstaddr) != size) 354 panic("startup: table size inconsistency"); 355 356 kmem_suballoc(&kernel_map, &clean_map, &clean_sva, &clean_eva, 357 (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size); 358 kmem_suballoc(&clean_map, &buffer_map, &buffer_sva, &buffer_eva, 359 (nbuf*BKVASIZE)); 360 buffer_map.system_map = 1; 361 kmem_suballoc(&clean_map, &pager_map, &pager_sva, &pager_eva, 362 (nswbuf*MAXPHYS) + pager_map_size); 363 pager_map.system_map = 1; 364 365 #if defined(USERCONFIG) 366 userconfig(); 367 cninit(); /* the preferred console may have changed */ 368 #endif 369 370 kprintf("avail memory = %lu (%luK bytes)\n", 371 ptoa(vmstats.v_free_count), 372 ptoa(vmstats.v_free_count) / 1024); 373 374 /* 375 * Set up buffers, so they can be used to read disk labels. 376 */ 377 bufinit(); 378 vm_pager_bufferinit(); 379 380 #ifdef SMP 381 /* 382 * OK, enough kmem_alloc/malloc state should be up, lets get on with it! 383 */ 384 mp_start(); /* fire up the APs and APICs */ 385 mp_announce(); 386 #endif /* SMP */ 387 cpu_setregs(); 388 } 389 390 /* 391 * Send an interrupt to process. 392 * 393 * Stack is set up to allow sigcode stored 394 * at top to call routine, followed by kcall 395 * to sigreturn routine below. After sigreturn 396 * resets the signal mask, the stack, and the 397 * frame pointer, it returns to the user 398 * specified pc, psl. 
399 */ 400 void 401 sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code) 402 { 403 struct lwp *lp = curthread->td_lwp; 404 struct proc *p = lp->lwp_proc; 405 struct trapframe *regs; 406 struct sigacts *psp = p->p_sigacts; 407 struct sigframe sf, *sfp; 408 int oonstack; 409 char *sp; 410 411 regs = lp->lwp_md.md_regs; 412 oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0; 413 414 /* Save user context */ 415 bzero(&sf, sizeof(struct sigframe)); 416 sf.sf_uc.uc_sigmask = *mask; 417 sf.sf_uc.uc_stack = lp->lwp_sigstk; 418 sf.sf_uc.uc_mcontext.mc_onstack = oonstack; 419 KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0); 420 bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe)); 421 422 /* Make the size of the saved context visible to userland */ 423 sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); 424 425 /* Save mailbox pending state for syscall interlock semantics */ 426 if (p->p_flag & P_MAILBOX) 427 sf.sf_uc.uc_mcontext.mc_xflags |= PGEX_MAILBOX; 428 429 /* Allocate and validate space for the signal handler context. */ 430 if ((lp->lwp_flag & LWP_ALTSTACK) != 0 && !oonstack && 431 SIGISMEMBER(psp->ps_sigonstack, sig)) { 432 sp = (char *)(lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size - 433 sizeof(struct sigframe)); 434 lp->lwp_sigstk.ss_flags |= SS_ONSTACK; 435 } else { 436 /* We take red zone into account */ 437 sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128; 438 } 439 440 /* Align to 16 bytes */ 441 sfp = (struct sigframe *)((intptr_t)sp & ~0xFUL); 442 443 /* Translate the signal is appropriate */ 444 if (p->p_sysent->sv_sigtbl) { 445 if (sig <= p->p_sysent->sv_sigsize) 446 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 447 } 448 449 /* 450 * Build the argument list for the signal handler. 451 * 452 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx) 453 */ 454 regs->tf_rdi = sig; /* argument 1 */ 455 regs->tf_rdx = (register_t)&sfp->sf_uc; /* argument 3 */ 456 457 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 458 /* 459 * Signal handler installed with SA_SIGINFO. 460 * 461 * action(signo, siginfo, ucontext) 462 */ 463 regs->tf_rsi = (register_t)&sfp->sf_si; /* argument 2 */ 464 regs->tf_rcx = (register_t)regs->tf_err; /* argument 4 */ 465 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 466 467 /* fill siginfo structure */ 468 sf.sf_si.si_signo = sig; 469 sf.sf_si.si_code = code; 470 sf.sf_si.si_addr = (void *)regs->tf_err; 471 } else { 472 /* 473 * Old FreeBSD-style arguments. 474 * 475 * handler (signo, code, [uc], addr) 476 */ 477 regs->tf_rsi = (register_t)code; /* argument 2 */ 478 regs->tf_rcx = (register_t)regs->tf_err; /* argument 4 */ 479 sf.sf_ahu.sf_handler = catcher; 480 } 481 482 /* 483 * If we're a vm86 process, we want to save the segment registers. 484 * We also change eflags to be our emulated eflags, not the actual 485 * eflags. 486 */ 487 #if JG 488 if (regs->tf_eflags & PSL_VM) { 489 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 490 struct vm86_kernel *vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86; 491 492 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; 493 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; 494 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; 495 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; 496 497 if (vm86->vm86_has_vme == 0) 498 sf.sf_uc.uc_mcontext.mc_eflags = 499 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 500 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 501 502 /* 503 * Clear PSL_NT to inhibit T_TSSFLT faults on return from 504 * syscalls made by the signal handler. 
		 * This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}
#endif

	/*
	 * Save the FPU state and reinit the FP unit
	 */
	npxpush(&sf.sf_uc.uc_mcontext);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
		/*
		 * Something is wrong with the stack pointer.
		 * ...Kill the process.
		 */
		sigexit(lp, SIGILL);
	}

	regs->tf_rsp = (register_t)sfp;
	regs->tf_rip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);

	/*
	 * i386 abi specifies that the direction flag must be cleared
	 * on function entry
	 */
	regs->tf_rflags &= ~(PSL_T|PSL_D);

	/*
	 * 64 bit mode has a code and stack selector but
	 * no data or extra selector.  %fs and %gs are not
	 * stored in-context.
	 */
	regs->tf_cs = _ucodesel;
	regs->tf_ss = _udatasel;
}

/*
 * Sanitize the trapframe for a virtual kernel passing control to a custom
 * VM context.  Remove any items that would otherwise create a privilege
 * issue.
 *
 * XXX at the moment we allow userland to set the resume flag.  Is this a
 * bad idea?
 */
int
cpu_sanitize_frame(struct trapframe *frame)
{
	frame->tf_cs = _ucodesel;
	frame->tf_ss = _udatasel;
	/* XXX VM (8086) mode not supported? */
	frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP);
	frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I;

	return(0);
}

/*
 * Sanitize the tls so loading the descriptor does not blow up
 * on us.  For x86_64 we don't have to do anything.
 */
int
cpu_sanitize_tls(struct savetls *tls)
{
	return(0);
}

/*
 * sigreturn(ucontext_t *sigcntxp)
 *
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)

int
sys_sigreturn(struct sigreturn_args *uap)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = lp->lwp_proc;
	struct trapframe *regs;
	ucontext_t uc;
	ucontext_t *ucp;
	register_t rflags;
	int cs;
	int error;

	/*
	 * We have to copy the information into kernel space so userland
	 * can't modify it while we are sniffing it.
	 */
	regs = lp->lwp_md.md_regs;
	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error)
		return (error);
	ucp = &uc;
	rflags = ucp->uc_mcontext.mc_rflags;

	/* VM (8086) mode not supported */
	rflags &= ~PSL_VM_UNSUPP;

#if JG
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
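		 *
		 * (This vm86 path is i386 heritage and is compiled out
		 * here; x86_64 already rejects PSL_VM via PSL_VM_UNSUPP
		 * above.)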
627 */ 628 if (lp->lwp_thread->td_pcb->pcb_ext == 0) 629 return (EINVAL); 630 vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86; 631 if (vm86->vm86_inited == 0) 632 return (EINVAL); 633 634 /* go back to user mode if both flags are set */ 635 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) 636 trapsignal(lp, SIGBUS, 0); 637 638 if (vm86->vm86_has_vme) { 639 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 640 (eflags & VME_USERCHANGE) | PSL_VM; 641 } else { 642 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 643 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 644 (eflags & VM_USERCHANGE) | PSL_VM; 645 } 646 bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe)); 647 tf->tf_eflags = eflags; 648 tf->tf_vm86_ds = tf->tf_ds; 649 tf->tf_vm86_es = tf->tf_es; 650 tf->tf_vm86_fs = tf->tf_fs; 651 tf->tf_vm86_gs = tf->tf_gs; 652 tf->tf_ds = _udatasel; 653 tf->tf_es = _udatasel; 654 tf->tf_fs = _udatasel; 655 tf->tf_gs = _udatasel; 656 } else 657 #endif 658 { 659 /* 660 * Don't allow users to change privileged or reserved flags. 661 */ 662 /* 663 * XXX do allow users to change the privileged flag PSL_RF. 664 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers 665 * should sometimes set it there too. tf_eflags is kept in 666 * the signal context during signal handling and there is no 667 * other place to remember it, so the PSL_RF bit may be 668 * corrupted by the signal handler without us knowing. 669 * Corruption of the PSL_RF bit at worst causes one more or 670 * one less debugger trap, so allowing it is fairly harmless. 671 */ 672 if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) { 673 kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags); 674 return(EINVAL); 675 } 676 677 /* 678 * Don't allow users to load a valid privileged %cs. Let the 679 * hardware check for invalid selectors, excess privilege in 680 * other selectors, invalid %eip's and invalid %esp's. 681 */ 682 cs = ucp->uc_mcontext.mc_cs; 683 if (!CS_SECURE(cs)) { 684 kprintf("sigreturn: cs = 0x%x\n", cs); 685 trapsignal(lp, SIGBUS, T_PROTFLT); 686 return(EINVAL); 687 } 688 bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(struct trapframe)); 689 } 690 691 /* 692 * Restore the FPU state from the frame 693 */ 694 crit_enter(); 695 npxpop(&ucp->uc_mcontext); 696 697 /* 698 * Merge saved signal mailbox pending flag to maintain interlock 699 * semantics against system calls. 700 */ 701 if (ucp->uc_mcontext.mc_xflags & PGEX_MAILBOX) 702 p->p_flag |= P_MAILBOX; 703 704 if (ucp->uc_mcontext.mc_onstack & 1) 705 lp->lwp_sigstk.ss_flags |= SS_ONSTACK; 706 else 707 lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK; 708 709 lp->lwp_sigmask = ucp->uc_sigmask; 710 SIG_CANTMASK(lp->lwp_sigmask); 711 crit_exit(); 712 return(EJUSTRETURN); 713 } 714 715 /* 716 * Stack frame on entry to function. %rax will contain the function vector, 717 * %rcx will contain the function data. flags, rcx, and rax will have 718 * already been pushed on the stack. 719 */ 720 struct upc_frame { 721 register_t rax; 722 register_t rcx; 723 register_t rdx; 724 register_t flags; 725 register_t oldip; 726 }; 727 728 void 729 sendupcall(struct vmupcall *vu, int morepending) 730 { 731 struct lwp *lp = curthread->td_lwp; 732 struct trapframe *regs; 733 struct upcall upcall; 734 struct upc_frame upc_frame; 735 int crit_count = 0; 736 737 /* 738 * If we are a virtual kernel running an emulated user process 739 * context, switch back to the virtual kernel context before 740 * trying to post the signal. 
741 */ 742 if (lp->lwp_vkernel && lp->lwp_vkernel->ve) { 743 lp->lwp_md.md_regs->tf_trapno = 0; 744 vkernel_trap(lp, lp->lwp_md.md_regs); 745 } 746 747 /* 748 * Get the upcall data structure 749 */ 750 if (copyin(lp->lwp_upcall, &upcall, sizeof(upcall)) || 751 copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int)) 752 ) { 753 vu->vu_pending = 0; 754 kprintf("bad upcall address\n"); 755 return; 756 } 757 758 /* 759 * If the data structure is already marked pending or has a critical 760 * section count, mark the data structure as pending and return 761 * without doing an upcall. vu_pending is left set. 762 */ 763 if (upcall.upc_pending || crit_count >= vu->vu_pending) { 764 if (upcall.upc_pending < vu->vu_pending) { 765 upcall.upc_pending = vu->vu_pending; 766 copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending, 767 sizeof(upcall.upc_pending)); 768 } 769 return; 770 } 771 772 /* 773 * We can run this upcall now, clear vu_pending. 774 * 775 * Bump our critical section count and set or clear the 776 * user pending flag depending on whether more upcalls are 777 * pending. The user will be responsible for calling 778 * upc_dispatch(-1) to process remaining upcalls. 779 */ 780 vu->vu_pending = 0; 781 upcall.upc_pending = morepending; 782 crit_count += TDPRI_CRIT; 783 copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending, 784 sizeof(upcall.upc_pending)); 785 copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, 786 sizeof(int)); 787 788 /* 789 * Construct a stack frame and issue the upcall 790 */ 791 regs = lp->lwp_md.md_regs; 792 upc_frame.rax = regs->tf_rax; 793 upc_frame.rcx = regs->tf_rcx; 794 upc_frame.rdx = regs->tf_rdx; 795 upc_frame.flags = regs->tf_rflags; 796 upc_frame.oldip = regs->tf_rip; 797 if (copyout(&upc_frame, (void *)(regs->tf_rsp - sizeof(upc_frame)), 798 sizeof(upc_frame)) != 0) { 799 kprintf("bad stack on upcall\n"); 800 } else { 801 regs->tf_rax = (register_t)vu->vu_func; 802 regs->tf_rcx = (register_t)vu->vu_data; 803 regs->tf_rdx = (register_t)lp->lwp_upcall; 804 regs->tf_rip = (register_t)vu->vu_ctx; 805 regs->tf_rsp -= sizeof(upc_frame); 806 } 807 } 808 809 /* 810 * fetchupcall occurs in the context of a system call, which means that 811 * we have to return EJUSTRETURN in order to prevent eax and edx from 812 * being overwritten by the syscall return value. 813 * 814 * if vu is not NULL we return the new context in %edx, the new data in %ecx, 815 * and the function pointer in %eax. 816 */ 817 int 818 fetchupcall(struct vmupcall *vu, int morepending, void *rsp) 819 { 820 struct upc_frame upc_frame; 821 struct lwp *lp = curthread->td_lwp; 822 struct trapframe *regs; 823 int error; 824 struct upcall upcall; 825 int crit_count; 826 827 regs = lp->lwp_md.md_regs; 828 829 error = copyout(&morepending, &lp->lwp_upcall->upc_pending, sizeof(int)); 830 if (error == 0) { 831 if (vu) { 832 /* 833 * This jumps us to the next ready context. 
834 */ 835 vu->vu_pending = 0; 836 error = copyin(lp->lwp_upcall, &upcall, sizeof(upcall)); 837 crit_count = 0; 838 if (error == 0) 839 error = copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int)); 840 crit_count += TDPRI_CRIT; 841 if (error == 0) 842 error = copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, sizeof(int)); 843 regs->tf_rax = (register_t)vu->vu_func; 844 regs->tf_rcx = (register_t)vu->vu_data; 845 regs->tf_rdx = (register_t)lp->lwp_upcall; 846 regs->tf_rip = (register_t)vu->vu_ctx; 847 regs->tf_rsp = (register_t)rsp; 848 } else { 849 /* 850 * This returns us to the originally interrupted code. 851 */ 852 error = copyin(rsp, &upc_frame, sizeof(upc_frame)); 853 regs->tf_rax = upc_frame.rax; 854 regs->tf_rcx = upc_frame.rcx; 855 regs->tf_rdx = upc_frame.rdx; 856 regs->tf_rflags = (regs->tf_rflags & ~PSL_USERCHANGE) | 857 (upc_frame.flags & PSL_USERCHANGE); 858 regs->tf_rip = upc_frame.oldip; 859 regs->tf_rsp = (register_t)((char *)rsp + sizeof(upc_frame)); 860 } 861 } 862 if (error == 0) 863 error = EJUSTRETURN; 864 return(error); 865 } 866 867 /* 868 * Machine dependent boot() routine 869 * 870 * I haven't seen anything to put here yet 871 * Possibly some stuff might be grafted back here from boot() 872 */ 873 void 874 cpu_boot(int howto) 875 { 876 } 877 878 /* 879 * Shutdown the CPU as much as possible 880 */ 881 void 882 cpu_halt(void) 883 { 884 for (;;) 885 __asm__ __volatile("hlt"); 886 } 887 888 /* 889 * cpu_idle() represents the idle LWKT. You cannot return from this function 890 * (unless you want to blow things up!). Instead we look for runnable threads 891 * and loop or halt as appropriate. Giant is not held on entry to the thread. 892 * 893 * The main loop is entered with a critical section held, we must release 894 * the critical section before doing anything else. lwkt_switch() will 895 * check for pending interrupts due to entering and exiting its own 896 * critical section. 897 * 898 * Note on cpu_idle_hlt: On an SMP system we rely on a scheduler IPI 899 * to wake a HLTed cpu up. However, there are cases where the idlethread 900 * will be entered with the possibility that no IPI will occur and in such 901 * cases lwkt_switch() sets TDF_IDLE_NOHLT. 902 */ 903 static int cpu_idle_hlt = 1; 904 static int cpu_idle_hltcnt; 905 static int cpu_idle_spincnt; 906 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW, 907 &cpu_idle_hlt, 0, "Idle loop HLT enable"); 908 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hltcnt, CTLFLAG_RW, 909 &cpu_idle_hltcnt, 0, "Idle loop entry halts"); 910 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_spincnt, CTLFLAG_RW, 911 &cpu_idle_spincnt, 0, "Idle loop entry spins"); 912 913 static void 914 cpu_idle_default_hook(void) 915 { 916 /* 917 * We must guarentee that hlt is exactly the instruction 918 * following the sti. 919 */ 920 __asm __volatile("sti; hlt"); 921 } 922 923 /* Other subsystems (e.g., ACPI) can hook this later. */ 924 void (*cpu_idle_hook)(void) = cpu_idle_default_hook; 925 926 void 927 cpu_idle(void) 928 { 929 struct thread *td = curthread; 930 931 crit_exit(); 932 KKASSERT(td->td_pri < TDPRI_CRIT); 933 for (;;) { 934 /* 935 * See if there are any LWKTs ready to go. 936 */ 937 lwkt_switch(); 938 939 /* 940 * If we are going to halt call splz unconditionally after 941 * CLIing to catch any interrupt races. Note that we are 942 * at SPL0 and interrupts are enabled. 
943 */ 944 if (cpu_idle_hlt && !lwkt_runnable() && 945 (td->td_flags & TDF_IDLE_NOHLT) == 0) { 946 __asm __volatile("cli"); 947 splz(); 948 if (!lwkt_runnable()) 949 cpu_idle_hook(); 950 #ifdef SMP 951 else 952 __asm __volatile("pause"); 953 #endif 954 ++cpu_idle_hltcnt; 955 } else { 956 td->td_flags &= ~TDF_IDLE_NOHLT; 957 splz(); 958 #ifdef SMP 959 __asm __volatile("sti; pause"); 960 #else 961 __asm __volatile("sti"); 962 #endif 963 ++cpu_idle_spincnt; 964 } 965 } 966 } 967 968 #ifdef SMP 969 970 /* 971 * This routine is called when the only runnable threads require 972 * the MP lock, and the scheduler couldn't get it. On a real cpu 973 * we let the scheduler spin. 974 */ 975 void 976 cpu_mplock_contested(void) 977 { 978 cpu_pause(); 979 } 980 981 /* 982 * This routine is called if a spinlock has been held through the 983 * exponential backoff period and is seriously contested. On a real cpu 984 * we let it spin. 985 */ 986 void 987 cpu_spinlock_contested(void) 988 { 989 cpu_pause(); 990 } 991 992 #endif 993 994 /* 995 * Clear registers on exec 996 */ 997 void 998 exec_setregs(u_long entry, u_long stack, u_long ps_strings) 999 { 1000 struct thread *td = curthread; 1001 struct lwp *lp = td->td_lwp; 1002 struct pcb *pcb = td->td_pcb; 1003 struct trapframe *regs = lp->lwp_md.md_regs; 1004 1005 /* was i386_user_cleanup() in NetBSD */ 1006 user_ldt_free(pcb); 1007 1008 bzero((char *)regs, sizeof(struct trapframe)); 1009 regs->tf_rip = entry; 1010 regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; /* align the stack */ 1011 regs->tf_rdi = stack; /* argv */ 1012 regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); 1013 regs->tf_ss = _udatasel; 1014 regs->tf_cs = _ucodesel; 1015 regs->tf_rbx = ps_strings; 1016 1017 /* 1018 * Reset the hardware debug registers if they were in use. 1019 * They won't have any meaning for the newly exec'd process. 1020 */ 1021 if (pcb->pcb_flags & PCB_DBREGS) { 1022 pcb->pcb_dr0 = 0; 1023 pcb->pcb_dr1 = 0; 1024 pcb->pcb_dr2 = 0; 1025 pcb->pcb_dr3 = 0; 1026 pcb->pcb_dr6 = 0; 1027 pcb->pcb_dr7 = 0; /* JG set bit 10? */ 1028 if (pcb == td->td_pcb) { 1029 /* 1030 * Clear the debug registers on the running 1031 * CPU, otherwise they will end up affecting 1032 * the next process we switch to. 1033 */ 1034 reset_dbregs(); 1035 } 1036 pcb->pcb_flags &= ~PCB_DBREGS; 1037 } 1038 1039 /* 1040 * Initialize the math emulator (if any) for the current process. 1041 * Actually, just clear the bit that says that the emulator has 1042 * been initialized. Initialization is delayed until the process 1043 * traps to the emulator (if it is done at all) mainly because 1044 * emulators don't provide an entry point for initialization. 1045 */ 1046 pcb->pcb_flags &= ~FP_SOFTFP; 1047 1048 /* 1049 * NOTE: do not set CR0_TS here. npxinit() must do it after clearing 1050 * gd_npxthread. Otherwise a preemptive interrupt thread 1051 * may panic in npxdna(). 1052 */ 1053 crit_enter(); 1054 load_cr0(rcr0() | CR0_MP); 1055 1056 /* 1057 * NOTE: The MSR values must be correct so we can return to 1058 * userland. gd_user_fs/gs must be correct so the switch 1059 * code knows what the current MSR values are. 1060 */ 1061 pcb->pcb_fsbase = 0; /* Values loaded from PCB on switch */ 1062 pcb->pcb_gsbase = 0; 1063 mdcpu->gd_user_fs = 0; /* Cache of current MSR values */ 1064 mdcpu->gd_user_gs = 0; 1065 wrmsr(MSR_FSBASE, 0); /* Set MSR values for return to userland */ 1066 wrmsr(MSR_KGSBASE, 0); 1067 1068 /* Initialize the npx (if any) for the current process. 
	 */
	npxinit(__INITIAL_NPXCW__);
	crit_exit();

	pcb->pcb_ds = _udatasel;
	pcb->pcb_es = _udatasel;
	pcb->pcb_fs = _udatasel;
	pcb->pcb_gs = _udatasel;
}

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	cr0 |= CR0_NE;			/* Done by npxinit() */
	cr0 |= CR0_MP | CR0_TS;		/* Done at every execve() too. */
	cr0 |= CR0_WP | CR0_AM;
	load_cr0(cr0);
	load_gs(_udatasel);
}

static int
sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
{
	int error;
	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
		req);
	if (!error && req->newptr)
		resettodr();
	return (error);
}

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
	&adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
	CTLFLAG_RW, &disable_rtc_set, 0, "");

#if JG
SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
	CTLFLAG_RD, &bootinfo, bootinfo, "");
#endif

SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
	CTLFLAG_RW, &wall_cmos_clock, 0, "");

extern u_long bootdev;		/* not a cdev_t - encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
	CTLFLAG_RD, &bootdev, 0, "Boot device (not in cdev_t format)");

/*
 * Initialize x86_64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

int _default_ldt;
struct user_segment_descriptor gdt[NGDT * MAXCPU];	/* global descriptor table */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
#if JG
union descriptor ldt[NLDT];		/* local descriptor table */
#endif

/* table descriptors - used to load tables by cpu */
struct region_descriptor r_gdt, r_idt;

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif

static char dblfault_stack[PAGE_SIZE] __aligned(16);

/* JG proc0paddr is a virtual address */
void *proc0paddr;
/* JG alignment?
 */
char proc0paddr_buff[LWKT_THREAD_STACK];


/* software prototypes -- in more palatable form */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	0x0,			/* segment base address */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0,			/* long */
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GCODE_SEL	1 Code Descriptor for kernel */
{	0x0,			/* segment base address */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_KPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	1,			/* long */
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GDATA_SEL	2 Data Descriptor for kernel */
{	0x0,			/* segment base address */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_KPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	1,			/* long */
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUCODE32_SEL	3 32 bit Code Descriptor for user */
{	0x0,			/* segment base address */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0,			/* long */
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUDATA_SEL	4 32/64 bit Data Descriptor for user */
{	0x0,			/* segment base address */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0,			/* long */
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUCODE_SEL	5 64 bit Code Descriptor for user */
{	0x0,			/* segment base address */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	1,			/* long */
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GPROC0_SEL	6 Proc 0 Tss Descriptor */
{
	0x0,			/* segment base address */
	sizeof(struct x86_64tss)-1,/* length - all address space */
	SDT_SYSTSS,		/* segment type */
	SEL_KPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0,			/* long */
	0,			/* unused - default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* Actually, the TSS is a system descriptor which is double size */
{	0x0,			/* segment base address */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0,			/* long */
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GUGS32_SEL	8 32 bit GS Descriptor for user */
{	0x0,			/* segment base address */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0,			/* long */
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
};

void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func)>>16;
}

#define	IDTVEC(name)	__CONCAT(X,name)

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);

#ifdef DEBUG_INTERRUPTS
extern inthand_t *Xrsvdary[256];
#endif

void
sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
{
	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

void
ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;
}

void
ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
}

u_int basemem;

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * If we cannot accurately determine the physical memory map, then use
 * the value from the 0xE801 call, and failing that, the RTC.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, off, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
	u_long physmem_tunable;
	pt_entry_t *pte;
	struct bios_smap *smapbase, *smap, *smapend;
	u_int32_t smapsize;
	quad_t dcons_addr, dcons_size;

	bzero(physmap, sizeof(physmap));
	basemem = 0;
	physmap_idx = 0;

	/*
	 * get memory map from INT 15:E820, kindly supplied by the loader.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
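	 * That is why smapsize is fetched below as
	 * *((u_int32_t *)smapbase - 1).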
1346 */ 1347 smapbase = (struct bios_smap *)preload_search_info(kmdp, 1348 MODINFO_METADATA | MODINFOMD_SMAP); 1349 if (smapbase == NULL) 1350 panic("No BIOS smap info from loader!"); 1351 1352 smapsize = *((u_int32_t *)smapbase - 1); 1353 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); 1354 1355 for (smap = smapbase; smap < smapend; smap++) { 1356 if (boothowto & RB_VERBOSE) 1357 kprintf("SMAP type=%02x base=%016lx len=%016lx\n", 1358 smap->type, smap->base, smap->length); 1359 1360 if (smap->type != SMAP_TYPE_MEMORY) 1361 continue; 1362 1363 if (smap->length == 0) 1364 continue; 1365 1366 for (i = 0; i <= physmap_idx; i += 2) { 1367 if (smap->base < physmap[i + 1]) { 1368 if (boothowto & RB_VERBOSE) 1369 kprintf( 1370 "Overlapping or non-monotonic memory region, ignoring second region\n"); 1371 continue; 1372 } 1373 } 1374 1375 if (smap->base == physmap[physmap_idx + 1]) { 1376 physmap[physmap_idx + 1] += smap->length; 1377 continue; 1378 } 1379 1380 physmap_idx += 2; 1381 if (physmap_idx == PHYSMAP_SIZE) { 1382 kprintf( 1383 "Too many segments in the physical address map, giving up\n"); 1384 break; 1385 } 1386 physmap[physmap_idx] = smap->base; 1387 physmap[physmap_idx + 1] = smap->base + smap->length; 1388 } 1389 1390 /* 1391 * Find the 'base memory' segment for SMP 1392 */ 1393 basemem = 0; 1394 for (i = 0; i <= physmap_idx; i += 2) { 1395 if (physmap[i] == 0x00000000) { 1396 basemem = physmap[i + 1] / 1024; 1397 break; 1398 } 1399 } 1400 if (basemem == 0) 1401 panic("BIOS smap did not include a basemem segment!"); 1402 1403 #ifdef SMP 1404 /* make hole for AP bootstrap code */ 1405 physmap[1] = mp_bootaddress(physmap[1] / 1024); 1406 1407 /* look for the MP hardware - needed for apic addresses */ 1408 mp_probe(); 1409 #endif 1410 1411 /* 1412 * Maxmem isn't the "maximum memory", it's one larger than the 1413 * highest page of the physical address space. It should be 1414 * called something like "Maxphyspage". We may adjust this 1415 * based on ``hw.physmem'' and the results of the memory test. 1416 */ 1417 Maxmem = atop(physmap[physmap_idx + 1]); 1418 1419 #ifdef MAXMEM 1420 Maxmem = MAXMEM / 4; 1421 #endif 1422 1423 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) 1424 Maxmem = atop(physmem_tunable); 1425 1426 /* 1427 * Don't allow MAXMEM or hw.physmem to extend the amount of memory 1428 * in the system. 1429 */ 1430 if (Maxmem > atop(physmap[physmap_idx + 1])) 1431 Maxmem = atop(physmap[physmap_idx + 1]); 1432 1433 if (atop(physmap[physmap_idx + 1]) != Maxmem && 1434 (boothowto & RB_VERBOSE)) 1435 kprintf("Physical memory use set to %ldK\n", Maxmem * 4); 1436 1437 /* call pmap initialization to make new kernel address space */ 1438 pmap_bootstrap(&first); 1439 1440 /* 1441 * Size up each available chunk of physical memory. 1442 */ 1443 physmap[0] = PAGE_SIZE; /* mask off page 0 */ 1444 pa_indx = 0; 1445 da_indx = 1; 1446 phys_avail[pa_indx++] = physmap[0]; 1447 phys_avail[pa_indx] = physmap[0]; 1448 dump_avail[da_indx] = physmap[0]; 1449 pte = CMAP1; 1450 1451 /* 1452 * Get dcons buffer address 1453 */ 1454 if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 || 1455 kgetenv_quad("dcons.size", &dcons_size) == 0) 1456 dcons_addr = 0; 1457 1458 /* 1459 * physmap is in bytes, so when converting to page boundaries, 1460 * round up the start address and round down the end address. 
1461 */ 1462 for (i = 0; i <= physmap_idx; i += 2) { 1463 vm_paddr_t end; 1464 1465 end = ptoa((vm_paddr_t)Maxmem); 1466 if (physmap[i + 1] < end) 1467 end = trunc_page(physmap[i + 1]); 1468 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { 1469 int tmp, page_bad, full; 1470 int *ptr = (int *)CADDR1; 1471 1472 full = FALSE; 1473 /* 1474 * block out kernel memory as not available. 1475 */ 1476 if (pa >= 0x100000 && pa < first) 1477 goto do_dump_avail; 1478 1479 /* 1480 * block out dcons buffer 1481 */ 1482 if (dcons_addr > 0 1483 && pa >= trunc_page(dcons_addr) 1484 && pa < dcons_addr + dcons_size) 1485 goto do_dump_avail; 1486 1487 page_bad = FALSE; 1488 1489 /* 1490 * map page into kernel: valid, read/write,non-cacheable 1491 */ 1492 *pte = pa | PG_V | PG_RW | PG_N; 1493 cpu_invltlb(); 1494 1495 tmp = *(int *)ptr; 1496 /* 1497 * Test for alternating 1's and 0's 1498 */ 1499 *(volatile int *)ptr = 0xaaaaaaaa; 1500 if (*(volatile int *)ptr != 0xaaaaaaaa) 1501 page_bad = TRUE; 1502 /* 1503 * Test for alternating 0's and 1's 1504 */ 1505 *(volatile int *)ptr = 0x55555555; 1506 if (*(volatile int *)ptr != 0x55555555) 1507 page_bad = TRUE; 1508 /* 1509 * Test for all 1's 1510 */ 1511 *(volatile int *)ptr = 0xffffffff; 1512 if (*(volatile int *)ptr != 0xffffffff) 1513 page_bad = TRUE; 1514 /* 1515 * Test for all 0's 1516 */ 1517 *(volatile int *)ptr = 0x0; 1518 if (*(volatile int *)ptr != 0x0) 1519 page_bad = TRUE; 1520 /* 1521 * Restore original value. 1522 */ 1523 *(int *)ptr = tmp; 1524 1525 /* 1526 * Adjust array of valid/good pages. 1527 */ 1528 if (page_bad == TRUE) 1529 continue; 1530 /* 1531 * If this good page is a continuation of the 1532 * previous set of good pages, then just increase 1533 * the end pointer. Otherwise start a new chunk. 1534 * Note that "end" points one higher than end, 1535 * making the range >= start and < end. 1536 * If we're also doing a speculative memory 1537 * test and we at or past the end, bump up Maxmem 1538 * so that we keep going. The first bad page 1539 * will terminate the loop. 1540 */ 1541 if (phys_avail[pa_indx] == pa) { 1542 phys_avail[pa_indx] += PAGE_SIZE; 1543 } else { 1544 pa_indx++; 1545 if (pa_indx == PHYS_AVAIL_ARRAY_END) { 1546 kprintf( 1547 "Too many holes in the physical address space, giving up\n"); 1548 pa_indx--; 1549 full = TRUE; 1550 goto do_dump_avail; 1551 } 1552 phys_avail[pa_indx++] = pa; /* start */ 1553 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ 1554 } 1555 physmem++; 1556 do_dump_avail: 1557 if (dump_avail[da_indx] == pa) { 1558 dump_avail[da_indx] += PAGE_SIZE; 1559 } else { 1560 da_indx++; 1561 if (da_indx == DUMP_AVAIL_ARRAY_END) { 1562 da_indx--; 1563 goto do_next; 1564 } 1565 dump_avail[da_indx++] = pa; /* start */ 1566 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ 1567 } 1568 do_next: 1569 if (full) 1570 break; 1571 } 1572 } 1573 *pte = 0; 1574 cpu_invltlb(); 1575 1576 /* 1577 * XXX 1578 * The last chunk must contain at least one page plus the message 1579 * buffer to avoid complicating other code (message buffer address 1580 * calculation, etc.). 1581 */ 1582 while (phys_avail[pa_indx - 1] + PAGE_SIZE + 1583 round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) { 1584 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); 1585 phys_avail[pa_indx--] = 0; 1586 phys_avail[pa_indx--] = 0; 1587 } 1588 1589 Maxmem = atop(phys_avail[pa_indx]); 1590 1591 /* Trim off space for the message buffer. 
	 */
	phys_avail[pa_indx] -= round_page(MSGBUF_SIZE);

	avail_end = phys_avail[pa_indx];

	/* Map the message buffer. */
	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
		    off);
}

/*
 * IDT VECTORS:
 *	0	Divide by zero
 *	1	Debug
 *	2	NMI
 *	3	BreakPoint
 *	4	OverFlow
 *	5	Bound-Range
 *	6	Invalid OpCode
 *	7	Device Not Available (x87)
 *	8	Double-Fault
 *	9	Coprocessor Segment overrun (unsupported, reserved)
 *	10	Invalid-TSS
 *	11	Segment not present
 *	12	Stack
 *	13	General Protection
 *	14	Page Fault
 *	15	Reserved
 *	16	x87 FP Exception pending
 *	17	Alignment Check
 *	18	Machine Check
 *	19	SIMD floating point
 *	20-31	reserved
 *	32-255	INTn/external sources
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
#if JG
	int metadata_missing, off;
#endif
	struct mdglobaldata *gd;
	u_int64_t msr;
	char *env;

#if JG
	/*
	 * This must be done before the first references
	 * to CPU_prvspace[0] are made.
	 */
	init_paging(&physfree);
#endif

	/*
	 * Prevent lowering of the ipl if we call tsleep() early.
	 */
	gd = &CPU_prvspace[0].mdglobaldata;
	bzero(gd, sizeof(*gd));

	/*
	 * Note: on both UP and SMP curthread must be set non-NULL
	 * early in the boot sequence because the system assumes
	 * that 'curthread' is never NULL.
	 */

	gd->mi.gd_curthread = &thread0;
	thread0.td_gd = &gd->mi;

	atdevbase = ISA_HOLE_START + PTOV_OFFSET;

#if JG
	metadata_missing = 0;
	if (bootinfo.bi_modulep) {
		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
		preload_bootstrap_relocate(KERNBASE);
	} else {
		metadata_missing = 1;
	}
	if (bootinfo.bi_envp)
		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET);
	preload_bootstrap_relocate(PTOV_OFFSET);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET;
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
#endif

	/*
	 * start with one cpu.  Note: with one cpu, ncpus2_shift, ncpus2_mask,
	 * and ncpus_fit_mask remain 0.
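	 *
	 * Additional cpus are brought online later by mp_start() in
	 * cpu_startup().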
1691 */ 1692 ncpus = 1; 1693 ncpus2 = 1; 1694 ncpus_fit = 1; 1695 /* Init basic tunables, hz etc */ 1696 init_param1(); 1697 1698 /* 1699 * make gdt memory segments 1700 */ 1701 gdt_segs[GPROC0_SEL].ssd_base = 1702 (uintptr_t) &CPU_prvspace[0].mdglobaldata.gd_common_tss; 1703 1704 gd->mi.gd_prvspace = &CPU_prvspace[0]; 1705 1706 for (x = 0; x < NGDT; x++) { 1707 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1)) 1708 ssdtosd(&gdt_segs[x], &gdt[x]); 1709 } 1710 ssdtosyssd(&gdt_segs[GPROC0_SEL], 1711 (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); 1712 1713 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; 1714 r_gdt.rd_base = (long) gdt; 1715 lgdt(&r_gdt); 1716 1717 wrmsr(MSR_FSBASE, 0); /* User value */ 1718 wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi); 1719 wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */ 1720 1721 mi_gdinit(&gd->mi, 0); 1722 cpu_gdinit(gd, 0); 1723 proc0paddr = proc0paddr_buff; 1724 mi_proc0init(&gd->mi, proc0paddr); 1725 safepri = TDPRI_MAX; 1726 1727 /* spinlocks and the BGL */ 1728 init_locks(); 1729 1730 /* exceptions */ 1731 for (x = 0; x < NIDT; x++) 1732 setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); 1733 setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0); 1734 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0); 1735 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1); 1736 setidt(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0); 1737 setidt(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0); 1738 setidt(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0); 1739 setidt(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0); 1740 setidt(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0); 1741 setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1); 1742 setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0); 1743 setidt(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0); 1744 setidt(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0); 1745 setidt(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0); 1746 setidt(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0); 1747 setidt(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0); 1748 setidt(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0); 1749 setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0); 1750 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0); 1751 setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0); 1752 1753 r_idt.rd_limit = sizeof(idt0) - 1; 1754 r_idt.rd_base = (long) idt; 1755 lidt(&r_idt); 1756 1757 /* 1758 * Initialize the console before we print anything out. 1759 */ 1760 cninit(); 1761 1762 #if JG 1763 if (metadata_missing) 1764 kprintf("WARNING: loader(8) metadata is missing!\n"); 1765 #endif 1766 1767 #if NISA >0 1768 isa_defaultirq(); 1769 #endif 1770 rand_initialize(); 1771 1772 #ifdef DDB 1773 kdb_init(); 1774 if (boothowto & RB_KDB) 1775 Debugger("Boot flags requested debugger"); 1776 #endif 1777 1778 #if JG 1779 finishidentcpu(); /* Final stage of CPU initialization */ 1780 setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 1781 setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 1782 #endif 1783 identify_cpu(); /* Final stage of CPU initialization */ 1784 initializecpu(); /* Initialize CPU registers */ 1785 1786 /* make an initial tss so cpu can get interrupt stack on syscall! 
	 */
	gd->gd_common_tss.tss_rsp0 =
	    (register_t)(thread0.td_kstack +
			 KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb));
	/* Ensure the stack is aligned to 16 bytes */
	gd->gd_common_tss.tss_rsp0 &= ~0xFul;
	gd->gd_rsp0 = gd->gd_common_tss.tss_rsp0;

	/* doublefault stack space, runs on ist1 */
	gd->gd_common_tss.tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	gd->gd_common_tss.tss_iobase = sizeof(struct x86_64tss);

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	gd->gd_tss_gdt = &gdt[GPROC0_SEL];
	gd->gd_common_tssd = *gd->gd_tss_gdt;
	ltr(gsel_tss);

	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

	/* Map the message buffer. */
#if JG
	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
#endif

	msgbufinit(msgbufp, MSGBUF_SIZE);


	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_udatasel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_pcb->pcb_cr3 = KPML4phys;
	thread0.td_pcb->pcb_ext = 0;
	lwp0.lwp_md.md_regs = &proc0_tf;
	env = kgetenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}

/*
 * Initialize machine-dependent portions of the global data structure.
 * Note that the global data area and cpu0's idlestack in the private
 * data space were allocated in locore.
 *
 * Note: the idlethread's cpl is 0
 *
 * WARNING!  Called from early boot, 'mycpu' may not work yet.
	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

	/* Map the message buffer. */
#if JG
	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
#endif

	msgbufinit(msgbufp, MSGBUF_SIZE);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_udatasel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_pcb->pcb_cr3 = KPML4phys;
	thread0.td_pcb->pcb_ext = 0;
	lwp0.lwp_md.md_regs = &proc0_tf;
	env = kgetenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}

/*
 * Initialize machine-dependent portions of the global data structure.
 * Note that the global data area and cpu0's idlestack in the private
 * data space were allocated in locore.
 *
 * Note: the idlethread's cpl is 0
 *
 * WARNING! Called from early boot, 'mycpu' may not work yet.
 */
void
cpu_gdinit(struct mdglobaldata *gd, int cpu)
{
	if (cpu)
		gd->mi.gd_curthread = &gd->mi.gd_idlethread;

	lwkt_init_thread(&gd->mi.gd_idlethread,
			 gd->mi.gd_prvspace->idlestack,
			 sizeof(gd->mi.gd_prvspace->idlestack),
			 TDF_MPSAFE, &gd->mi);
	lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
	gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
	gd->mi.gd_idlethread.td_sp -= sizeof(void *);
	*(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
}

int
is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
{
	if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
	    eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
		return (TRUE);
	}
	return (FALSE);
}

struct globaldata *
globaldata_find(int cpu)
{
	KKASSERT(cpu >= 0 && cpu < ncpus);
	return(&CPU_prvspace[cpu].mdglobaldata.mi);
}

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
static void f00f_hack(void *unused);
SYSINIT(f00f_hack, SI_BOOT2_BIOS, SI_ORDER_ANY, f00f_hack, NULL);

static void
f00f_hack(void *unused)
{
	struct gate_descriptor *new_idt;
	vm_offset_t tmp;

	if (!has_f00f_bug)
		return;

	kprintf("Intel Pentium detected, installing workaround for F00F bug\n");

	r_idt.rd_limit = sizeof(idt0) - 1;

	tmp = kmem_alloc(&kernel_map, PAGE_SIZE * 2);
	if (tmp == 0)
		panic("kmem_alloc returned 0");
	if (((unsigned int)tmp & (PAGE_SIZE-1)) != 0)
		panic("kmem_alloc returned non-page-aligned memory");
	/* Put the first seven entries in the lower page */
	new_idt = (struct gate_descriptor*)(tmp + PAGE_SIZE - (7*8));
	bcopy(idt, new_idt, sizeof(idt0));
	r_idt.rd_base = (int)new_idt;
	lidt(&r_idt);
	idt = new_idt;
	if (vm_map_protect(&kernel_map, tmp, tmp + PAGE_SIZE,
			   VM_PROT_READ, FALSE) != KERN_SUCCESS)
		panic("vm_map_protect failed");
	return;
}
#endif /* defined(I586_CPU) && !defined(NO_F00F_HACK) */

int
ptrace_set_pc(struct lwp *lp, unsigned long addr)
{
	lp->lwp_md.md_regs->tf_rip = addr;
	return (0);
}

int
ptrace_single_step(struct lwp *lp)
{
	lp->lwp_md.md_regs->tf_rflags |= PSL_T;
	return (0);
}

int
fill_regs(struct lwp *lp, struct reg *regs)
{
	struct pcb *pcb;
	struct trapframe *tp;

	tp = lp->lwp_md.md_regs;
	bcopy(&tp->tf_rdi, &regs->r_rdi, sizeof(*regs));

	pcb = lp->lwp_thread->td_pcb;
	return (0);
}

int
set_regs(struct lwp *lp, struct reg *regs)
{
	struct pcb *pcb;
	struct trapframe *tp;

	tp = lp->lwp_md.md_regs;
	if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	bcopy(&regs->r_rdi, &tp->tf_rdi, sizeof(*regs));
	pcb = lp->lwp_thread->td_pcb;
	return (0);
}
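
/*
 * The fill_fpregs_xmm()/set_fpregs_xmm() helpers below convert between
 * the legacy FSAVE image (struct save87, st(i) registers packed at 10
 * bytes each) and the FXSAVE image (struct savexmm, each st(i)/mm(i)
 * register padded out to 16 bytes).  Only the fields the two formats
 * share are converted; SSE state has no legacy equivalent and is left
 * alone.  This keeps the fpreg interface seen by ptrace and core dumps
 * in the legacy layout even on fxsr-capable cpus.
 */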
#ifndef CPU_DISABLE_SSE
static void
fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_87->en_cw = penv_xmm->en_cw;
	penv_87->en_sw = penv_xmm->en_sw;
	penv_87->en_tw = penv_xmm->en_tw;
	penv_87->en_fip = penv_xmm->en_fip;
	penv_87->en_fcs = penv_xmm->en_fcs;
	penv_87->en_opcode = penv_xmm->en_opcode;
	penv_87->en_foo = penv_xmm->en_foo;
	penv_87->en_fos = penv_xmm->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;

	sv_87->sv_ex_sw = sv_xmm->sv_ex_sw;
}

static void
set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_xmm->en_cw = penv_87->en_cw;
	penv_xmm->en_sw = penv_87->en_sw;
	penv_xmm->en_tw = penv_87->en_tw;
	penv_xmm->en_fip = penv_87->en_fip;
	penv_xmm->en_fcs = penv_87->en_fcs;
	penv_xmm->en_opcode = penv_87->en_opcode;
	penv_xmm->en_foo = penv_87->en_foo;
	penv_xmm->en_fos = penv_87->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];

	sv_xmm->sv_ex_sw = sv_87->sv_ex_sw;
}
#endif /* CPU_DISABLE_SSE */

int
fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
#ifndef CPU_DISABLE_SSE
	if (cpu_fxsr) {
		fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
				(struct save87 *)fpregs);
		return (0);
	}
#endif /* CPU_DISABLE_SSE */
	bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
	return (0);
}

int
set_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
#ifndef CPU_DISABLE_SSE
	if (cpu_fxsr) {
		set_fpregs_xmm((struct save87 *)fpregs,
			       &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
		return (0);
	}
#endif /* CPU_DISABLE_SSE */
	bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
	return (0);
}

int
fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	if (lp == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[4] = rdr4();
		dbregs->dr[5] = rdr5();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
	} else {
		struct pcb *pcb;

		pcb = lp->lwp_thread->td_pcb;
		dbregs->dr[0] = pcb->pcb_dr0;
		dbregs->dr[1] = pcb->pcb_dr1;
		dbregs->dr[2] = pcb->pcb_dr2;
		dbregs->dr[3] = pcb->pcb_dr3;
		dbregs->dr[4] = 0;
		dbregs->dr[5] = 0;
		dbregs->dr[6] = pcb->pcb_dr6;
		dbregs->dr[7] = pcb->pcb_dr7;
	}
	return (0);
}
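
/*
 * %dr7 layout, as relevant to the checks in set_dbregs() below
 * (architecturally defined):
 *
 *	bits  0-7	per-register enable bits L0/G0 .. L3/G3
 *	bits 16-31	four 4-bit fields, one per debug register:
 *			R/Wi (2 bits): 00 instruction fetch, 01 data write,
 *				       10 i/o access (only with CR4.DE set),
 *				       11 data read/write
 *			LENi (2 bits): breakpoint length
 *
 * The validation loop below rejects the R/Wi == 10 pattern because its
 * behaviour is undefined unless CR4.DE is enabled.
 */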
int
set_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	if (lp == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr4(dbregs->dr[4]);
		load_dr5(dbregs->dr[5]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		struct pcb *pcb;
		struct ucred *ucred;
		int i;
		uint64_t mask1, mask2;

		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an
		 * unexpected TRCTRAP.
		 */
		/* JG this loop looks unreadable */
		/*
		 * Check 4 2-bit fields for invalid patterns.
		 * These fields are R/Wi, for i = 0..3
		 */
		/* Is 10 in LENi allowed when running in compatibility mode? */
		/*
		 * Pattern 10 in R/Wi might be used to indicate a
		 * breakpoint on I/O.  Further analysis should be
		 * carried out to decide if it is safe and useful to
		 * provide access to that capability.
		 */
		for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4;
		     i++, mask1 <<= 4, mask2 <<= 4)
			if ((dbregs->dr[7] & mask1) == mask2)
				return (EINVAL);

		pcb = lp->lwp_thread->td_pcb;
		ucred = lp->lwp_proc->p_ucred;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * the process's address space, unless, perhaps, we were
		 * called by uid 0.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */

		if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) {
			if (dbregs->dr[7] & 0x3) {
				/* dr0 is enabled */
				if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<2)) {
				/* dr1 is enabled */
				if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<4)) {
				/* dr2 is enabled */
				if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<6)) {
				/* dr3 is enabled */
				if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		pcb->pcb_flags |= PCB_DBREGS;
	}

	return (0);
}
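
/*
 * NOTE: the cpu sets the B0-B3 status bits in %dr6 when a hardware
 * breakpoint triggers but never clears them on its own; the debug
 * trap handler is expected to reset %dr6, otherwise stale status bits
 * would be reported on the next debug exception.
 */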
/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(void)
{
	u_int64_t dr7, dr6;	/* debug registers dr6 and dr7 */
	u_int64_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0xff) == 0) {
		/*
		 * all enable bits (L0-L3, G0-G3) in the dr7 register are
		 * zero, thus the trap couldn't have been caused by the
		 * hardware debug registers
		 */
		return 0;
	}

	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0xf;

	if (bp == 0) {
		/*
		 * None of the breakpoint bits are set, meaning this
		 * trap was not caused by any of the debug registers
		 */
		return 0;
	}

	/*
	 * at least one of the breakpoints was hit, check to see
	 * which ones and if any of them are user space addresses
	 */

	if (bp & 0x01) {
		addr[nbp++] = (caddr_t)rdr0();
	}
	if (bp & 0x02) {
		addr[nbp++] = (caddr_t)rdr1();
	}
	if (bp & 0x04) {
		addr[nbp++] = (caddr_t)rdr2();
	}
	if (bp & 0x08) {
		addr[nbp++] = (caddr_t)rdr3();
	}

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) {
			/*
			 * addr[i] is in user space
			 */
			return nbp;
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return 0;
}


#ifndef DDB
void
Debugger(const char *msg)
{
	kprintf("Debugger(\"%s\") called.\n", msg);
}
#endif /* no DDB */

#ifdef DDB

/*
 * Provide inb() and outb() as functions.  They are normally only
 * available as macros calling inlined functions, thus cannot be
 * called inside DDB.
 *
 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
 */

#undef inb
#undef outb

/* silence compiler warnings */
u_char inb(u_int);
void outb(u_int, u_char);

u_char
inb(u_int port)
{
	u_char data;
	/*
	 * We use %%dx and not %1 here because i/o is done at %dx and not at
	 * %edx, while gcc generates inferior code (movw instead of movl)
	 * if we tell it to load (u_short) port.
	 */
	__asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
	return (data);
}

void
outb(u_int port, u_char data)
{
	u_char al;
	/*
	 * Use an unnecessary assignment to help gcc's register allocator.
	 * This makes a large difference for gcc-1.40 and a tiny difference
	 * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
	 * best results.  gcc-2.6.0 can't handle this.
	 */
	al = data;
	__asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
}

#endif /* DDB */

/*
 * initialize all the SMP locks
 */

/* critical region when masking or unmasking interrupts */
struct spinlock_deprecated imen_spinlock;

/* Make FAST_INTR() routines sequential */
struct spinlock_deprecated fast_intr_spinlock;

/* critical region for old style disable_intr/enable_intr */
struct spinlock_deprecated mpintr_spinlock;

/* critical region around INTR() routines */
struct spinlock_deprecated intr_spinlock;

/* lock region used by kernel profiling */
struct spinlock_deprecated mcount_spinlock;

/* locks com (tty) data/hardware accesses: a FASTINTR() */
struct spinlock_deprecated com_spinlock;

/* locks kernel kprintfs */
struct spinlock_deprecated cons_spinlock;

/* lock regions around the clock hardware */
struct spinlock_deprecated clock_spinlock;

/* lock around the MP rendezvous */
struct spinlock_deprecated smp_rv_spinlock;

static void
init_locks(void)
{
	/*
	 * mp_lock = 0;	BSP already owns the MP lock
	 */
	/*
	 * Get the initial mp_lock with a count of 1 for the BSP.
	 * This uses a LOGICAL cpu ID, ie BSP == 0.
	 */
#ifdef SMP
	cpu_get_initial_mplock();
#endif
	/* DEPRECATED */
	spin_lock_init(&mcount_spinlock);
	spin_lock_init(&fast_intr_spinlock);
	spin_lock_init(&intr_spinlock);
	spin_lock_init(&mpintr_spinlock);
	spin_lock_init(&imen_spinlock);
	spin_lock_init(&smp_rv_spinlock);
	spin_lock_init(&com_spinlock);
	spin_lock_init(&clock_spinlock);
	spin_lock_init(&cons_spinlock);

	/* our token pool needs to work early */
	lwkt_token_pool_init();
}