/*-
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 2008 The DragonFly Project.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c  7.4 (Berkeley) 6/3/91
 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
 */

#include "use_ether.h"
//#include "use_npx.h"
#include "use_isa.h"
#include "opt_atalk.h"
#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_directio.h"
#include "opt_inet.h"
#include "opt_ipx.h"
#include "opt_msgbuf.h"
#include "opt_swap.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/priv.h>
#include <sys/buf.h>
#include <sys/reboot.h>
#include <sys/mbuf.h>
#include <sys/msgbuf.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/bus.h>
#include <sys/upcall.h>
#include <sys/usched.h>
#include <sys/reg.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>

#include <sys/user.h>
#include <sys/exec.h>
#include <sys/cons.h>

#include <ddb/ddb.h>

#include <machine/cpu.h>
#include <machine/clock.h>
#include <machine/specialreg.h>
#if JG
#include <machine/bootinfo.h>
#endif
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/pcb_ext.h>            /* pcb.h included via sys/user.h */
#include <machine/globaldata.h>         /* CPU_prvspace */
#include <machine/smp.h>
#ifdef PERFMON
#include <machine/perfmon.h>
#endif
#include <machine/cputypes.h>

#ifdef OLD_BUS_ARCH
#include <bus/isa/isa_device.h>
#endif
#include <machine_base/isa/intr_machdep.h>
#include <bus/isa/rtc.h>
#include <sys/random.h>
#include <sys/ptrace.h>
#include <machine/sigframe.h>

#define PHYSMAP_ENTRIES         10

extern void init386(int first);
extern void dblfault_handler(void);
extern u_int64_t hammer_time(u_int64_t, u_int64_t);

extern void printcpuinfo(void); /* XXX header file */
extern void identify_cpu(void);
#if JG
extern void finishidentcpu(void);
#endif
extern void panicifcpuunsupported(void);

static void cpu_startup(void *);
#ifndef CPU_DISABLE_SSE
static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
#endif /* CPU_DISABLE_SSE */
#ifdef DIRECTIO
extern void ffs_rawread_setup(void);
#endif /* DIRECTIO */
static void init_locks(void);

SYSINIT(cpu, SI_BOOT2_SMP, SI_ORDER_FIRST, cpu_startup, NULL)

#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif

uint64_t SMPptpa;
pt_entry_t *SMPpt;

struct privatespace CPU_prvspace[MAXCPU];

int     _udatasel, _ucodesel, _ucode32sel;
u_long  atdevbase;
#ifdef SMP
int64_t tsc_offsets[MAXCPU];
#else
int64_t tsc_offsets[1];
#endif

#if defined(SWTCH_OPTIM_STATS)
extern int swtch_optim_stats;
SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
        CTLFLAG_RD, &swtch_optim_stats, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
        CTLFLAG_RD, &tlb_flush_count, 0, "");
#endif
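
/*
 * Report basic memory statistics to userland.  XXX the byte counts from
 * ctob() are squeezed through sysctl_handle_int(), so the reported
 * values can wrap on configurations with 2GB or more of ram.
 */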

int physmem = 0;

static int
sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
{
        int error = sysctl_handle_int(oidp, 0, ctob(physmem), req);
        return (error);
}

SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD,
        0, 0, sysctl_hw_physmem, "IU", "");

static int
sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
{
        int error = sysctl_handle_int(oidp, 0,
                ctob(physmem - vmstats.v_wire_count), req);
        return (error);
}

SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD,
        0, 0, sysctl_hw_usermem, "IU", "");

static int
sysctl_hw_availpages(SYSCTL_HANDLER_ARGS)
{
        int error = sysctl_handle_int(oidp, 0,
                x86_64_btop(avail_end - avail_start), req);
        return (error);
}

SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD,
        0, 0, sysctl_hw_availpages, "I", "");

vm_paddr_t Maxmem = 0;

/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
#define PHYSMAP_SIZE    (2 * (VM_PHYSSEG_MAX - 1))

vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* must be 2 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
#define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)

static vm_offset_t buffer_sva, buffer_eva;
vm_offset_t clean_sva, clean_eva;
static vm_offset_t pager_sva, pager_eva;
static struct trapframe proc0_tf;

static void
cpu_startup(void *dummy)
{
        caddr_t v;
        vm_size_t size = 0;
        vm_offset_t firstaddr;

        if (boothowto & RB_VERBOSE)
                bootverbose++;

        /*
         * Good {morning,afternoon,evening,night}.
         */
        kprintf("%s", version);
        startrtclock();
        printcpuinfo();
        panicifcpuunsupported();
#ifdef PERFMON
        perfmon_init();
#endif
        kprintf("real memory = %ju (%ju MB)\n",
                (intmax_t)ptoa(Maxmem),
                (intmax_t)ptoa(Maxmem) / 1024 / 1024);
        /*
         * Display any holes after the first chunk of extended memory.
         */
        if (bootverbose) {
                int indx;

                kprintf("Physical memory chunk(s):\n");
                for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
                        vm_paddr_t size1 = phys_avail[indx + 1] - phys_avail[indx];

                        kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
                                (intmax_t)phys_avail[indx],
                                (intmax_t)phys_avail[indx + 1] - 1,
                                (intmax_t)size1,
                                (intmax_t)(size1 / PAGE_SIZE));
                }
        }

        /*
         * Allocate space for system data structures.
         * The first available kernel virtual address is in "v".
         * As pages of kernel virtual memory are allocated, "v" is incremented.
         * As pages of memory are allocated and cleared,
         * "firstaddr" is incremented.
         * An index into the kernel page table corresponding to the
         * virtual memory address maintained in "v" is kept in "mapaddr".
         */

        /*
         * Make two passes.  The first pass calculates how much memory is
         * needed and allocates it.  The second pass assigns virtual
         * addresses to the various data structures.
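         *
         * For example: on the first pass firstaddr is 0, so the valloc()
         * macros below only advance "v", and the final (v - firstaddr) is
         * the total size that must be allocated; on the second pass "v"
         * walks the real allocation and hands out the actual addresses.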
         */
        firstaddr = 0;
again:
        v = (caddr_t)firstaddr;

#define valloc(name, type, num) \
            (name) = (type *)v; v = (caddr_t)((name)+(num))
#define valloclim(name, type, num, lim) \
            (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))

        /*
         * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
         * For the first 64MB of ram nominally allocate sufficient buffers to
         * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
         * buffers to cover 1/20 of our ram over 64MB.  When auto-sizing
         * the buffer cache we limit the eventual kva reservation to
         * maxbcache bytes.
         *
         * factor represents the 1/4 x ram conversion.
         */
        if (nbuf == 0) {
                int factor = 4 * BKVASIZE / 1024;
                int kbytes = physmem * (PAGE_SIZE / 1024);

                nbuf = 50;
                if (kbytes > 4096)
                        nbuf += min((kbytes - 4096) / factor, 65536 / factor);
                if (kbytes > 65536)
                        nbuf += (kbytes - 65536) * 2 / (factor * 5);
                if (maxbcache && nbuf > maxbcache / BKVASIZE)
                        nbuf = maxbcache / BKVASIZE;
        }

        /*
         * Do not allow the buffer_map to be more than 1/2 the size of the
         * kernel_map.
         */
        if (nbuf > (virtual_end - virtual_start) / (BKVASIZE * 2)) {
                nbuf = (virtual_end - virtual_start) / (BKVASIZE * 2);
                kprintf("Warning: nbufs capped at %d\n", nbuf);
        }

        nswbuf = max(min(nbuf/4, 256), 16);
#ifdef NSWBUF_MIN
        if (nswbuf < NSWBUF_MIN)
                nswbuf = NSWBUF_MIN;
#endif
#ifdef DIRECTIO
        ffs_rawread_setup();
#endif

        valloc(swbuf, struct buf, nswbuf);
        valloc(buf, struct buf, nbuf);

        /*
         * End of first pass, size has been calculated so allocate memory
         */
        if (firstaddr == 0) {
                size = (vm_size_t)(v - firstaddr);
                firstaddr = kmem_alloc(&kernel_map, round_page(size));
                if (firstaddr == 0)
                        panic("startup: no room for tables");
                goto again;
        }

        /*
         * End of second pass, addresses have been assigned
         */
        if ((vm_size_t)(v - firstaddr) != size)
                panic("startup: table size inconsistency");

        kmem_suballoc(&kernel_map, &clean_map, &clean_sva, &clean_eva,
                      (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size);
        kmem_suballoc(&clean_map, &buffer_map, &buffer_sva, &buffer_eva,
                      (nbuf*BKVASIZE));
        buffer_map.system_map = 1;
        kmem_suballoc(&clean_map, &pager_map, &pager_sva, &pager_eva,
                      (nswbuf*MAXPHYS) + pager_map_size);
        pager_map.system_map = 1;

#if defined(USERCONFIG)
        userconfig();
        cninit();               /* the preferred console may have changed */
#endif

        kprintf("avail memory = %ju (%ju MB)\n",
                (uintmax_t)ptoa(vmstats.v_free_count),
                (uintmax_t)ptoa(vmstats.v_free_count) / 1024 / 1024);

        /*
         * Set up buffers, so they can be used to read disk labels.
         */
        bufinit();
        vm_pager_bufferinit();

#ifdef SMP
        /*
         * OK, enough kmem_alloc/malloc state should be up, let's get on
         * with it!
         */
        mp_start();                     /* fire up the APs and APICs */
        mp_announce();
#endif  /* SMP */
        cpu_setregs();
}

/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
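 *
 * On x86_64 the handler arguments are passed in registers (%rdi, %rsi,
 * %rdx, %rcx) rather than being pushed on the stack; see the trapframe
 * setup below.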
 */
void
sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
{
        struct lwp *lp = curthread->td_lwp;
        struct proc *p = lp->lwp_proc;
        struct trapframe *regs;
        struct sigacts *psp = p->p_sigacts;
        struct sigframe sf, *sfp;
        int oonstack;
        char *sp;

        regs = lp->lwp_md.md_regs;
        oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;

        /* Save user context */
        bzero(&sf, sizeof(struct sigframe));
        sf.sf_uc.uc_sigmask = *mask;
        sf.sf_uc.uc_stack = lp->lwp_sigstk;
        sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
        KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0);
        bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe));

        /* Make the size of the saved context visible to userland */
        sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);

        /* Save mailbox pending state for syscall interlock semantics */
        if (p->p_flag & P_MAILBOX)
                sf.sf_uc.uc_mcontext.mc_xflags |= PGEX_MAILBOX;

        /* Allocate and validate space for the signal handler context. */
        if ((lp->lwp_flag & LWP_ALTSTACK) != 0 && !oonstack &&
            SIGISMEMBER(psp->ps_sigonstack, sig)) {
                sp = (char *)(lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size -
                              sizeof(struct sigframe));
                lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
        } else {
                /* We take red zone into account */
                sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
        }

        /* Align to 16 bytes */
        sfp = (struct sigframe *)((intptr_t)sp & ~0xFUL);

        /* Translate the signal if appropriate */
        if (p->p_sysent->sv_sigtbl) {
                if (sig <= p->p_sysent->sv_sigsize)
                        sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
        }

        /*
         * Build the argument list for the signal handler.
         *
         * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
         */
        regs->tf_rdi = sig;                             /* argument 1 */
        regs->tf_rdx = (register_t)&sfp->sf_uc;         /* argument 3 */

        if (SIGISMEMBER(psp->ps_siginfo, sig)) {
                /*
                 * Signal handler installed with SA_SIGINFO.
                 *
                 * action(signo, siginfo, ucontext)
                 */
                regs->tf_rsi = (register_t)&sfp->sf_si; /* argument 2 */
                regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
                sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

                /* fill siginfo structure */
                sf.sf_si.si_signo = sig;
                sf.sf_si.si_code = code;
                sf.sf_si.si_addr = (void *)regs->tf_addr;
        } else {
                /*
                 * Old FreeBSD-style arguments.
                 *
                 * handler (signo, code, [uc], addr)
                 */
                regs->tf_rsi = (register_t)code;        /* argument 2 */
                regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
                sf.sf_ahu.sf_handler = catcher;
        }

        /*
         * If we're a vm86 process, we want to save the segment registers.
         * We also change eflags to be our emulated eflags, not the actual
         * eflags.
         */
#if JG
        if (regs->tf_eflags & PSL_VM) {
                struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
                struct vm86_kernel *vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;

                sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
                sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
                sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
                sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

                if (vm86->vm86_has_vme == 0)
                        sf.sf_uc.uc_mcontext.mc_eflags =
                            (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
                            (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

                /*
                 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
                 * syscalls made by the signal handler.
                 * This just avoids wasting time for our lazy fixup of such
                 * faults.  PSL_NT does nothing in vm86 mode, but vm86
                 * programs can set it almost legitimately in probes for
                 * old cpu types.
                 */
                tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
        }
#endif

        /*
         * Save the FPU state and reinit the FP unit
         */
        npxpush(&sf.sf_uc.uc_mcontext);

        /*
         * Copy the sigframe out to the user's stack.
         */
        if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
                /*
                 * Something is wrong with the stack pointer.
                 * ...Kill the process.
                 */
                sigexit(lp, SIGILL);
        }

        regs->tf_rsp = (register_t)sfp;
        regs->tf_rip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);

        /*
         * i386 abi specifies that the direction flag must be cleared
         * on function entry
         */
        regs->tf_rflags &= ~(PSL_T|PSL_D);

        /*
         * 64 bit mode has a code and stack selector but
         * no data or extra selector.  %fs and %gs are not
         * stored in-context.
         */
        regs->tf_cs = _ucodesel;
        regs->tf_ss = _udatasel;
}

/*
 * Sanitize the trapframe for a virtual kernel passing control to a custom
 * VM context.  Remove any items that would otherwise create a privilege
 * issue.
 *
 * XXX at the moment we allow userland to set the resume flag.  Is this a
 * bad idea?
 */
int
cpu_sanitize_frame(struct trapframe *frame)
{
        frame->tf_cs = _ucodesel;
        frame->tf_ss = _udatasel;
        /* XXX VM (8086) mode not supported? */
        frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP);
        frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I;

        return(0);
}

/*
 * Sanitize the tls so loading the descriptor does not blow up
 * on us.  For x86_64 we don't have to do anything.
 */
int
cpu_sanitize_tls(struct savetls *tls)
{
        return(0);
}

/*
 * sigreturn(ucontext_t *sigcntxp)
 *
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
#define EFL_SECURE(ef, oef)     ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#define CS_SECURE(cs)           (ISPL(cs) == SEL_UPL)

int
sys_sigreturn(struct sigreturn_args *uap)
{
        struct lwp *lp = curthread->td_lwp;
        struct proc *p = lp->lwp_proc;
        struct trapframe *regs;
        ucontext_t uc;
        ucontext_t *ucp;
        register_t rflags;
        int cs;
        int error;

        /*
         * We have to copy the information into kernel space so userland
         * can't modify it while we are sniffing it.
         */
        regs = lp->lwp_md.md_regs;
        error = copyin(uap->sigcntxp, &uc, sizeof(uc));
        if (error)
                return (error);
        ucp = &uc;
        rflags = ucp->uc_mcontext.mc_rflags;

        /* VM (8086) mode not supported */
        rflags &= ~PSL_VM_UNSUPP;

#if JG
        if (eflags & PSL_VM) {
                struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
                struct vm86_kernel *vm86;

                /*
                 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
                 * set up the vm86 area, and we can't enter vm86 mode.
                 */
                if (lp->lwp_thread->td_pcb->pcb_ext == 0)
                        return (EINVAL);
                vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
                if (vm86->vm86_inited == 0)
                        return (EINVAL);

                /* go back to user mode if both flags are set */
                if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
                        trapsignal(lp, SIGBUS, 0);

                if (vm86->vm86_has_vme) {
                        eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
                            (eflags & VME_USERCHANGE) | PSL_VM;
                } else {
                        vm86->vm86_eflags = eflags;     /* save VIF, VIP */
                        eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
                            (eflags & VM_USERCHANGE) | PSL_VM;
                }
                bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
                tf->tf_eflags = eflags;
                tf->tf_vm86_ds = tf->tf_ds;
                tf->tf_vm86_es = tf->tf_es;
                tf->tf_vm86_fs = tf->tf_fs;
                tf->tf_vm86_gs = tf->tf_gs;
                tf->tf_ds = _udatasel;
                tf->tf_es = _udatasel;
                tf->tf_fs = _udatasel;
                tf->tf_gs = _udatasel;
        } else
#endif
        {
                /*
                 * Don't allow users to change privileged or reserved flags.
                 */
                /*
                 * XXX do allow users to change the privileged flag PSL_RF.
                 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
                 * should sometimes set it there too.  tf_eflags is kept in
                 * the signal context during signal handling and there is no
                 * other place to remember it, so the PSL_RF bit may be
                 * corrupted by the signal handler without us knowing.
                 * Corruption of the PSL_RF bit at worst causes one more or
                 * one less debugger trap, so allowing it is fairly harmless.
                 */
                if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
                        kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags);
                        return(EINVAL);
                }

                /*
                 * Don't allow users to load a valid privileged %cs.  Let the
                 * hardware check for invalid selectors, excess privilege in
                 * other selectors, invalid %eip's and invalid %esp's.
                 */
                cs = ucp->uc_mcontext.mc_cs;
                if (!CS_SECURE(cs)) {
                        kprintf("sigreturn: cs = 0x%x\n", cs);
                        trapsignal(lp, SIGBUS, T_PROTFLT);
                        return(EINVAL);
                }
                bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(struct trapframe));
        }

        /*
         * Restore the FPU state from the frame
         */
        crit_enter();
        npxpop(&ucp->uc_mcontext);

        /*
         * Merge saved signal mailbox pending flag to maintain interlock
         * semantics against system calls.
         */
        if (ucp->uc_mcontext.mc_xflags & PGEX_MAILBOX)
                p->p_flag |= P_MAILBOX;

        if (ucp->uc_mcontext.mc_onstack & 1)
                lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
        else
                lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;

        lp->lwp_sigmask = ucp->uc_sigmask;
        SIG_CANTMASK(lp->lwp_sigmask);
        crit_exit();
        return(EJUSTRETURN);
}

/*
 * Stack frame on entry to function.  %rax will contain the function vector,
 * %rcx will contain the function data.  flags, rcx, and rax will have
 * already been pushed on the stack.
 */
struct upc_frame {
        register_t      rax;
        register_t      rcx;
        register_t      rdx;
        register_t      flags;
        register_t      oldip;
};

void
sendupcall(struct vmupcall *vu, int morepending)
{
        struct lwp *lp = curthread->td_lwp;
        struct trapframe *regs;
        struct upcall upcall;
        struct upc_frame upc_frame;
        int     crit_count = 0;

        /*
         * If we are a virtual kernel running an emulated user process
         * context, switch back to the virtual kernel context before
         * trying to post the signal.
         */
        if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
                lp->lwp_md.md_regs->tf_trapno = 0;
                vkernel_trap(lp, lp->lwp_md.md_regs);
        }

        /*
         * Get the upcall data structure
         */
        if (copyin(lp->lwp_upcall, &upcall, sizeof(upcall)) ||
            copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int))
        ) {
                vu->vu_pending = 0;
                kprintf("bad upcall address\n");
                return;
        }

        /*
         * If the data structure is already marked pending or has a critical
         * section count, mark the data structure as pending and return
         * without doing an upcall.  vu_pending is left set.
         */
        if (upcall.upc_pending || crit_count >= vu->vu_pending) {
                if (upcall.upc_pending < vu->vu_pending) {
                        upcall.upc_pending = vu->vu_pending;
                        copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending,
                                sizeof(upcall.upc_pending));
                }
                return;
        }

        /*
         * We can run this upcall now, clear vu_pending.
         *
         * Bump our critical section count and set or clear the
         * user pending flag depending on whether more upcalls are
         * pending.  The user will be responsible for calling
         * upc_dispatch(-1) to process remaining upcalls.
         */
        vu->vu_pending = 0;
        upcall.upc_pending = morepending;
        ++crit_count;
        copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending,
                sizeof(upcall.upc_pending));
        copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff,
                sizeof(int));

        /*
         * Construct a stack frame and issue the upcall
         */
        regs = lp->lwp_md.md_regs;
        upc_frame.rax = regs->tf_rax;
        upc_frame.rcx = regs->tf_rcx;
        upc_frame.rdx = regs->tf_rdx;
        upc_frame.flags = regs->tf_rflags;
        upc_frame.oldip = regs->tf_rip;
        if (copyout(&upc_frame, (void *)(regs->tf_rsp - sizeof(upc_frame)),
                    sizeof(upc_frame)) != 0) {
                kprintf("bad stack on upcall\n");
        } else {
                regs->tf_rax = (register_t)vu->vu_func;
                regs->tf_rcx = (register_t)vu->vu_data;
                regs->tf_rdx = (register_t)lp->lwp_upcall;
                regs->tf_rip = (register_t)vu->vu_ctx;
                regs->tf_rsp -= sizeof(upc_frame);
        }
}

/*
 * fetchupcall occurs in the context of a system call, which means that
 * we have to return EJUSTRETURN in order to prevent eax and edx from
 * being overwritten by the syscall return value.
 *
 * if vu is not NULL we return the new context in %edx, the new data in %ecx,
 * and the function pointer in %eax.
 */
int
fetchupcall(struct vmupcall *vu, int morepending, void *rsp)
{
        struct upc_frame upc_frame;
        struct lwp *lp = curthread->td_lwp;
        struct trapframe *regs;
        int error;
        struct upcall upcall;
        int crit_count;

        regs = lp->lwp_md.md_regs;

        error = copyout(&morepending, &lp->lwp_upcall->upc_pending, sizeof(int));
        if (error == 0) {
                if (vu) {
                        /*
                         * This jumps us to the next ready context.
                         */
                        vu->vu_pending = 0;
                        error = copyin(lp->lwp_upcall, &upcall, sizeof(upcall));
                        crit_count = 0;
                        if (error == 0)
                                error = copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int));
                        ++crit_count;
                        if (error == 0)
                                error = copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, sizeof(int));
                        regs->tf_rax = (register_t)vu->vu_func;
                        regs->tf_rcx = (register_t)vu->vu_data;
                        regs->tf_rdx = (register_t)lp->lwp_upcall;
                        regs->tf_rip = (register_t)vu->vu_ctx;
                        regs->tf_rsp = (register_t)rsp;
                } else {
                        /*
                         * This returns us to the originally interrupted code.
                         */
                        error = copyin(rsp, &upc_frame, sizeof(upc_frame));
                        regs->tf_rax = upc_frame.rax;
                        regs->tf_rcx = upc_frame.rcx;
                        regs->tf_rdx = upc_frame.rdx;
                        regs->tf_rflags = (regs->tf_rflags & ~PSL_USERCHANGE) |
                                        (upc_frame.flags & PSL_USERCHANGE);
                        regs->tf_rip = upc_frame.oldip;
                        regs->tf_rsp = (register_t)((char *)rsp + sizeof(upc_frame));
                }
        }
        if (error == 0)
                error = EJUSTRETURN;
        return(error);
}

/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */
void
cpu_boot(int howto)
{
}

/*
 * Shutdown the CPU as much as possible
 */
void
cpu_halt(void)
{
        for (;;)
                __asm__ __volatile("hlt");
}

/*
 * cpu_idle() represents the idle LWKT.  You cannot return from this function
 * (unless you want to blow things up!).  Instead we look for runnable threads
 * and loop or halt as appropriate.  Giant is not held on entry to the thread.
 *
 * The main loop is entered with a critical section held, we must release
 * the critical section before doing anything else.  lwkt_switch() will
 * check for pending interrupts due to entering and exiting its own
 * critical section.
 *
 * Note on cpu_idle_hlt:  On an SMP system we rely on a scheduler IPI
 * to wake a HLTed cpu up.  However, there are cases where the idlethread
 * will be entered with the possibility that no IPI will occur and in such
 * cases lwkt_switch() sets TDF_IDLE_NOHLT.
 */
static int      cpu_idle_hlt = 1;
static int      cpu_idle_hltcnt;
static int      cpu_idle_spincnt;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hltcnt, CTLFLAG_RW,
    &cpu_idle_hltcnt, 0, "Idle loop entry halts");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_spincnt, CTLFLAG_RW,
    &cpu_idle_spincnt, 0, "Idle loop entry spins");

static void
cpu_idle_default_hook(void)
{
        /*
         * We must guarantee that hlt is exactly the instruction
         * following the sti.
         */
        __asm __volatile("sti; hlt");
}

/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = cpu_idle_default_hook;

void
cpu_idle(void)
{
        struct thread *td = curthread;

        crit_exit();
        KKASSERT(td->td_critcount == 0);
        for (;;) {
                /*
                 * See if there are any LWKTs ready to go.
                 */
                lwkt_switch();

                /*
                 * If we are going to halt call splz unconditionally after
                 * CLIing to catch any interrupt races.  Note that we are
                 * at SPL0 and interrupts are enabled.
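                 *
                 * The cli/splz/re-test sequence closes the race where an
                 * interrupt makes a thread runnable between the
                 * lwkt_runnable() check and the halt; "sti; hlt" in the
                 * default hook then re-enables interrupts atomically with
                 * the halt itself.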
                 */
                if (cpu_idle_hlt && !lwkt_runnable() &&
                    (td->td_flags & TDF_IDLE_NOHLT) == 0) {
                        __asm __volatile("cli");
                        splz();
                        if (!lwkt_runnable())
                                cpu_idle_hook();
#ifdef SMP
                        else
                                handle_cpu_contention_mask();
#endif
                        ++cpu_idle_hltcnt;
                } else {
                        td->td_flags &= ~TDF_IDLE_NOHLT;
                        splz();
#ifdef SMP
                        __asm __volatile("sti");
                        handle_cpu_contention_mask();
#else
                        __asm __volatile("sti");
#endif
                        ++cpu_idle_spincnt;
                }
        }
}

#ifdef SMP

/*
 * This routine is called when the only runnable threads require
 * the MP lock, and the scheduler couldn't get it.  On a real cpu
 * we let the scheduler spin.
 */
void
handle_cpu_contention_mask(void)
{
        cpumask_t mask;

        mask = cpu_contention_mask;
        cpu_ccfence();
        if (mask && bsfl(mask) != mycpu->gd_cpuid)
                DELAY(2);
}

/*
 * This routine is called if a spinlock has been held through the
 * exponential backoff period and is seriously contested.  On a real cpu
 * we let it spin.
 */
void
cpu_spinlock_contested(void)
{
        cpu_pause();
}

#endif

/*
 * Clear registers on exec
 */
void
exec_setregs(u_long entry, u_long stack, u_long ps_strings)
{
        struct thread *td = curthread;
        struct lwp *lp = td->td_lwp;
        struct pcb *pcb = td->td_pcb;
        struct trapframe *regs = lp->lwp_md.md_regs;

        /* was i386_user_cleanup() in NetBSD */
        user_ldt_free(pcb);

        bzero((char *)regs, sizeof(struct trapframe));
        regs->tf_rip = entry;
        regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; /* align the stack */
        regs->tf_rdi = stack;           /* argv */
        regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
        regs->tf_ss = _udatasel;
        regs->tf_cs = _ucodesel;
        regs->tf_rbx = ps_strings;

        /*
         * Reset the hardware debug registers if they were in use.
         * They won't have any meaning for the newly exec'd process.
         */
        if (pcb->pcb_flags & PCB_DBREGS) {
                pcb->pcb_dr0 = 0;
                pcb->pcb_dr1 = 0;
                pcb->pcb_dr2 = 0;
                pcb->pcb_dr3 = 0;
                pcb->pcb_dr6 = 0;
                pcb->pcb_dr7 = 0; /* JG set bit 10? */
                if (pcb == td->td_pcb) {
                        /*
                         * Clear the debug registers on the running
                         * CPU, otherwise they will end up affecting
                         * the next process we switch to.
                         */
                        reset_dbregs();
                }
                pcb->pcb_flags &= ~PCB_DBREGS;
        }

        /*
         * Initialize the math emulator (if any) for the current process.
         * Actually, just clear the bit that says that the emulator has
         * been initialized.  Initialization is delayed until the process
         * traps to the emulator (if it is done at all) mainly because
         * emulators don't provide an entry point for initialization.
         */
        pcb->pcb_flags &= ~FP_SOFTFP;

        /*
         * NOTE: do not set CR0_TS here.  npxinit() must do it after clearing
         *       gd_npxthread.  Otherwise a preemptive interrupt thread
         *       may panic in npxdna().
         */
        crit_enter();
        load_cr0(rcr0() | CR0_MP);

        /*
         * NOTE: The MSR values must be correct so we can return to
         *       userland.  gd_user_fs/gs must be correct so the switch
         *       code knows what the current MSR values are.
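         *       (gd_user_fs/gs act as a software cache of MSR_FSBASE /
         *       MSR_KGSBASE, presumably so the switch code can avoid
         *       redundant wrmsr's.)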
         */
        pcb->pcb_fsbase = 0;    /* Values loaded from PCB on switch */
        pcb->pcb_gsbase = 0;
        mdcpu->gd_user_fs = 0;  /* Cache of current MSR values */
        mdcpu->gd_user_gs = 0;
        wrmsr(MSR_FSBASE, 0);   /* Set MSR values for return to userland */
        wrmsr(MSR_KGSBASE, 0);

        /* Initialize the npx (if any) for the current process. */
        npxinit(__INITIAL_NPXCW__);
        crit_exit();

        pcb->pcb_ds = _udatasel;
        pcb->pcb_es = _udatasel;
        pcb->pcb_fs = _udatasel;
        pcb->pcb_gs = _udatasel;
}

void
cpu_setregs(void)
{
        register_t cr0;

        cr0 = rcr0();
        cr0 |= CR0_NE;                  /* Done by npxinit() */
        cr0 |= CR0_MP | CR0_TS;         /* Done at every execve() too. */
        cr0 |= CR0_WP | CR0_AM;
        load_cr0(cr0);
        load_gs(_udatasel);
}

static int
sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
{
        int error;
        error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
                req);
        if (!error && req->newptr)
                resettodr();
        return (error);
}

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
        &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
        CTLFLAG_RW, &disable_rtc_set, 0, "");

#if JG
SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
        CTLFLAG_RD, &bootinfo, bootinfo, "");
#endif

SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
        CTLFLAG_RW, &wall_cmos_clock, 0, "");

extern u_long bootdev;          /* not a cdev_t - encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
        CTLFLAG_RD, &bootdev, 0, "Boot device (not in cdev_t format)");

/*
 * Initialize 386 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

int _default_ldt;
struct user_segment_descriptor gdt[NGDT * MAXCPU];      /* global descriptor table */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
#if JG
union descriptor ldt[NLDT];             /* local descriptor table */
#endif

/* table descriptors - used to load tables by cpu */
struct region_descriptor r_gdt, r_idt;

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif

static char dblfault_stack[PAGE_SIZE] __aligned(16);

/* JG proc0paddr is a virtual address */
void *proc0paddr;
/* JG alignment? */
char proc0paddr_buff[LWKT_THREAD_STACK];


/* software prototypes -- in more palatable form */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL    0 Null Descriptor */
{       0x0,                    /* segment base address  */
        0x0,                    /* length */
        0,                      /* segment type */
        0,                      /* segment descriptor priority level */
        0,                      /* segment descriptor present */
        0,                      /* long */
        0,                      /* default 32 vs 16 bit size */
        0                       /* limit granularity (byte/page units)*/ },
/* GCODE_SEL    1 Code Descriptor for kernel */
{       0x0,                    /* segment base address  */
        0xfffff,                /* length - all address space */
        SDT_MEMERA,             /* segment type */
        SEL_KPL,                /* segment descriptor priority level */
        1,                      /* segment descriptor present */
        1,                      /* long */
        0,                      /* default 32 vs 16 bit size */
        1                       /* limit granularity (byte/page units)*/ },
/* GDATA_SEL    2 Data Descriptor for kernel */
{       0x0,                    /* segment base address  */
        0xfffff,                /* length - all address space */
        SDT_MEMRWA,             /* segment type */
        SEL_KPL,                /* segment descriptor priority level */
        1,                      /* segment descriptor present */
        1,                      /* long */
        0,                      /* default 32 vs 16 bit size */
        1                       /* limit granularity (byte/page units)*/ },
/* GUCODE32_SEL 3 32 bit Code Descriptor for user */
{       0x0,                    /* segment base address  */
        0xfffff,                /* length - all address space */
        SDT_MEMERA,             /* segment type */
        SEL_UPL,                /* segment descriptor priority level */
        1,                      /* segment descriptor present */
        0,                      /* long */
        1,                      /* default 32 vs 16 bit size */
        1                       /* limit granularity (byte/page units)*/ },
/* GUDATA_SEL   4 32/64 bit Data Descriptor for user */
{       0x0,                    /* segment base address  */
        0xfffff,                /* length - all address space */
        SDT_MEMRWA,             /* segment type */
        SEL_UPL,                /* segment descriptor priority level */
        1,                      /* segment descriptor present */
        0,                      /* long */
        1,                      /* default 32 vs 16 bit size */
        1                       /* limit granularity (byte/page units)*/ },
/* GUCODE_SEL   5 64 bit Code Descriptor for user */
{       0x0,                    /* segment base address  */
        0xfffff,                /* length - all address space */
        SDT_MEMERA,             /* segment type */
        SEL_UPL,                /* segment descriptor priority level */
        1,                      /* segment descriptor present */
        1,                      /* long */
        0,                      /* default 32 vs 16 bit size */
        1                       /* limit granularity (byte/page units)*/ },
/* GPROC0_SEL   6 Proc 0 Tss Descriptor */
{
        0x0,                    /* segment base address */
        sizeof(struct x86_64tss)-1,/* length - all address space */
        SDT_SYSTSS,             /* segment type */
        SEL_KPL,                /* segment descriptor priority level */
        1,                      /* segment descriptor present */
        0,                      /* long */
        0,                      /* unused - default 32 vs 16 bit size */
        0                       /* limit granularity (byte/page units)*/ },
/* Actually, the TSS is a system descriptor which is double size */
{       0x0,                    /* segment base address  */
        0x0,                    /* length */
        0,                      /* segment type */
        0,                      /* segment descriptor priority level */
        0,                      /* segment descriptor present */
        0,                      /* long */
        0,                      /* default 32 vs 16 bit size */
        0                       /* limit granularity (byte/page units)*/ },
/* GUGS32_SEL   8 32 bit GS Descriptor for user */
{       0x0,                    /* segment base address  */
        0xfffff,                /* length - all address space */
        SDT_MEMRWA,             /* segment type */
        SEL_UPL,                /* segment descriptor priority level */
        1,                      /* segment descriptor present */
        0,                      /* long */
        1,                      /* default 32 vs 16 bit size */
        1                       /* limit granularity (byte/page units)*/ },
};
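
/*
 * Note: in long mode the hardware essentially ignores the base and limit
 * of the code/data descriptors above (the FS/GS bases come from MSRs
 * instead); those entries chiefly encode the type, DPL and the L/D bits.
 */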

void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
        struct gate_descriptor *ip;

        ip = idt + idx;
        ip->gd_looffset = (uintptr_t)func;
        ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
        ip->gd_ist = ist;
        ip->gd_xx = 0;
        ip->gd_type = typ;
        ip->gd_dpl = dpl;
        ip->gd_p = 1;
        ip->gd_hioffset = ((uintptr_t)func) >> 16;
}

#define IDTVEC(name)    __CONCAT(X,name)

extern inthand_t
        IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
        IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
        IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
        IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
        IDTVEC(xmm), IDTVEC(dblfault),
        IDTVEC(fast_syscall), IDTVEC(fast_syscall32);

#ifdef DEBUG_INTERRUPTS
extern inthand_t *Xrsvdary[256];
#endif

void
sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
{
        ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
        ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
        ssd->ssd_type  = sd->sd_type;
        ssd->ssd_dpl   = sd->sd_dpl;
        ssd->ssd_p     = sd->sd_p;
        ssd->ssd_def32 = sd->sd_def32;
        ssd->ssd_gran  = sd->sd_gran;
}

void
ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
{

        sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
        sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
        sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
        sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
        sd->sd_type  = ssd->ssd_type;
        sd->sd_dpl   = ssd->ssd_dpl;
        sd->sd_p     = ssd->ssd_p;
        sd->sd_long  = ssd->ssd_long;
        sd->sd_def32 = ssd->ssd_def32;
        sd->sd_gran  = ssd->ssd_gran;
}

void
ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
{

        sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
        sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
        sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
        sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
        sd->sd_type  = ssd->ssd_type;
        sd->sd_dpl   = ssd->ssd_dpl;
        sd->sd_p     = ssd->ssd_p;
        sd->sd_gran  = ssd->ssd_gran;
}

u_int basemem;

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * If we cannot accurately determine the physical memory map, then use
 * value from the 0xE801 call, and failing that, the RTC.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
        int i, off, physmap_idx, pa_indx, da_indx;
        vm_paddr_t pa, physmap[PHYSMAP_SIZE];
        u_long physmem_tunable;
        pt_entry_t *pte;
        struct bios_smap *smapbase, *smap, *smapend;
        u_int32_t smapsize;
        quad_t dcons_addr, dcons_size;

        bzero(physmap, sizeof(physmap));
        basemem = 0;
        physmap_idx = 0;

        /*
         * get memory map from INT 15:E820, kindly supplied by the loader.
         *
         * subr_module.c says:
         * "Consumer may safely assume that size value precedes data."
         * ie: an int32_t immediately precedes smap.
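         *
         * That size word is what the *((u_int32_t *)smapbase - 1)
         * dereference below picks up.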
         */
        smapbase = (struct bios_smap *)preload_search_info(kmdp,
            MODINFO_METADATA | MODINFOMD_SMAP);
        if (smapbase == NULL)
                panic("No BIOS smap info from loader!");

        smapsize = *((u_int32_t *)smapbase - 1);
        smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

        for (smap = smapbase; smap < smapend; smap++) {
                if (boothowto & RB_VERBOSE)
                        kprintf("SMAP type=%02x base=%016lx len=%016lx\n",
                            smap->type, smap->base, smap->length);

                if (smap->type != SMAP_TYPE_MEMORY)
                        continue;

                if (smap->length == 0)
                        continue;

                for (i = 0; i <= physmap_idx; i += 2) {
                        if (smap->base < physmap[i + 1]) {
                                if (boothowto & RB_VERBOSE)
                                        kprintf(
        "Overlapping or non-monotonic memory region, ignoring second region\n");
                                continue;
                        }
                }

                if (smap->base == physmap[physmap_idx + 1]) {
                        physmap[physmap_idx + 1] += smap->length;
                        continue;
                }

                physmap_idx += 2;
                if (physmap_idx == PHYSMAP_SIZE) {
                        kprintf(
                "Too many segments in the physical address map, giving up\n");
                        break;
                }
                physmap[physmap_idx] = smap->base;
                physmap[physmap_idx + 1] = smap->base + smap->length;
        }

        /*
         * Find the 'base memory' segment for SMP
         */
        basemem = 0;
        for (i = 0; i <= physmap_idx; i += 2) {
                if (physmap[i] == 0x00000000) {
                        basemem = physmap[i + 1] / 1024;
                        break;
                }
        }
        if (basemem == 0)
                panic("BIOS smap did not include a basemem segment!");

#ifdef SMP
        /* make hole for AP bootstrap code */
        physmap[1] = mp_bootaddress(physmap[1] / 1024);

        /* look for the MP hardware - needed for apic addresses */
        mp_probe();
#endif

        /*
         * Maxmem isn't the "maximum memory", it's one larger than the
         * highest page of the physical address space.  It should be
         * called something like "Maxphyspage".  We may adjust this
         * based on ``hw.physmem'' and the results of the memory test.
         */
        Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
        Maxmem = MAXMEM / 4;
#endif

        if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
                Maxmem = atop(physmem_tunable);

        /*
         * Don't allow MAXMEM or hw.physmem to extend the amount of memory
         * in the system.
         */
        if (Maxmem > atop(physmap[physmap_idx + 1]))
                Maxmem = atop(physmap[physmap_idx + 1]);

        if (atop(physmap[physmap_idx + 1]) != Maxmem &&
            (boothowto & RB_VERBOSE))
                kprintf("Physical memory use set to %ldK\n", Maxmem * 4);

        /* call pmap initialization to make new kernel address space */
        pmap_bootstrap(&first);

        /*
         * Size up each available chunk of physical memory.
         */
        physmap[0] = PAGE_SIZE;         /* mask off page 0 */
        pa_indx = 0;
        da_indx = 1;
        phys_avail[pa_indx++] = physmap[0];
        phys_avail[pa_indx] = physmap[0];
        dump_avail[da_indx] = physmap[0];
        pte = CMAP1;

        /*
         * Get dcons buffer address
         */
        if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 ||
            kgetenv_quad("dcons.size", &dcons_size) == 0)
                dcons_addr = 0;

        /*
         * physmap is in bytes, so when converting to page boundaries,
         * round up the start address and round down the end address.
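         * For example, a segment [base, base+len) only contributes the
         * fully contained pages [round_page(base), trunc_page(base+len)).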
         */
        for (i = 0; i <= physmap_idx; i += 2) {
                vm_paddr_t end;

                end = ptoa((vm_paddr_t)Maxmem);
                if (physmap[i + 1] < end)
                        end = trunc_page(physmap[i + 1]);
                for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
                        int tmp, page_bad, full;
                        int *ptr = (int *)CADDR1;

                        full = FALSE;
                        /*
                         * block out kernel memory as not available.
                         */
                        if (pa >= 0x100000 && pa < first)
                                goto do_dump_avail;

                        /*
                         * block out dcons buffer
                         */
                        if (dcons_addr > 0
                            && pa >= trunc_page(dcons_addr)
                            && pa < dcons_addr + dcons_size)
                                goto do_dump_avail;

                        page_bad = FALSE;

                        /*
                         * map page into kernel: valid, read/write,non-cacheable
                         */
                        *pte = pa | PG_V | PG_RW | PG_N;
                        cpu_invltlb();

                        tmp = *(int *)ptr;
                        /*
                         * Test for alternating 1's and 0's
                         */
                        *(volatile int *)ptr = 0xaaaaaaaa;
                        if (*(volatile int *)ptr != 0xaaaaaaaa)
                                page_bad = TRUE;
                        /*
                         * Test for alternating 0's and 1's
                         */
                        *(volatile int *)ptr = 0x55555555;
                        if (*(volatile int *)ptr != 0x55555555)
                                page_bad = TRUE;
                        /*
                         * Test for all 1's
                         */
                        *(volatile int *)ptr = 0xffffffff;
                        if (*(volatile int *)ptr != 0xffffffff)
                                page_bad = TRUE;
                        /*
                         * Test for all 0's
                         */
                        *(volatile int *)ptr = 0x0;
                        if (*(volatile int *)ptr != 0x0)
                                page_bad = TRUE;
                        /*
                         * Restore original value.
                         */
                        *(int *)ptr = tmp;

                        /*
                         * Adjust array of valid/good pages.
                         */
                        if (page_bad == TRUE)
                                continue;
                        /*
                         * If this good page is a continuation of the
                         * previous set of good pages, then just increase
                         * the end pointer.  Otherwise start a new chunk.
                         * Note that "end" points one beyond the last valid
                         * page, making the range >= start and < end.
                         * If we're also doing a speculative memory
                         * test and we are at or past the end, bump up Maxmem
                         * so that we keep going.  The first bad page
                         * will terminate the loop.
                         */
                        if (phys_avail[pa_indx] == pa) {
                                phys_avail[pa_indx] += PAGE_SIZE;
                        } else {
                                pa_indx++;
                                if (pa_indx == PHYS_AVAIL_ARRAY_END) {
                                        kprintf(
                "Too many holes in the physical address space, giving up\n");
                                        pa_indx--;
                                        full = TRUE;
                                        goto do_dump_avail;
                                }
                                phys_avail[pa_indx++] = pa;     /* start */
                                phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
                        }
                        physmem++;
do_dump_avail:
                        if (dump_avail[da_indx] == pa) {
                                dump_avail[da_indx] += PAGE_SIZE;
                        } else {
                                da_indx++;
                                if (da_indx == DUMP_AVAIL_ARRAY_END) {
                                        da_indx--;
                                        goto do_next;
                                }
                                dump_avail[da_indx++] = pa; /* start */
                                dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
                        }
do_next:
                        if (full)
                                break;
                }
        }
        *pte = 0;
        cpu_invltlb();

        /*
         * XXX
         * The last chunk must contain at least one page plus the message
         * buffer to avoid complicating other code (message buffer address
         * calculation, etc.).
         */
        while (phys_avail[pa_indx - 1] + PAGE_SIZE +
            round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) {
                physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
                phys_avail[pa_indx--] = 0;
                phys_avail[pa_indx--] = 0;
        }

        Maxmem = atop(phys_avail[pa_indx]);

        /* Trim off space for the message buffer. */
        phys_avail[pa_indx] -= round_page(MSGBUF_SIZE);

        avail_end = phys_avail[pa_indx];

        /* Map the message buffer. */
        for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
                pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
                    off);
}

/*
 * IDT VECTORS:
 *      0       Divide by zero
 *      1       Debug
 *      2       NMI
 *      3       BreakPoint
 *      4       OverFlow
 *      5       Bound-Range
 *      6       Invalid OpCode
 *      7       Device Not Available (x87)
 *      8       Double-Fault
 *      9       Coprocessor Segment overrun (unsupported, reserved)
 *      10      Invalid-TSS
 *      11      Segment not present
 *      12      Stack
 *      13      General Protection
 *      14      Page Fault
 *      15      Reserved
 *      16      x87 FP Exception pending
 *      17      Alignment Check
 *      18      Machine Check
 *      19      SIMD floating point
 *      20-31   reserved
 *      32-255  INTn/external sources
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
        caddr_t kmdp;
        int gsel_tss, x;
#if JG
        int metadata_missing, off;
#endif
        struct mdglobaldata *gd;
        u_int64_t msr;

        /*
         * Prevent lowering of the ipl if we call tsleep() early.
         */
        gd = &CPU_prvspace[0].mdglobaldata;
        bzero(gd, sizeof(*gd));

        /*
         * Note: on both UP and SMP curthread must be set non-NULL
         * early in the boot sequence because the system assumes
         * that 'curthread' is never NULL.
         */

        gd->mi.gd_curthread = &thread0;
        thread0.td_gd = &gd->mi;

        atdevbase = ISA_HOLE_START + PTOV_OFFSET;

#if JG
        metadata_missing = 0;
        if (bootinfo.bi_modulep) {
                preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
                preload_bootstrap_relocate(KERNBASE);
        } else {
                metadata_missing = 1;
        }
        if (bootinfo.bi_envp)
                kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
#endif

        preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET);
        preload_bootstrap_relocate(PTOV_OFFSET);
        kmdp = preload_search_by_type("elf kernel");
        if (kmdp == NULL)
                kmdp = preload_search_by_type("elf64 kernel");
        boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
        kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET;
#ifdef DDB
        ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
        ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
#endif

        /*
         * start with one cpu.  Note: with one cpu, ncpus2_shift, ncpus2_mask,
         * and ncpus_fit_mask remain 0.
         */
        ncpus = 1;
        ncpus2 = 1;
        ncpus_fit = 1;
        /* Init basic tunables, hz etc */
        init_param1();

        /*
         * make gdt memory segments
         */
        gdt_segs[GPROC0_SEL].ssd_base =
                (uintptr_t) &CPU_prvspace[0].mdglobaldata.gd_common_tss;

        gd->mi.gd_prvspace = &CPU_prvspace[0];

        for (x = 0; x < NGDT; x++) {
                if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
                        ssdtosd(&gdt_segs[x], &gdt[x]);
        }
        ssdtosyssd(&gdt_segs[GPROC0_SEL],
            (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

        r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
        r_gdt.rd_base = (long) gdt;
        lgdt(&r_gdt);

        wrmsr(MSR_FSBASE, 0);           /* User value */
        wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi);
        wrmsr(MSR_KGSBASE, 0);          /* User value while in the kernel */

        mi_gdinit(&gd->mi, 0);
        cpu_gdinit(gd, 0);
        proc0paddr = proc0paddr_buff;
        mi_proc0init(&gd->mi, proc0paddr);
        safepri = TDPRI_MAX;

        /* spinlocks and the BGL */
        init_locks();

        /* exceptions */
        for (x = 0; x < NIDT; x++)
                setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_DE, &IDTVEC(div),  SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_DB, &IDTVEC(dbg),  SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 1);
        setidt(IDT_BP, &IDTVEC(bpt),  SDT_SYSIGT, SEL_UPL, 0);
        setidt(IDT_OF, &IDTVEC(ofl),  SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_BR, &IDTVEC(bnd),  SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_UD, &IDTVEC(ill),  SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_NM, &IDTVEC(dna),  SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
        setidt(IDT_FPUGP, &IDTVEC(fpusegm),  SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_TS, &IDTVEC(tss),  SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_NP, &IDTVEC(missing),  SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_SS, &IDTVEC(stk),  SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_GP, &IDTVEC(prot),  SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_PF, &IDTVEC(page),  SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_MF, &IDTVEC(fpu),  SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_MC, &IDTVEC(mchk),  SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);

        r_idt.rd_limit = sizeof(idt0) - 1;
        r_idt.rd_base = (long) idt;
        lidt(&r_idt);

        /*
         * Initialize the console before we print anything out.
         */
        cninit();

#if JG
        if (metadata_missing)
                kprintf("WARNING: loader(8) metadata is missing!\n");
#endif

#if NISA > 0
        isa_defaultirq();
#endif
        rand_initialize();

#ifdef DDB
        kdb_init();
        if (boothowto & RB_KDB)
                Debugger("Boot flags requested debugger");
#endif

#if JG
        finishidentcpu();       /* Final stage of CPU initialization */
        setidt(6, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
        setidt(13, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
#endif
        identify_cpu();         /* Final stage of CPU initialization */
        initializecpu();        /* Initialize CPU registers */

        /* make an initial tss so cpu can get interrupt stack on syscall! */
        gd->gd_common_tss.tss_rsp0 =
                (register_t)(thread0.td_kstack +
                             KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb));
        /* Ensure the stack is aligned to 16 bytes */
        gd->gd_common_tss.tss_rsp0 &= ~0xFul;
        gd->gd_rsp0 = gd->gd_common_tss.tss_rsp0;

        /* doublefault stack space, runs on ist1 */
        gd->gd_common_tss.tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];

        /* Set the IO permission bitmap (empty due to tss seg limit) */
        gd->gd_common_tss.tss_iobase = sizeof(struct x86_64tss);

        gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
        gd->gd_tss_gdt = &gdt[GPROC0_SEL];
        gd->gd_common_tssd = *gd->gd_tss_gdt;
        ltr(gsel_tss);

        /* Set up the fast syscall stuff */
        msr = rdmsr(MSR_EFER) | EFER_SCE;
        wrmsr(MSR_EFER, msr);
        wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
        wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
        msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
              ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
        wrmsr(MSR_STAR, msr);
        wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);

        getmemsize(kmdp, physfree);
        init_param2(physmem);

        /* now running on new page tables, configured, and u/iom is accessible */

        /* Map the message buffer. */
#if JG
        for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
                pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
#endif

        msgbufinit(msgbufp, MSGBUF_SIZE);


        /* transfer to user mode */

        _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
        _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
        _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);

        load_ds(_udatasel);
        load_es(_udatasel);
        load_fs(_udatasel);

        /* setup proc 0's pcb */
        thread0.td_pcb->pcb_flags = 0;
        thread0.td_pcb->pcb_cr3 = KPML4phys;
        thread0.td_pcb->pcb_ext = 0;
        lwp0.lwp_md.md_regs = &proc0_tf;

        /* Location of kernel stack for locore */
        return ((u_int64_t)thread0.td_pcb);
}

/*
 * Initialize machine-dependent portions of the global data structure.
 * Note that the global data area and cpu0's idlestack in the private
 * data space were allocated in locore.
 *
 * Note: the idlethread's cpl is 0
 *
 * WARNING!  Called from early boot, 'mycpu' may not work yet.

/*
 * Initialize machine-dependent portions of the global data structure.
 * Note that the global data area and cpu0's idlestack in the private
 * data space were allocated in locore.
 *
 * Note: the idlethread's cpl is 0
 *
 * WARNING! Called from early boot, 'mycpu' may not work yet.
 */
void
cpu_gdinit(struct mdglobaldata *gd, int cpu)
{
	if (cpu)
		gd->mi.gd_curthread = &gd->mi.gd_idlethread;

	lwkt_init_thread(&gd->mi.gd_idlethread,
			 gd->mi.gd_prvspace->idlestack,
			 sizeof(gd->mi.gd_prvspace->idlestack),
			 0, &gd->mi);
	lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
	gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
	gd->mi.gd_idlethread.td_sp -= sizeof(void *);
	*(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
}

int
is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
{
	if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
	    eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
		return (TRUE);
	}
	return (FALSE);
}

struct globaldata *
globaldata_find(int cpu)
{
	KKASSERT(cpu >= 0 && cpu < ncpus);
	return (&CPU_prvspace[cpu].mdglobaldata.mi);
}

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
static void f00f_hack(void *unused);
SYSINIT(f00f_hack, SI_BOOT2_BIOS, SI_ORDER_ANY, f00f_hack, NULL);

static void
f00f_hack(void *unused)
{
	struct gate_descriptor *new_idt;
	vm_offset_t tmp;

	if (!has_f00f_bug)
		return;

	kprintf("Intel Pentium detected, installing workaround for F00F bug\n");

	r_idt.rd_limit = sizeof(idt0) - 1;

	tmp = kmem_alloc(&kernel_map, PAGE_SIZE * 2);
	if (tmp == 0)
		panic("kmem_alloc returned 0");
	if (((unsigned int)tmp & (PAGE_SIZE-1)) != 0)
		panic("kmem_alloc returned non-page-aligned memory");
	/* Put the first seven entries in the lower page */
	new_idt = (struct gate_descriptor *)(tmp + PAGE_SIZE - (7 * 8));
	bcopy(idt, new_idt, sizeof(idt0));
	r_idt.rd_base = (int)new_idt;
	lidt(&r_idt);
	idt = new_idt;
	if (vm_map_protect(&kernel_map, tmp, tmp + PAGE_SIZE,
	    VM_PROT_READ, FALSE) != KERN_SUCCESS)
		panic("vm_map_protect failed");
	return;
}
#endif /* defined(I586_CPU) && !defined(NO_F00F_HACK) */

int
ptrace_set_pc(struct lwp *lp, unsigned long addr)
{
	lp->lwp_md.md_regs->tf_rip = addr;
	return (0);
}

int
ptrace_single_step(struct lwp *lp)
{
	lp->lwp_md.md_regs->tf_rflags |= PSL_T;
	return (0);
}

int
fill_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	tp = lp->lwp_md.md_regs;
	bcopy(&tp->tf_rdi, &regs->r_rdi, sizeof(*regs));
	return (0);
}

int
set_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	tp = lp->lwp_md.md_regs;
	if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	bcopy(&regs->r_rdi, &tp->tf_rdi, sizeof(*regs));
	return (0);
}

#ifndef CPU_DISABLE_SSE
static void
fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_87->en_cw = penv_xmm->en_cw;
	penv_87->en_sw = penv_xmm->en_sw;
	penv_87->en_tw = penv_xmm->en_tw;
	penv_87->en_fip = penv_xmm->en_fip;
	penv_87->en_fcs = penv_xmm->en_fcs;
	penv_87->en_opcode = penv_xmm->en_opcode;
	penv_87->en_foo = penv_xmm->en_foo;
	penv_87->en_fos = penv_xmm->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;

	sv_87->sv_ex_sw = sv_xmm->sv_ex_sw;
}

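/*
 * set_fpregs_xmm() below is the inverse of fill_fpregs_xmm(): it
 * scatters a legacy save87 image back into the FXSAVE-format savexmm
 * area.  Only the fields common to both layouts are converted; the
 * SSE/XMM state itself is not touched.
 */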
static void
set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_xmm->en_cw = penv_87->en_cw;
	penv_xmm->en_sw = penv_87->en_sw;
	penv_xmm->en_tw = penv_87->en_tw;
	penv_xmm->en_fip = penv_87->en_fip;
	penv_xmm->en_fcs = penv_87->en_fcs;
	penv_xmm->en_opcode = penv_87->en_opcode;
	penv_xmm->en_foo = penv_87->en_foo;
	penv_xmm->en_fos = penv_87->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];

	sv_xmm->sv_ex_sw = sv_87->sv_ex_sw;
}
#endif /* CPU_DISABLE_SSE */

int
fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
#ifndef CPU_DISABLE_SSE
	if (cpu_fxsr) {
		fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
				(struct save87 *)fpregs);
		return (0);
	}
#endif /* CPU_DISABLE_SSE */
	bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
	return (0);
}

int
set_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
#ifndef CPU_DISABLE_SSE
	if (cpu_fxsr) {
		set_fpregs_xmm((struct save87 *)fpregs,
			       &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
		return (0);
	}
#endif /* CPU_DISABLE_SSE */
	bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
	return (0);
}

int
fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	if (lp == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[4] = rdr4();
		dbregs->dr[5] = rdr5();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
	} else {
		struct pcb *pcb;

		pcb = lp->lwp_thread->td_pcb;
		dbregs->dr[0] = pcb->pcb_dr0;
		dbregs->dr[1] = pcb->pcb_dr1;
		dbregs->dr[2] = pcb->pcb_dr2;
		dbregs->dr[3] = pcb->pcb_dr3;
		dbregs->dr[4] = 0;	/* dr4/dr5 are reserved, not saved */
		dbregs->dr[5] = 0;
		dbregs->dr[6] = pcb->pcb_dr6;
		dbregs->dr[7] = pcb->pcb_dr7;
	}
	return (0);
}

int
set_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	if (lp == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr4(dbregs->dr[4]);
		load_dr5(dbregs->dr[5]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		struct pcb *pcb;
		struct ucred *ucred;
		int i;
		uint64_t mask1, mask2;

		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an
		 * unexpected TRCTRAP.
		 */
		/* JG this loop looks unreadable */
		/*
		 * Check the 4 2-bit fields for invalid patterns.
		 * These fields are R/Wi, for i = 0..3.
		 */
		/* Is 10 in LENi allowed when running in compatibility mode? */
		/*
		 * Pattern 10 in R/Wi might be used to indicate a
		 * breakpoint on I/O.  Further analysis should be
		 * carried out to decide if it is safe and useful to
		 * provide access to that capability.
		 */
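		/*
		 * Concretely: mask1 selects the 2-bit R/Wi field for
		 * breakpoint i, i.e. (dr7 >> (16 + i * 4)) & 3, starting
		 * at bits 16-17 for dr0 and stepping 4 bits per register;
		 * mask2 is the undefined pattern 10 that we refuse to
		 * install.
		 */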
		for (i = 0, mask1 = 0x3 << 16, mask2 = 0x2 << 16; i < 4;
		     i++, mask1 <<= 4, mask2 <<= 4)
			if ((dbregs->dr[7] & mask1) == mask2)
				return (EINVAL);

		pcb = lp->lwp_thread->td_pcb;
		ucred = lp->lwp_proc->p_ucred;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to make sure that no
		 * breakpoints are being enabled for addresses outside the
		 * process's address space, unless, perhaps, we were called by
		 * uid 0.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */

		if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) {
			if (dbregs->dr[7] & 0x3) {
				/* dr0 is enabled */
				if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3 << 2)) {
				/* dr1 is enabled */
				if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3 << 4)) {
				/* dr2 is enabled */
				if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3 << 6)) {
				/* dr3 is enabled */
				if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		pcb->pcb_flags |= PCB_DBREGS;
	}

	return (0);
}

/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(void)
{
	u_int64_t dr7, dr6;	/* debug registers dr6 and dr7 */
	u_int64_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0xff) == 0) {
		/*
		 * All of the L and G enable bits in dr7 are zero,
		 * so the trap cannot have been caused by the
		 * hardware debug registers.
		 */
		return 0;
	}

	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0xf;

	if (bp == 0) {
		/*
		 * None of the breakpoint bits are set, meaning this
		 * trap was not caused by any of the debug registers.
		 */
		return 0;
	}

	/*
	 * At least one of the breakpoints was hit; check to see
	 * which ones and if any of them are user space addresses.
	 */

	if (bp & 0x01) {
		addr[nbp++] = (caddr_t)rdr0();
	}
	if (bp & 0x02) {
		addr[nbp++] = (caddr_t)rdr1();
	}
	if (bp & 0x04) {
		addr[nbp++] = (caddr_t)rdr2();
	}
	if (bp & 0x08) {
		addr[nbp++] = (caddr_t)rdr3();
	}

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) {
			/*
			 * addr[i] is in user space
			 */
			return nbp;
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return 0;
}

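/*
 * For reference (Intel SDM vol. 3 / AMD APM vol. 2): dr6 bits 0-3
 * (B0-B3) report which breakpoint fired, and dr7 bits 0-7 hold the
 * local/global enable pair for each of the four breakpoints, which is
 * what the (dr6 & 0xf) and (dr7 & 0xff) tests above rely on.
 */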

#ifndef DDB
void
Debugger(const char *msg)
{
	kprintf("Debugger(\"%s\") called.\n", msg);
}
#endif /* no DDB */

#ifdef DDB

/*
 * Provide inb() and outb() as functions.  They are normally only
 * available as macros calling inlined functions, thus cannot be
 * called inside DDB.
 *
 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
 */

#undef inb
#undef outb

/* silence compiler warnings */
u_char inb(u_int);
void outb(u_int, u_char);

u_char
inb(u_int port)
{
	u_char	data;
	/*
	 * We use %%dx and not %1 here because i/o is done at %dx and not at
	 * %edx, while gcc generates inferior code (movw instead of movl)
	 * if we tell it to load (u_short) port.
	 */
	__asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
	return (data);
}

void
outb(u_int port, u_char data)
{
	u_char	al;
	/*
	 * Use an unnecessary assignment to help gcc's register allocator.
	 * This makes a large difference for gcc-1.40 and a tiny difference
	 * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
	 * best results.  gcc-2.6.0 can't handle this.
	 */
	al = data;
	__asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
}

#endif /* DDB */

/*
 * initialize all the SMP locks
 */

/* critical region when masking or unmasking interrupts */
struct spinlock_deprecated imen_spinlock;

/* critical region for old style disable_intr/enable_intr */
struct spinlock_deprecated mpintr_spinlock;

/* critical region around INTR() routines */
struct spinlock_deprecated intr_spinlock;

/* lock region used by kernel profiling */
struct spinlock_deprecated mcount_spinlock;

/* locks com (tty) data/hardware accesses: a FASTINTR() */
struct spinlock_deprecated com_spinlock;

/* lock regions around the clock hardware */
struct spinlock_deprecated clock_spinlock;

static void
init_locks(void)
{
	/*
	 * mp_lock = 0;	BSP already owns the MP lock
	 */
	/*
	 * Get the initial mp_lock with a count of 1 for the BSP.
	 * This uses a LOGICAL cpu ID, ie BSP == 0.
	 */
#ifdef SMP
	cpu_get_initial_mplock();
#endif
	/* DEPRECATED */
	spin_lock_init(&mcount_spinlock);
	spin_lock_init(&intr_spinlock);
	spin_lock_init(&mpintr_spinlock);
	spin_lock_init(&imen_spinlock);
	spin_lock_init(&com_spinlock);
	spin_lock_init(&clock_spinlock);

	/* our token pool needs to work early */
	lwkt_token_pool_init();
}