1 /*- 2 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 3 * Copyright (c) 1992 Terrence R. Lambert. 4 * Copyright (c) 2003 Peter Wemm. 5 * Copyright (c) 2008 The DragonFly Project. 6 * All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * William Jolitz. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the University of 22 * California, Berkeley and its contributors. 23 * 4. Neither the name of the University nor the names of its contributors 24 * may be used to endorse or promote products derived from this software 25 * without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 37 * SUCH DAMAGE. 38 * 39 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 40 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $ 41 */ 42 43 #include "use_ether.h" 44 //#include "use_npx.h" 45 #include "use_isa.h" 46 #include "opt_atalk.h" 47 #include "opt_compat.h" 48 #include "opt_cpu.h" 49 #include "opt_ddb.h" 50 #include "opt_directio.h" 51 #include "opt_inet.h" 52 #include "opt_ipx.h" 53 #include "opt_msgbuf.h" 54 #include "opt_swap.h" 55 56 #include <sys/param.h> 57 #include <sys/systm.h> 58 #include <sys/sysproto.h> 59 #include <sys/signalvar.h> 60 #include <sys/kernel.h> 61 #include <sys/linker.h> 62 #include <sys/malloc.h> 63 #include <sys/proc.h> 64 #include <sys/priv.h> 65 #include <sys/buf.h> 66 #include <sys/reboot.h> 67 #include <sys/mbuf.h> 68 #include <sys/msgbuf.h> 69 #include <sys/sysent.h> 70 #include <sys/sysctl.h> 71 #include <sys/vmmeter.h> 72 #include <sys/bus.h> 73 #include <sys/upcall.h> 74 #include <sys/usched.h> 75 #include <sys/reg.h> 76 77 #include <vm/vm.h> 78 #include <vm/vm_param.h> 79 #include <sys/lock.h> 80 #include <vm/vm_kern.h> 81 #include <vm/vm_object.h> 82 #include <vm/vm_page.h> 83 #include <vm/vm_map.h> 84 #include <vm/vm_pager.h> 85 #include <vm/vm_extern.h> 86 87 #include <sys/thread2.h> 88 #include <sys/mplock2.h> 89 90 #include <sys/user.h> 91 #include <sys/exec.h> 92 #include <sys/cons.h> 93 94 #include <ddb/ddb.h> 95 96 #include <machine/cpu.h> 97 #include <machine/clock.h> 98 
#include <machine/specialreg.h>
#if JG
#include <machine/bootinfo.h>
#endif
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/pcb_ext.h>		/* pcb.h included via sys/user.h */
#include <machine/globaldata.h>		/* CPU_prvspace */
#include <machine/smp.h>
#ifdef PERFMON
#include <machine/perfmon.h>
#endif
#include <machine/cputypes.h>

#ifdef OLD_BUS_ARCH
#include <bus/isa/isa_device.h>
#endif
#include <machine_base/isa/intr_machdep.h>
#include <bus/isa/rtc.h>
#include <sys/random.h>
#include <sys/ptrace.h>
#include <machine/sigframe.h>

#define PHYSMAP_ENTRIES		10

/* Routines defined outside this file. */
extern void init386(int first);
extern void dblfault_handler(void);
extern u_int64_t hammer_time(u_int64_t, u_int64_t);

extern void printcpuinfo(void);	/* XXX header file */
extern void identify_cpu(void);
#if JG
extern void finishidentcpu(void);
#endif
extern void panicifcpuunsupported(void);

/* Forward declarations for file-local routines. */
static void cpu_startup(void *);
#ifndef CPU_DISABLE_SSE
static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
#endif /* CPU_DISABLE_SSE */
#ifdef DIRECTIO
extern void ffs_rawread_setup(void);
#endif /* DIRECTIO */
static void init_locks(void);

/*
 * Register cpu_startup() with SYSINIT under SI_BOOT2_SMP / SI_ORDER_FIRST;
 * it is invoked with a NULL argument.
 */
SYSINIT(cpu, SI_BOOT2_SMP, SI_ORDER_FIRST, cpu_startup, NULL)

#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif

uint64_t SMPptpa;
pt_entry_t *SMPpt;


/* One privatespace per possible cpu. */
struct privatespace CPU_prvspace[MAXCPU];

/* User data / code / 32-bit code selectors. */
int	_udatasel, _ucodesel, _ucode32sel;
u_long	atdevbase;
/* One TSC offset slot per cpu on SMP kernels, a single slot otherwise. */
#ifdef SMP
int64_t tsc_offsets[MAXCPU];
#else
int64_t tsc_offsets[1];
#endif

/* Read-only debug sysctls, only built with SWTCH_OPTIM_STATS. */
#if defined(SWTCH_OPTIM_STATS)
extern int swtch_optim_stats;
SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
	CTLFLAG_RD, &swtch_optim_stats, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, 170 CTLFLAG_RD, &tlb_flush_count, 0, ""); 171 #endif 172 173 int physmem = 0; 174 175 static int 176 sysctl_hw_physmem(SYSCTL_HANDLER_ARGS) 177 { 178 int error = sysctl_handle_int(oidp, 0, ctob(physmem), req); 179 return (error); 180 } 181 182 SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD, 183 0, 0, sysctl_hw_physmem, "IU", ""); 184 185 static int 186 sysctl_hw_usermem(SYSCTL_HANDLER_ARGS) 187 { 188 int error = sysctl_handle_int(oidp, 0, 189 ctob(physmem - vmstats.v_wire_count), req); 190 return (error); 191 } 192 193 SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD, 194 0, 0, sysctl_hw_usermem, "IU", ""); 195 196 static int 197 sysctl_hw_availpages(SYSCTL_HANDLER_ARGS) 198 { 199 int error = sysctl_handle_int(oidp, 0, 200 x86_64_btop(avail_end - avail_start), req); 201 return (error); 202 } 203 204 SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD, 205 0, 0, sysctl_hw_availpages, "I", ""); 206 207 vm_paddr_t Maxmem = 0; 208 209 /* 210 * The number of PHYSMAP entries must be one less than the number of 211 * PHYSSEG entries because the PHYSMAP entry that spans the largest 212 * physical address that is accessible by ISA DMA is split into two 213 * PHYSSEG entries. 
*/
#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))

/*
 * Pairs of physical addresses; cpu_startup() walks phys_avail two entries
 * at a time, treating them as (start, end) and stopping when an end
 * entry is 0.
 */
vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* must be 2 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
#define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)

/* Start/end addresses filled in by the kmem_suballoc() calls below. */
static vm_offset_t buffer_sva, buffer_eva;
vm_offset_t clean_sva, clean_eva;
static vm_offset_t pager_sva, pager_eva;
static struct trapframe proc0_tf;

/*
 * cpu_startup() - machine-dependent startup.
 *
 * Prints the version banner and memory summary, sizes nbuf/nswbuf, lays
 * out the swbuf and buf arrays in a kmem_alloc()'d region (two passes over
 * the valloc() sequence), carves clean_map, buffer_map and pager_map out
 * of kernel_map, then calls bufinit(), vm_pager_bufferinit(), on SMP
 * kernels mp_start()/mp_announce(), and finally cpu_setregs().
 *
 * The dummy argument is not used.
 */
static void
cpu_startup(void *dummy)
{
	caddr_t v;
	vm_size_t size = 0;
	vm_offset_t firstaddr;

	if (boothowto & RB_VERBOSE)
		bootverbose++;

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	kprintf("%s", version);
	startrtclock();
	printcpuinfo();
	panicifcpuunsupported();
#ifdef PERFMON
	perfmon_init();
#endif
	kprintf("real memory = %ju (%ju MB)\n",
		(intmax_t)ptoa(Maxmem),
		(intmax_t)ptoa(Maxmem) / 1024 / 1024);
	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		kprintf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size1 = phys_avail[indx + 1] - phys_avail[indx];

			kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
				(intmax_t)phys_avail[indx],
				(intmax_t)phys_avail[indx + 1] - 1,
				(intmax_t)size1,
				(intmax_t)(size1 / PAGE_SIZE));
		}
	}

	/*
	 * Allocate space for system data structures.
	 * The first available kernel virtual address is in "v".
	 * As pages of kernel virtual memory are allocated, "v" is incremented.
	 * As pages of memory are allocated and cleared,
	 * "firstaddr" is incremented.
	 * An index into the kernel page table corresponding to the
	 * virtual memory address maintained in "v" is kept in "mapaddr".
	 */

	/*
	 * Make two passes.  The first pass calculates how much memory is
	 * needed and allocates it.  The second pass assigns virtual
	 * addresses to the various data structures.
	 *
	 * On the first pass firstaddr is 0, so "v" ends up holding the
	 * total size; the code then allocates and jumps back to "again".
	 */
	firstaddr = 0;
again:
	v = (caddr_t)firstaddr;

	/* valloc(): point name at v, then advance v past num elements. */
#define	valloc(name, type, num) \
	    (name) = (type *)v; v = (caddr_t)((name)+(num))
	/* valloclim(): as valloc(), additionally storing the end in lim. */
#define	valloclim(name, type, num, lim) \
	    (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))

	/*
	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
	 * For the first 64MB of ram nominally allocate sufficient buffers to
	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
	 * buffers to cover 1/20 of our ram over 64MB.  When auto-sizing
	 * the buffer cache we limit the eventual kva reservation to
	 * maxbcache bytes.
	 *
	 * factor represents the 1/4 x ram conversion.
	 *
	 * NOTE(review): kbytes is an int computed from physmem; confirm it
	 * cannot overflow on very large memory configurations.
	 */
	if (nbuf == 0) {
		int factor = 4 * BKVASIZE / 1024;
		int kbytes = physmem * (PAGE_SIZE / 1024);

		nbuf = 50;
		if (kbytes > 4096)
			nbuf += min((kbytes - 4096) / factor, 65536 / factor);
		if (kbytes > 65536)
			nbuf += (kbytes - 65536) * 2 / (factor * 5);
		if (maxbcache && nbuf > maxbcache / BKVASIZE)
			nbuf = maxbcache / BKVASIZE;
	}

	/*
	 * Do not allow the buffer_map to be more than 1/2 the size of the
	 * kernel_map.
	 */
	if (nbuf > (virtual_end - virtual_start) / (BKVASIZE * 2)) {
		nbuf = (virtual_end - virtual_start) / (BKVASIZE * 2);
		kprintf("Warning: nbufs capped at %d\n", nbuf);
	}

	/* nswbuf is nbuf/4 clamped to the range [16, 256]. */
	nswbuf = max(min(nbuf/4, 256), 16);
#ifdef NSWBUF_MIN
	if (nswbuf < NSWBUF_MIN)
		nswbuf = NSWBUF_MIN;
#endif
#ifdef DIRECTIO
	ffs_rawread_setup();
#endif

	valloc(swbuf, struct buf, nswbuf);
	valloc(buf, struct buf, nbuf);

	/*
	 * End of first pass, size has been calculated so allocate memory
	 */
	if (firstaddr == 0) {
		size = (vm_size_t)(v - firstaddr);
		firstaddr = kmem_alloc(&kernel_map, round_page(size));
		if (firstaddr == 0)
			panic("startup: no room for tables");
		goto again;
	}

	/*
	 * End of second pass, addresses have been assigned
	 */
	if ((vm_size_t)(v - firstaddr) != size)
		panic("startup: table size inconsistency");

	/*
	 * clean_map is sized to hold both sub-maps created from it below:
	 * buffer_map (nbuf * BKVASIZE) and pager_map
	 * (nswbuf * MAXPHYS + pager_map_size).
	 */
	kmem_suballoc(&kernel_map, &clean_map, &clean_sva, &clean_eva,
		      (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size);
	kmem_suballoc(&clean_map, &buffer_map, &buffer_sva, &buffer_eva,
		      (nbuf*BKVASIZE));
	buffer_map.system_map = 1;
	kmem_suballoc(&clean_map, &pager_map, &pager_sva, &pager_eva,
		      (nswbuf*MAXPHYS) + pager_map_size);
	pager_map.system_map = 1;

#if defined(USERCONFIG)
	userconfig();
	cninit();		/* the preferred console may have changed */
#endif

	kprintf("avail memory = %ju (%ju MB)\n",
		(uintmax_t)ptoa(vmstats.v_free_count),
		(uintmax_t)ptoa(vmstats.v_free_count) / 1024 / 1024);

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

#ifdef SMP
	/*
	 * OK, enough kmem_alloc/malloc state should be up, lets get on with it!
	 */
	mp_start();                     /* fire up the APs and APICs */
	mp_announce();
#endif  /* SMP */
	cpu_setregs();
}

/*
 * Send an interrupt to process.
*
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 *
 * catcher - handler address, stored in the frame's sf_ahu union
 * sig     - signal number (translated through sv_sigtbl when the
 *           process' sysent provides one)
 * mask    - signal mask saved into the frame's uc_sigmask
 * code    - passed as argument 2 to old-style handlers, or stored in
 *           si_code for handlers in ps_siginfo
 *
 * A struct sigframe is built in the local "sf", copied out to the user
 * stack at "sfp", and the trapframe is redirected to the signal
 * trampoline.  If the copyout fails, sigexit(lp, SIGILL) is called.
 */
void
sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = lp->lwp_proc;
	struct trapframe *regs;
	struct sigacts *psp = p->p_sigacts;
	struct sigframe sf, *sfp;
	int oonstack;
	char *sp;

	regs = lp->lwp_md.md_regs;
	/* Remember whether we were already on the alternate signal stack. */
	oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;

	/* Save user context */
	bzero(&sf, sizeof(struct sigframe));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = lp->lwp_sigstk;
	sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
	/*
	 * The trapframe is copied wholesale starting at mc_rdi, which
	 * relies on tf_rdi being the first trapframe member.
	 */
	KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0);
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe));

	/* Make the size of the saved context visible to userland */
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);

	/* Save mailbox pending state for syscall interlock semantics */
	if (p->p_flag & P_MAILBOX)
		sf.sf_uc.uc_mcontext.mc_xflags |= PGEX_MAILBOX;

	/* Allocate and validate space for the signal handler context. */
	if ((lp->lwp_flag & LWP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		/* Place the frame at the top of the alternate stack. */
		sp = (char *)(lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size -
			      sizeof(struct sigframe));
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	} else {
		/* We take red zone into account */
		sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
	}

	/* Align to 16 bytes */
	sfp = (struct sigframe *)((intptr_t)sp & ~0xFUL);

	/* Translate the signal if appropriate */
	if (p->p_sysent->sv_sigtbl) {
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
	}

	/*
	 * Build the argument list for the signal handler.
	 *
	 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
	 */
	regs->tf_rdi = sig;				/* argument 1 */
	regs->tf_rdx = (register_t)&sfp->sf_uc;		/* argument 3 */

	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/*
		 * Signal handler installed with SA_SIGINFO.
		 *
		 * action(signo, siginfo, ucontext)
		 */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* fill siginfo structure */
		sf.sf_si.si_signo = sig;
		sf.sf_si.si_code = code;
		sf.sf_si.si_addr = (void *)regs->tf_addr;
	} else {
		/*
		 * Old FreeBSD-style arguments.
		 *
		 * handler (signo, code, [uc], addr)
		 */
		regs->tf_rsi = (register_t)code;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_handler = catcher;
	}

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 *
	 * (Only compiled when JG is non-zero.)
	 */
#if JG
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}
#endif

	/*
	 * Save the FPU state and reinit the FP unit
	 */
	npxpush(&sf.sf_uc.uc_mcontext);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
		/*
		 * Something is wrong with the stack pointer.
		 * ...Kill the process.
		 */
		sigexit(lp, SIGILL);
	}

	/* Resume userland on the new frame, at the signal trampoline. */
	regs->tf_rsp = (register_t)sfp;
	regs->tf_rip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);

	/*
	 * i386 abi specifies that the direction flag must be cleared
	 * on function entry
	 */
	regs->tf_rflags &= ~(PSL_T|PSL_D);

	/*
	 * 64 bit mode has a code and stack selector but
	 * no data or extra selector.  %fs and %gs are not
	 * stored in-context.
	 */
	regs->tf_cs = _ucodesel;
	regs->tf_ss = _udatasel;
}

/*
 * Sanitize the trapframe for a virtual kernel passing control to a custom
 * VM context.  Remove any items that would otherwise create a privilege
 * issue.
*
 * XXX at the moment we allow userland to set the resume flag.  Is this a
 * bad idea?
 *
 * Forces %cs/%ss to the user selectors, masks tf_rflags down to
 * PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP and then forces
 * PSL_RESERVED_DEFAULT | PSL_I on.  Always returns 0.
 */
int
cpu_sanitize_frame(struct trapframe *frame)
{
	frame->tf_cs = _ucodesel;
	frame->tf_ss = _udatasel;
	/* XXX VM (8086) mode not supported? */
	frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP);
	frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I;

	return(0);
}

/*
 * Sanitize the tls so loading the descriptor does not blow up
 * on us.  For x86_64 we don't have to do anything.
 *
 * The tls argument is not examined; always returns 0.
 */
int
cpu_sanitize_tls(struct savetls *tls)
{
	 return(0);
}

/*
 * sigreturn(ucontext_t *sigcntxp)
 *
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig. Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * Returns the copyin() error if the context cannot be read, EINVAL if
 * the flags or %cs checks fail, and EJUSTRETURN on success.
 *
 * MPSAFE
 */
/* True when ef and oef differ only in PSL_USERCHANGE bits. */
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
/* True when the selector's privilege level is the user level. */
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)

int
sys_sigreturn(struct sigreturn_args *uap)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = lp->lwp_proc;
	struct trapframe *regs;
	ucontext_t uc;
	ucontext_t *ucp;
	register_t rflags;
	int cs;
	int error;

	/*
	 * We have to copy the information into kernel space so userland
	 * can't modify it while we are sniffing it.
	 */
	regs = lp->lwp_md.md_regs;
	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error)
		return (error);
	ucp = &uc;
	rflags = ucp->uc_mcontext.mc_rflags;

	/* VM (8086) mode not supported */
	rflags &= ~PSL_VM_UNSUPP;

	/* The vm86 path below is only compiled when JG is non-zero. */
#if JG
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (lp->lwp_thread->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* go back to user mode if both flags are set */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(lp, SIGBUS, 0);

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = tf->tf_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
		tf->tf_gs = _udatasel;
	} else
#endif
	{
		/*
		 * Don't allow users to change privileged or reserved flags.
		 */
		/*
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
			kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags);
			return(EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		cs = ucp->uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			kprintf("sigreturn: cs = 0x%x\n", cs);
			trapsignal(lp, SIGBUS, T_PROTFLT);
			return(EINVAL);
		}
		/* Checks passed: install the saved register state. */
		bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(struct trapframe));
	}

	/*
	 * Restore the FPU state from the frame
	 */
	crit_enter();
	npxpop(&ucp->uc_mcontext);

	/*
	 * Merge saved signal mailbox pending flag to maintain interlock
	 * semantics against system calls.
	 */
	if (ucp->uc_mcontext.mc_xflags & PGEX_MAILBOX)
		p->p_flag |= P_MAILBOX;

	/* Restore the on-alternate-stack state recorded by sendsig(). */
	if (ucp->uc_mcontext.mc_onstack & 1)
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	else
		lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;

	lp->lwp_sigmask = ucp->uc_sigmask;
	SIG_CANTMASK(lp->lwp_sigmask);
	crit_exit();
	return(EJUSTRETURN);
}

/*
 * Stack frame on entry to function.  %rax will contain the function vector,
 * %rcx will contain the function data.  flags, rcx, and rax will have
 * already been pushed on the stack.
718 */ 719 struct upc_frame { 720 register_t rax; 721 register_t rcx; 722 register_t rdx; 723 register_t flags; 724 register_t oldip; 725 }; 726 727 void 728 sendupcall(struct vmupcall *vu, int morepending) 729 { 730 struct lwp *lp = curthread->td_lwp; 731 struct trapframe *regs; 732 struct upcall upcall; 733 struct upc_frame upc_frame; 734 int crit_count = 0; 735 736 /* 737 * If we are a virtual kernel running an emulated user process 738 * context, switch back to the virtual kernel context before 739 * trying to post the signal. 740 */ 741 if (lp->lwp_vkernel && lp->lwp_vkernel->ve) { 742 lp->lwp_md.md_regs->tf_trapno = 0; 743 vkernel_trap(lp, lp->lwp_md.md_regs); 744 } 745 746 /* 747 * Get the upcall data structure 748 */ 749 if (copyin(lp->lwp_upcall, &upcall, sizeof(upcall)) || 750 copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int)) 751 ) { 752 vu->vu_pending = 0; 753 kprintf("bad upcall address\n"); 754 return; 755 } 756 757 /* 758 * If the data structure is already marked pending or has a critical 759 * section count, mark the data structure as pending and return 760 * without doing an upcall. vu_pending is left set. 761 */ 762 if (upcall.upc_pending || crit_count >= vu->vu_pending) { 763 if (upcall.upc_pending < vu->vu_pending) { 764 upcall.upc_pending = vu->vu_pending; 765 copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending, 766 sizeof(upcall.upc_pending)); 767 } 768 return; 769 } 770 771 /* 772 * We can run this upcall now, clear vu_pending. 773 * 774 * Bump our critical section count and set or clear the 775 * user pending flag depending on whether more upcalls are 776 * pending. The user will be responsible for calling 777 * upc_dispatch(-1) to process remaining upcalls. 
778 */ 779 vu->vu_pending = 0; 780 upcall.upc_pending = morepending; 781 crit_count += TDPRI_CRIT; 782 copyout(&upcall.upc_pending, &lp->lwp_upcall->upc_pending, 783 sizeof(upcall.upc_pending)); 784 copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, 785 sizeof(int)); 786 787 /* 788 * Construct a stack frame and issue the upcall 789 */ 790 regs = lp->lwp_md.md_regs; 791 upc_frame.rax = regs->tf_rax; 792 upc_frame.rcx = regs->tf_rcx; 793 upc_frame.rdx = regs->tf_rdx; 794 upc_frame.flags = regs->tf_rflags; 795 upc_frame.oldip = regs->tf_rip; 796 if (copyout(&upc_frame, (void *)(regs->tf_rsp - sizeof(upc_frame)), 797 sizeof(upc_frame)) != 0) { 798 kprintf("bad stack on upcall\n"); 799 } else { 800 regs->tf_rax = (register_t)vu->vu_func; 801 regs->tf_rcx = (register_t)vu->vu_data; 802 regs->tf_rdx = (register_t)lp->lwp_upcall; 803 regs->tf_rip = (register_t)vu->vu_ctx; 804 regs->tf_rsp -= sizeof(upc_frame); 805 } 806 } 807 808 /* 809 * fetchupcall occurs in the context of a system call, which means that 810 * we have to return EJUSTRETURN in order to prevent eax and edx from 811 * being overwritten by the syscall return value. 812 * 813 * if vu is not NULL we return the new context in %edx, the new data in %ecx, 814 * and the function pointer in %eax. 815 */ 816 int 817 fetchupcall(struct vmupcall *vu, int morepending, void *rsp) 818 { 819 struct upc_frame upc_frame; 820 struct lwp *lp = curthread->td_lwp; 821 struct trapframe *regs; 822 int error; 823 struct upcall upcall; 824 int crit_count; 825 826 regs = lp->lwp_md.md_regs; 827 828 error = copyout(&morepending, &lp->lwp_upcall->upc_pending, sizeof(int)); 829 if (error == 0) { 830 if (vu) { 831 /* 832 * This jumps us to the next ready context. 
833 */ 834 vu->vu_pending = 0; 835 error = copyin(lp->lwp_upcall, &upcall, sizeof(upcall)); 836 crit_count = 0; 837 if (error == 0) 838 error = copyin((char *)upcall.upc_uthread + upcall.upc_critoff, &crit_count, sizeof(int)); 839 crit_count += TDPRI_CRIT; 840 if (error == 0) 841 error = copyout(&crit_count, (char *)upcall.upc_uthread + upcall.upc_critoff, sizeof(int)); 842 regs->tf_rax = (register_t)vu->vu_func; 843 regs->tf_rcx = (register_t)vu->vu_data; 844 regs->tf_rdx = (register_t)lp->lwp_upcall; 845 regs->tf_rip = (register_t)vu->vu_ctx; 846 regs->tf_rsp = (register_t)rsp; 847 } else { 848 /* 849 * This returns us to the originally interrupted code. 850 */ 851 error = copyin(rsp, &upc_frame, sizeof(upc_frame)); 852 regs->tf_rax = upc_frame.rax; 853 regs->tf_rcx = upc_frame.rcx; 854 regs->tf_rdx = upc_frame.rdx; 855 regs->tf_rflags = (regs->tf_rflags & ~PSL_USERCHANGE) | 856 (upc_frame.flags & PSL_USERCHANGE); 857 regs->tf_rip = upc_frame.oldip; 858 regs->tf_rsp = (register_t)((char *)rsp + sizeof(upc_frame)); 859 } 860 } 861 if (error == 0) 862 error = EJUSTRETURN; 863 return(error); 864 } 865 866 /* 867 * Machine dependent boot() routine 868 * 869 * I haven't seen anything to put here yet 870 * Possibly some stuff might be grafted back here from boot() 871 */ 872 void 873 cpu_boot(int howto) 874 { 875 } 876 877 /* 878 * Shutdown the CPU as much as possible 879 */ 880 void 881 cpu_halt(void) 882 { 883 for (;;) 884 __asm__ __volatile("hlt"); 885 } 886 887 /* 888 * cpu_idle() represents the idle LWKT. You cannot return from this function 889 * (unless you want to blow things up!). Instead we look for runnable threads 890 * and loop or halt as appropriate. Giant is not held on entry to the thread. 891 * 892 * The main loop is entered with a critical section held, we must release 893 * the critical section before doing anything else. lwkt_switch() will 894 * check for pending interrupts due to entering and exiting its own 895 * critical section. 
*
 * Note on cpu_idle_hlt:  On an SMP system we rely on a scheduler IPI
 * to wake a HLTed cpu up.  However, there are cases where the idlethread
 * will be entered with the possibility that no IPI will occur and in such
 * cases lwkt_switch() sets TDF_IDLE_NOHLT.
 */
/* Idle-loop tunable and counters, exported under machdep.* below. */
static int	cpu_idle_hlt = 1;
static int	cpu_idle_hltcnt;
static int	cpu_idle_spincnt;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hltcnt, CTLFLAG_RW,
    &cpu_idle_hltcnt, 0, "Idle loop entry halts");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_spincnt, CTLFLAG_RW,
    &cpu_idle_spincnt, 0, "Idle loop entry spins");

/*
 * Default idle hook: enable interrupts and halt.
 */
static void
cpu_idle_default_hook(void)
{
	/*
	 * We must guarantee that hlt is exactly the instruction
	 * following the sti.
	 */
	__asm __volatile("sti; hlt");
}

/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = cpu_idle_default_hook;

/*
 * Idle thread body.  Releases the critical section it was entered with
 * and then loops forever: switch to any runnable thread, then either
 * halt through cpu_idle_hook() or spin, counting each case in
 * cpu_idle_hltcnt / cpu_idle_spincnt.
 */
void
cpu_idle(void)
{
	struct thread *td = curthread;

	crit_exit();
	KKASSERT(td->td_pri < TDPRI_CRIT);
	for (;;) {
		/*
		 * See if there are any LWKTs ready to go.
		 */
		lwkt_switch();

		/*
		 * If we are going to halt call splz unconditionally after
		 * CLIing to catch any interrupt races.  Note that we are
		 * at SPL0 and interrupts are enabled.
		 */
		if (cpu_idle_hlt && !lwkt_runnable() &&
		    (td->td_flags & TDF_IDLE_NOHLT) == 0) {
			__asm __volatile("cli");
			splz();
			/* Re-check after splz() before actually halting. */
			if (!lwkt_runnable())
				cpu_idle_hook();
#ifdef SMP
			else
				__asm __volatile("pause");
#endif
			++cpu_idle_hltcnt;
		} else {
			/* Spin path: consume the one-shot no-halt request. */
			td->td_flags &= ~TDF_IDLE_NOHLT;
			splz();
#ifdef SMP
			__asm __volatile("sti; pause");
#else
			__asm __volatile("sti");
#endif
			++cpu_idle_spincnt;
		}
	}
}

#ifdef SMP

/*
 * This routine is called when the only runnable threads require
 * the MP lock, and the scheduler couldn't get it.  On a real cpu
 * we let the scheduler spin.
 */
void
cpu_mplock_contested(void)
{
	cpu_pause();
}

/*
 * This routine is called if a spinlock has been held through the
 * exponential backoff period and is seriously contested.  On a real cpu
 * we let it spin.
 */
void
cpu_spinlock_contested(void)
{
	cpu_pause();
}

#endif

/*
 * Clear registers on exec
 *
 * entry      - loaded into %rip
 * stack      - initial user stack; also passed in %rdi
 * ps_strings - passed in %rbx
 *
 * Frees the user LDT, rebuilds the trapframe from scratch, resets the
 * debug registers if they were in use, zeroes the fs/gs bases (PCB,
 * per-cpu cache and MSRs), reinitializes the npx and resets the PCB
 * segment selectors to the user data selector.
 */
void
exec_setregs(u_long entry, u_long stack, u_long ps_strings)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct pcb *pcb = td->td_pcb;
	struct trapframe *regs = lp->lwp_md.md_regs;

	/* was i386_user_cleanup() in NetBSD */
	user_ldt_free(pcb);

	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = entry;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; /* align the stack */
	regs->tf_rdi = stack;		/* argv */
	/*
	 * The trapframe was zeroed just above, so the
	 * (regs->tf_rflags & PSL_T) term evaluates to 0 here.
	 */
	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_rbx = ps_strings;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0; /* JG set bit 10? */
		if (pcb == td->td_pcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		pcb->pcb_flags &= ~PCB_DBREGS;
	}

	/*
	 * Initialize the math emulator (if any) for the current process.
	 * Actually, just clear the bit that says that the emulator has
	 * been initialized.  Initialization is delayed until the process
	 * traps to the emulator (if it is done at all) mainly because
	 * emulators don't provide an entry point for initialization.
	 */
	pcb->pcb_flags &= ~FP_SOFTFP;

	/*
	 * NOTE: do not set CR0_TS here.  npxinit() must do it after clearing
	 *	 gd_npxthread.  Otherwise a preemptive interrupt thread
	 *	 may panic in npxdna().
	 */
	crit_enter();
	load_cr0(rcr0() | CR0_MP);

	/*
	 * NOTE: The MSR values must be correct so we can return to
	 *	 userland.  gd_user_fs/gs must be correct so the switch
	 *	 code knows what the current MSR values are.
	 */
	pcb->pcb_fsbase = 0;	/* Values loaded from PCB on switch */
	pcb->pcb_gsbase = 0;
	mdcpu->gd_user_fs = 0;	/* Cache of current MSR values */
	mdcpu->gd_user_gs = 0;
	wrmsr(MSR_FSBASE, 0);	/* Set MSR values for return to userland */
	wrmsr(MSR_KGSBASE, 0);

	/* Initialize the npx (if any) for the current process. */
	npxinit(__INITIAL_NPXCW__);
	crit_exit();

	pcb->pcb_ds = _udatasel;
	pcb->pcb_es = _udatasel;
	pcb->pcb_fs = _udatasel;
	pcb->pcb_gs = _udatasel;
}

/*
 * Set the NE, MP, TS, WP and AM bits in %cr0 and load %gs with the
 * user data selector.
 */
void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	cr0 |= CR0_NE;			/* Done by npxinit() */
	cr0 |= CR0_MP | CR0_TS;		/* Done at every execve() too. */
	cr0 |= CR0_WP | CR0_AM;
	load_cr0(cr0);
	load_gs(_udatasel);
}

/*
 * machdep.adjkerntz handler: defers to sysctl_handle_int() on the oid's
 * own arg1/arg2 and, when that succeeds and a new value was supplied
 * (req->newptr), calls resettodr().
 */
static int
sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
{
	int error;
	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
		req);
	if (!error && req->newptr)
		resettodr();
	return (error);
}

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
	&adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
	CTLFLAG_RW, &disable_rtc_set, 0, "");

#if JG
SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
	CTLFLAG_RD, &bootinfo, bootinfo, "");
#endif

SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
	CTLFLAG_RW, &wall_cmos_clock, 0, "");

extern u_long bootdev;		/* not a cdev_t - encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
	CTLFLAG_RD, &bootdev, 0, "Boot device (not in cdev_t format)");

/*
 * Initialize 386 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

int _default_ldt;
struct user_segment_descriptor gdt[NGDT * MAXCPU];	/* global descriptor table */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
#if JG
union descriptor ldt[NLDT];		/* local descriptor table */
#endif

/* table descriptors - used to load tables by cpu */
struct region_descriptor r_gdt, r_idt;

#if defined(I586_CPU) && 
/*
 * software prototypes -- in more palatable form
 *
 * Template for the global descriptor table, one entry per selector
 * index.  At boot each entry is packed into the hardware-format gdt[]
 * by ssdtosd(), except for the TSS slot (GPROC0_SEL and the slot that
 * follows it), which is packed by ssdtosyssd() after its base address
 * has been filled in at run time.
 *
 * Initializers are positional; per the field comments the order is:
 * base, limit, type, privilege level, present, long, def32, granularity.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	0x0,			/* segment base address  */
	0x0,			/* limit */
	0,			/* segment type */
	0,			/* segment descriptor privilege level */
	0,			/* segment descriptor present */
	0,			/* long */
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GCODE_SEL	1 Code Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* limit - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_KPL,		/* segment descriptor privilege level */
	1,			/* segment descriptor present */
	1,			/* long */
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GDATA_SEL	2 Data Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* limit - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_KPL,		/* segment descriptor privilege level */
	1,			/* segment descriptor present */
	1,			/* long */
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUCODE32_SEL	3 32 bit Code Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* limit - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* segment descriptor privilege level */
	1,			/* segment descriptor present */
	0,			/* long */
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUDATA_SEL	4 32/64 bit Data Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* limit - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor privilege level */
	1,			/* segment descriptor present */
	0,			/* long */
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUCODE_SEL	5 64 bit Code Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* limit - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* segment descriptor privilege level */
	1,			/* segment descriptor present */
	1,			/* long */
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GPROC0_SEL	6 Proc 0 Tss Descriptor */
{
	0x0,			/* segment base address (set at boot) */
	sizeof(struct x86_64tss)-1,/* limit - size of the TSS, minus one */
	SDT_SYSTSS,		/* segment type */
	SEL_KPL,		/* segment descriptor privilege level */
	1,			/* segment descriptor present */
	0,			/* long */
	0,			/* unused - default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* Actually, the TSS is a system descriptor which is double size */
{	0x0,			/* segment base address  */
	0x0,			/* limit */
	0,			/* segment type */
	0,			/* segment descriptor privilege level */
	0,			/* segment descriptor present */
	0,			/* long */
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GUGS32_SEL	8 32 bit GS Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* limit - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor privilege level */
	1,			/* segment descriptor present */
	0,			/* long */
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
};
GSEL(GCODE_SEL, SEL_KPL); 1244 ip->gd_ist = ist; 1245 ip->gd_xx = 0; 1246 ip->gd_type = typ; 1247 ip->gd_dpl = dpl; 1248 ip->gd_p = 1; 1249 ip->gd_hioffset = ((uintptr_t)func)>>16 ; 1250 } 1251 1252 #define IDTVEC(name) __CONCAT(X,name) 1253 1254 extern inthand_t 1255 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), 1256 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), 1257 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), 1258 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), 1259 IDTVEC(xmm), IDTVEC(dblfault), 1260 IDTVEC(fast_syscall), IDTVEC(fast_syscall32); 1261 1262 #ifdef DEBUG_INTERRUPTS 1263 extern inthand_t *Xrsvdary[256]; 1264 #endif 1265 1266 void 1267 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd) 1268 { 1269 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; 1270 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; 1271 ssd->ssd_type = sd->sd_type; 1272 ssd->ssd_dpl = sd->sd_dpl; 1273 ssd->ssd_p = sd->sd_p; 1274 ssd->ssd_def32 = sd->sd_def32; 1275 ssd->ssd_gran = sd->sd_gran; 1276 } 1277 1278 void 1279 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd) 1280 { 1281 1282 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 1283 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff; 1284 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 1285 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 1286 sd->sd_type = ssd->ssd_type; 1287 sd->sd_dpl = ssd->ssd_dpl; 1288 sd->sd_p = ssd->ssd_p; 1289 sd->sd_long = ssd->ssd_long; 1290 sd->sd_def32 = ssd->ssd_def32; 1291 sd->sd_gran = ssd->ssd_gran; 1292 } 1293 1294 void 1295 ssdtosyssd(struct soft_segment_descriptor *ssd, 1296 struct system_segment_descriptor *sd) 1297 { 1298 1299 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 1300 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful; 1301 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 1302 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 1303 sd->sd_type = ssd->ssd_type; 1304 
sd->sd_dpl = ssd->ssd_dpl; 1305 sd->sd_p = ssd->ssd_p; 1306 sd->sd_gran = ssd->ssd_gran; 1307 } 1308 1309 u_int basemem; 1310 1311 /* 1312 * Populate the (physmap) array with base/bound pairs describing the 1313 * available physical memory in the system, then test this memory and 1314 * build the phys_avail array describing the actually-available memory. 1315 * 1316 * If we cannot accurately determine the physical memory map, then use 1317 * value from the 0xE801 call, and failing that, the RTC. 1318 * 1319 * Total memory size may be set by the kernel environment variable 1320 * hw.physmem or the compile-time define MAXMEM. 1321 * 1322 * XXX first should be vm_paddr_t. 1323 */ 1324 static void 1325 getmemsize(caddr_t kmdp, u_int64_t first) 1326 { 1327 int i, off, physmap_idx, pa_indx, da_indx; 1328 vm_paddr_t pa, physmap[PHYSMAP_SIZE]; 1329 u_long physmem_tunable; 1330 pt_entry_t *pte; 1331 struct bios_smap *smapbase, *smap, *smapend; 1332 u_int32_t smapsize; 1333 quad_t dcons_addr, dcons_size; 1334 1335 bzero(physmap, sizeof(physmap)); 1336 basemem = 0; 1337 physmap_idx = 0; 1338 1339 /* 1340 * get memory map from INT 15:E820, kindly supplied by the loader. 1341 * 1342 * subr_module.c says: 1343 * "Consumer may safely assume that size value precedes data." 1344 * ie: an int32_t immediately precedes smap. 
1345 */ 1346 smapbase = (struct bios_smap *)preload_search_info(kmdp, 1347 MODINFO_METADATA | MODINFOMD_SMAP); 1348 if (smapbase == NULL) 1349 panic("No BIOS smap info from loader!"); 1350 1351 smapsize = *((u_int32_t *)smapbase - 1); 1352 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); 1353 1354 for (smap = smapbase; smap < smapend; smap++) { 1355 if (boothowto & RB_VERBOSE) 1356 kprintf("SMAP type=%02x base=%016lx len=%016lx\n", 1357 smap->type, smap->base, smap->length); 1358 1359 if (smap->type != SMAP_TYPE_MEMORY) 1360 continue; 1361 1362 if (smap->length == 0) 1363 continue; 1364 1365 for (i = 0; i <= physmap_idx; i += 2) { 1366 if (smap->base < physmap[i + 1]) { 1367 if (boothowto & RB_VERBOSE) 1368 kprintf( 1369 "Overlapping or non-monotonic memory region, ignoring second region\n"); 1370 continue; 1371 } 1372 } 1373 1374 if (smap->base == physmap[physmap_idx + 1]) { 1375 physmap[physmap_idx + 1] += smap->length; 1376 continue; 1377 } 1378 1379 physmap_idx += 2; 1380 if (physmap_idx == PHYSMAP_SIZE) { 1381 kprintf( 1382 "Too many segments in the physical address map, giving up\n"); 1383 break; 1384 } 1385 physmap[physmap_idx] = smap->base; 1386 physmap[physmap_idx + 1] = smap->base + smap->length; 1387 } 1388 1389 /* 1390 * Find the 'base memory' segment for SMP 1391 */ 1392 basemem = 0; 1393 for (i = 0; i <= physmap_idx; i += 2) { 1394 if (physmap[i] == 0x00000000) { 1395 basemem = physmap[i + 1] / 1024; 1396 break; 1397 } 1398 } 1399 if (basemem == 0) 1400 panic("BIOS smap did not include a basemem segment!"); 1401 1402 #ifdef SMP 1403 /* make hole for AP bootstrap code */ 1404 physmap[1] = mp_bootaddress(physmap[1] / 1024); 1405 1406 /* look for the MP hardware - needed for apic addresses */ 1407 mp_probe(); 1408 #endif 1409 1410 /* 1411 * Maxmem isn't the "maximum memory", it's one larger than the 1412 * highest page of the physical address space. It should be 1413 * called something like "Maxphyspage". 
We may adjust this 1414 * based on ``hw.physmem'' and the results of the memory test. 1415 */ 1416 Maxmem = atop(physmap[physmap_idx + 1]); 1417 1418 #ifdef MAXMEM 1419 Maxmem = MAXMEM / 4; 1420 #endif 1421 1422 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) 1423 Maxmem = atop(physmem_tunable); 1424 1425 /* 1426 * Don't allow MAXMEM or hw.physmem to extend the amount of memory 1427 * in the system. 1428 */ 1429 if (Maxmem > atop(physmap[physmap_idx + 1])) 1430 Maxmem = atop(physmap[physmap_idx + 1]); 1431 1432 if (atop(physmap[physmap_idx + 1]) != Maxmem && 1433 (boothowto & RB_VERBOSE)) 1434 kprintf("Physical memory use set to %ldK\n", Maxmem * 4); 1435 1436 /* call pmap initialization to make new kernel address space */ 1437 pmap_bootstrap(&first); 1438 1439 /* 1440 * Size up each available chunk of physical memory. 1441 */ 1442 physmap[0] = PAGE_SIZE; /* mask off page 0 */ 1443 pa_indx = 0; 1444 da_indx = 1; 1445 phys_avail[pa_indx++] = physmap[0]; 1446 phys_avail[pa_indx] = physmap[0]; 1447 dump_avail[da_indx] = physmap[0]; 1448 pte = CMAP1; 1449 1450 /* 1451 * Get dcons buffer address 1452 */ 1453 if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 || 1454 kgetenv_quad("dcons.size", &dcons_size) == 0) 1455 dcons_addr = 0; 1456 1457 /* 1458 * physmap is in bytes, so when converting to page boundaries, 1459 * round up the start address and round down the end address. 1460 */ 1461 for (i = 0; i <= physmap_idx; i += 2) { 1462 vm_paddr_t end; 1463 1464 end = ptoa((vm_paddr_t)Maxmem); 1465 if (physmap[i + 1] < end) 1466 end = trunc_page(physmap[i + 1]); 1467 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { 1468 int tmp, page_bad, full; 1469 int *ptr = (int *)CADDR1; 1470 1471 full = FALSE; 1472 /* 1473 * block out kernel memory as not available. 
1474 */ 1475 if (pa >= 0x100000 && pa < first) 1476 goto do_dump_avail; 1477 1478 /* 1479 * block out dcons buffer 1480 */ 1481 if (dcons_addr > 0 1482 && pa >= trunc_page(dcons_addr) 1483 && pa < dcons_addr + dcons_size) 1484 goto do_dump_avail; 1485 1486 page_bad = FALSE; 1487 1488 /* 1489 * map page into kernel: valid, read/write,non-cacheable 1490 */ 1491 *pte = pa | PG_V | PG_RW | PG_N; 1492 cpu_invltlb(); 1493 1494 tmp = *(int *)ptr; 1495 /* 1496 * Test for alternating 1's and 0's 1497 */ 1498 *(volatile int *)ptr = 0xaaaaaaaa; 1499 if (*(volatile int *)ptr != 0xaaaaaaaa) 1500 page_bad = TRUE; 1501 /* 1502 * Test for alternating 0's and 1's 1503 */ 1504 *(volatile int *)ptr = 0x55555555; 1505 if (*(volatile int *)ptr != 0x55555555) 1506 page_bad = TRUE; 1507 /* 1508 * Test for all 1's 1509 */ 1510 *(volatile int *)ptr = 0xffffffff; 1511 if (*(volatile int *)ptr != 0xffffffff) 1512 page_bad = TRUE; 1513 /* 1514 * Test for all 0's 1515 */ 1516 *(volatile int *)ptr = 0x0; 1517 if (*(volatile int *)ptr != 0x0) 1518 page_bad = TRUE; 1519 /* 1520 * Restore original value. 1521 */ 1522 *(int *)ptr = tmp; 1523 1524 /* 1525 * Adjust array of valid/good pages. 1526 */ 1527 if (page_bad == TRUE) 1528 continue; 1529 /* 1530 * If this good page is a continuation of the 1531 * previous set of good pages, then just increase 1532 * the end pointer. Otherwise start a new chunk. 1533 * Note that "end" points one higher than end, 1534 * making the range >= start and < end. 1535 * If we're also doing a speculative memory 1536 * test and we at or past the end, bump up Maxmem 1537 * so that we keep going. The first bad page 1538 * will terminate the loop. 
1539 */ 1540 if (phys_avail[pa_indx] == pa) { 1541 phys_avail[pa_indx] += PAGE_SIZE; 1542 } else { 1543 pa_indx++; 1544 if (pa_indx == PHYS_AVAIL_ARRAY_END) { 1545 kprintf( 1546 "Too many holes in the physical address space, giving up\n"); 1547 pa_indx--; 1548 full = TRUE; 1549 goto do_dump_avail; 1550 } 1551 phys_avail[pa_indx++] = pa; /* start */ 1552 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ 1553 } 1554 physmem++; 1555 do_dump_avail: 1556 if (dump_avail[da_indx] == pa) { 1557 dump_avail[da_indx] += PAGE_SIZE; 1558 } else { 1559 da_indx++; 1560 if (da_indx == DUMP_AVAIL_ARRAY_END) { 1561 da_indx--; 1562 goto do_next; 1563 } 1564 dump_avail[da_indx++] = pa; /* start */ 1565 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ 1566 } 1567 do_next: 1568 if (full) 1569 break; 1570 } 1571 } 1572 *pte = 0; 1573 cpu_invltlb(); 1574 1575 /* 1576 * XXX 1577 * The last chunk must contain at least one page plus the message 1578 * buffer to avoid complicating other code (message buffer address 1579 * calculation, etc.). 1580 */ 1581 while (phys_avail[pa_indx - 1] + PAGE_SIZE + 1582 round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) { 1583 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); 1584 phys_avail[pa_indx--] = 0; 1585 phys_avail[pa_indx--] = 0; 1586 } 1587 1588 Maxmem = atop(phys_avail[pa_indx]); 1589 1590 /* Trim off space for the message buffer. */ 1591 phys_avail[pa_indx] -= round_page(MSGBUF_SIZE); 1592 1593 avail_end = phys_avail[pa_indx]; 1594 1595 /* Map the message buffer. 
*/ 1596 for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) 1597 pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + 1598 off); 1599 } 1600 1601 /* 1602 * IDT VECTORS: 1603 * 0 Divide by zero 1604 * 1 Debug 1605 * 2 NMI 1606 * 3 BreakPoint 1607 * 4 OverFlow 1608 * 5 Bound-Range 1609 * 6 Invalid OpCode 1610 * 7 Device Not Available (x87) 1611 * 8 Double-Fault 1612 * 9 Coprocessor Segment overrun (unsupported, reserved) 1613 * 10 Invalid-TSS 1614 * 11 Segment not present 1615 * 12 Stack 1616 * 13 General Protection 1617 * 14 Page Fault 1618 * 15 Reserved 1619 * 16 x87 FP Exception pending 1620 * 17 Alignment Check 1621 * 18 Machine Check 1622 * 19 SIMD floating point 1623 * 20-31 reserved 1624 * 32-255 INTn/external sources 1625 */ 1626 u_int64_t 1627 hammer_time(u_int64_t modulep, u_int64_t physfree) 1628 { 1629 caddr_t kmdp; 1630 int gsel_tss, x; 1631 #if JG 1632 int metadata_missing, off; 1633 #endif 1634 struct mdglobaldata *gd; 1635 u_int64_t msr; 1636 char *env; 1637 1638 #if JG 1639 /* 1640 * This must be done before the first references 1641 * to CPU_prvspace[0] are made. 1642 */ 1643 init_paging(&physfree); 1644 #endif 1645 1646 /* 1647 * Prevent lowering of the ipl if we call tsleep() early. 1648 */ 1649 gd = &CPU_prvspace[0].mdglobaldata; 1650 bzero(gd, sizeof(*gd)); 1651 1652 /* 1653 * Note: on both UP and SMP curthread must be set non-NULL 1654 * early in the boot sequence because the system assumes 1655 * that 'curthread' is never NULL. 
1656 */ 1657 1658 gd->mi.gd_curthread = &thread0; 1659 thread0.td_gd = &gd->mi; 1660 1661 atdevbase = ISA_HOLE_START + PTOV_OFFSET; 1662 1663 #if JG 1664 metadata_missing = 0; 1665 if (bootinfo.bi_modulep) { 1666 preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; 1667 preload_bootstrap_relocate(KERNBASE); 1668 } else { 1669 metadata_missing = 1; 1670 } 1671 if (bootinfo.bi_envp) 1672 kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; 1673 #endif 1674 1675 preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET); 1676 preload_bootstrap_relocate(PTOV_OFFSET); 1677 kmdp = preload_search_by_type("elf kernel"); 1678 if (kmdp == NULL) 1679 kmdp = preload_search_by_type("elf64 kernel"); 1680 boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); 1681 kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET; 1682 #ifdef DDB 1683 ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); 1684 ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); 1685 #endif 1686 1687 /* 1688 * start with one cpu. Note: with one cpu, ncpus2_shift, ncpus2_mask, 1689 * and ncpus_fit_mask remain 0. 
1690 */ 1691 ncpus = 1; 1692 ncpus2 = 1; 1693 ncpus_fit = 1; 1694 /* Init basic tunables, hz etc */ 1695 init_param1(); 1696 1697 /* 1698 * make gdt memory segments 1699 */ 1700 gdt_segs[GPROC0_SEL].ssd_base = 1701 (uintptr_t) &CPU_prvspace[0].mdglobaldata.gd_common_tss; 1702 1703 gd->mi.gd_prvspace = &CPU_prvspace[0]; 1704 1705 for (x = 0; x < NGDT; x++) { 1706 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1)) 1707 ssdtosd(&gdt_segs[x], &gdt[x]); 1708 } 1709 ssdtosyssd(&gdt_segs[GPROC0_SEL], 1710 (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); 1711 1712 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; 1713 r_gdt.rd_base = (long) gdt; 1714 lgdt(&r_gdt); 1715 1716 wrmsr(MSR_FSBASE, 0); /* User value */ 1717 wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi); 1718 wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */ 1719 1720 mi_gdinit(&gd->mi, 0); 1721 cpu_gdinit(gd, 0); 1722 proc0paddr = proc0paddr_buff; 1723 mi_proc0init(&gd->mi, proc0paddr); 1724 safepri = TDPRI_MAX; 1725 1726 /* spinlocks and the BGL */ 1727 init_locks(); 1728 1729 /* exceptions */ 1730 for (x = 0; x < NIDT; x++) 1731 setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); 1732 setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0); 1733 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0); 1734 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1); 1735 setidt(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0); 1736 setidt(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0); 1737 setidt(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0); 1738 setidt(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0); 1739 setidt(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0); 1740 setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1); 1741 setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0); 1742 setidt(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0); 1743 setidt(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0); 1744 setidt(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0); 1745 setidt(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0); 1746 
setidt(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0); 1747 setidt(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0); 1748 setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0); 1749 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0); 1750 setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0); 1751 1752 r_idt.rd_limit = sizeof(idt0) - 1; 1753 r_idt.rd_base = (long) idt; 1754 lidt(&r_idt); 1755 1756 /* 1757 * Initialize the console before we print anything out. 1758 */ 1759 cninit(); 1760 1761 #if JG 1762 if (metadata_missing) 1763 kprintf("WARNING: loader(8) metadata is missing!\n"); 1764 #endif 1765 1766 #if NISA >0 1767 isa_defaultirq(); 1768 #endif 1769 rand_initialize(); 1770 1771 #ifdef DDB 1772 kdb_init(); 1773 if (boothowto & RB_KDB) 1774 Debugger("Boot flags requested debugger"); 1775 #endif 1776 1777 #if JG 1778 finishidentcpu(); /* Final stage of CPU initialization */ 1779 setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 1780 setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); 1781 #endif 1782 identify_cpu(); /* Final stage of CPU initialization */ 1783 initializecpu(); /* Initialize CPU registers */ 1784 1785 /* make an initial tss so cpu can get interrupt stack on syscall! 
*/ 1786 gd->gd_common_tss.tss_rsp0 = 1787 (register_t)(thread0.td_kstack + 1788 KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb)); 1789 /* Ensure the stack is aligned to 16 bytes */ 1790 gd->gd_common_tss.tss_rsp0 &= ~0xFul; 1791 gd->gd_rsp0 = gd->gd_common_tss.tss_rsp0; 1792 1793 /* doublefault stack space, runs on ist1 */ 1794 gd->gd_common_tss.tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)]; 1795 1796 /* Set the IO permission bitmap (empty due to tss seg limit) */ 1797 gd->gd_common_tss.tss_iobase = sizeof(struct x86_64tss); 1798 1799 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 1800 gd->gd_tss_gdt = &gdt[GPROC0_SEL]; 1801 gd->gd_common_tssd = *gd->gd_tss_gdt; 1802 ltr(gsel_tss); 1803 1804 /* Set up the fast syscall stuff */ 1805 msr = rdmsr(MSR_EFER) | EFER_SCE; 1806 wrmsr(MSR_EFER, msr); 1807 wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall)); 1808 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); 1809 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | 1810 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); 1811 wrmsr(MSR_STAR, msr); 1812 wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D); 1813 1814 getmemsize(kmdp, physfree); 1815 init_param2(physmem); 1816 1817 /* now running on new page tables, configured,and u/iom is accessible */ 1818 1819 /* Map the message buffer. 
*/ 1820 #if JG 1821 for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) 1822 pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off); 1823 #endif 1824 1825 msgbufinit(msgbufp, MSGBUF_SIZE); 1826 1827 1828 /* transfer to user mode */ 1829 1830 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); 1831 _udatasel = GSEL(GUDATA_SEL, SEL_UPL); 1832 _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL); 1833 1834 load_ds(_udatasel); 1835 load_es(_udatasel); 1836 load_fs(_udatasel); 1837 1838 /* setup proc 0's pcb */ 1839 thread0.td_pcb->pcb_flags = 0; 1840 thread0.td_pcb->pcb_cr3 = KPML4phys; 1841 thread0.td_pcb->pcb_ext = 0; 1842 lwp0.lwp_md.md_regs = &proc0_tf; 1843 env = kgetenv("kernelname"); 1844 if (env != NULL) 1845 strlcpy(kernelname, env, sizeof(kernelname)); 1846 1847 /* Location of kernel stack for locore */ 1848 return ((u_int64_t)thread0.td_pcb); 1849 } 1850 1851 /* 1852 * Initialize machine-dependant portions of the global data structure. 1853 * Note that the global data area and cpu0's idlestack in the private 1854 * data space were allocated in locore. 1855 * 1856 * Note: the idlethread's cpl is 0 1857 * 1858 * WARNING! Called from early boot, 'mycpu' may not work yet. 
1859 */ 1860 void 1861 cpu_gdinit(struct mdglobaldata *gd, int cpu) 1862 { 1863 if (cpu) 1864 gd->mi.gd_curthread = &gd->mi.gd_idlethread; 1865 1866 lwkt_init_thread(&gd->mi.gd_idlethread, 1867 gd->mi.gd_prvspace->idlestack, 1868 sizeof(gd->mi.gd_prvspace->idlestack), 1869 TDF_MPSAFE, &gd->mi); 1870 lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu); 1871 gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch; 1872 gd->mi.gd_idlethread.td_sp -= sizeof(void *); 1873 *(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore; 1874 } 1875 1876 int 1877 is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr) 1878 { 1879 if (saddr >= (vm_offset_t)&CPU_prvspace[0] && 1880 eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) { 1881 return (TRUE); 1882 } 1883 return (FALSE); 1884 } 1885 1886 struct globaldata * 1887 globaldata_find(int cpu) 1888 { 1889 KKASSERT(cpu >= 0 && cpu < ncpus); 1890 return(&CPU_prvspace[cpu].mdglobaldata.mi); 1891 } 1892 1893 #if defined(I586_CPU) && !defined(NO_F00F_HACK) 1894 static void f00f_hack(void *unused); 1895 SYSINIT(f00f_hack, SI_BOOT2_BIOS, SI_ORDER_ANY, f00f_hack, NULL); 1896 1897 static void 1898 f00f_hack(void *unused) 1899 { 1900 struct gate_descriptor *new_idt; 1901 vm_offset_t tmp; 1902 1903 if (!has_f00f_bug) 1904 return; 1905 1906 kprintf("Intel Pentium detected, installing workaround for F00F bug\n"); 1907 1908 r_idt.rd_limit = sizeof(idt0) - 1; 1909 1910 tmp = kmem_alloc(&kernel_map, PAGE_SIZE * 2); 1911 if (tmp == 0) 1912 panic("kmem_alloc returned 0"); 1913 if (((unsigned int)tmp & (PAGE_SIZE-1)) != 0) 1914 panic("kmem_alloc returned non-page-aligned memory"); 1915 /* Put the first seven entries in the lower page */ 1916 new_idt = (struct gate_descriptor*)(tmp + PAGE_SIZE - (7*8)); 1917 bcopy(idt, new_idt, sizeof(idt0)); 1918 r_idt.rd_base = (int)new_idt; 1919 lidt(&r_idt); 1920 idt = new_idt; 1921 if (vm_map_protect(&kernel_map, tmp, tmp + PAGE_SIZE, 1922 VM_PROT_READ, FALSE) != KERN_SUCCESS) 1923 panic("vm_map_protect failed"); 1924 
return; 1925 } 1926 #endif /* defined(I586_CPU) && !NO_F00F_HACK */ 1927 1928 int 1929 ptrace_set_pc(struct lwp *lp, unsigned long addr) 1930 { 1931 lp->lwp_md.md_regs->tf_rip = addr; 1932 return (0); 1933 } 1934 1935 int 1936 ptrace_single_step(struct lwp *lp) 1937 { 1938 lp->lwp_md.md_regs->tf_rflags |= PSL_T; 1939 return (0); 1940 } 1941 1942 int 1943 fill_regs(struct lwp *lp, struct reg *regs) 1944 { 1945 struct pcb *pcb; 1946 struct trapframe *tp; 1947 1948 tp = lp->lwp_md.md_regs; 1949 bcopy(&tp->tf_rdi, ®s->r_rdi, sizeof(*regs)); 1950 1951 pcb = lp->lwp_thread->td_pcb; 1952 return (0); 1953 } 1954 1955 int 1956 set_regs(struct lwp *lp, struct reg *regs) 1957 { 1958 struct pcb *pcb; 1959 struct trapframe *tp; 1960 1961 tp = lp->lwp_md.md_regs; 1962 if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) || 1963 !CS_SECURE(regs->r_cs)) 1964 return (EINVAL); 1965 bcopy(®s->r_rdi, &tp->tf_rdi, sizeof(*regs)); 1966 pcb = lp->lwp_thread->td_pcb; 1967 return (0); 1968 } 1969 1970 #ifndef CPU_DISABLE_SSE 1971 static void 1972 fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87) 1973 { 1974 struct env87 *penv_87 = &sv_87->sv_env; 1975 struct envxmm *penv_xmm = &sv_xmm->sv_env; 1976 int i; 1977 1978 /* FPU control/status */ 1979 penv_87->en_cw = penv_xmm->en_cw; 1980 penv_87->en_sw = penv_xmm->en_sw; 1981 penv_87->en_tw = penv_xmm->en_tw; 1982 penv_87->en_fip = penv_xmm->en_fip; 1983 penv_87->en_fcs = penv_xmm->en_fcs; 1984 penv_87->en_opcode = penv_xmm->en_opcode; 1985 penv_87->en_foo = penv_xmm->en_foo; 1986 penv_87->en_fos = penv_xmm->en_fos; 1987 1988 /* FPU registers */ 1989 for (i = 0; i < 8; ++i) 1990 sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc; 1991 1992 sv_87->sv_ex_sw = sv_xmm->sv_ex_sw; 1993 } 1994 1995 static void 1996 set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm) 1997 { 1998 struct env87 *penv_87 = &sv_87->sv_env; 1999 struct envxmm *penv_xmm = &sv_xmm->sv_env; 2000 int i; 2001 2002 /* FPU control/status */ 2003 penv_xmm->en_cw = 
penv_87->en_cw; 2004 penv_xmm->en_sw = penv_87->en_sw; 2005 penv_xmm->en_tw = penv_87->en_tw; 2006 penv_xmm->en_fip = penv_87->en_fip; 2007 penv_xmm->en_fcs = penv_87->en_fcs; 2008 penv_xmm->en_opcode = penv_87->en_opcode; 2009 penv_xmm->en_foo = penv_87->en_foo; 2010 penv_xmm->en_fos = penv_87->en_fos; 2011 2012 /* FPU registers */ 2013 for (i = 0; i < 8; ++i) 2014 sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i]; 2015 2016 sv_xmm->sv_ex_sw = sv_87->sv_ex_sw; 2017 } 2018 #endif /* CPU_DISABLE_SSE */ 2019 2020 int 2021 fill_fpregs(struct lwp *lp, struct fpreg *fpregs) 2022 { 2023 #ifndef CPU_DISABLE_SSE 2024 if (cpu_fxsr) { 2025 fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm, 2026 (struct save87 *)fpregs); 2027 return (0); 2028 } 2029 #endif /* CPU_DISABLE_SSE */ 2030 bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs); 2031 return (0); 2032 } 2033 2034 int 2035 set_fpregs(struct lwp *lp, struct fpreg *fpregs) 2036 { 2037 #ifndef CPU_DISABLE_SSE 2038 if (cpu_fxsr) { 2039 set_fpregs_xmm((struct save87 *)fpregs, 2040 &lp->lwp_thread->td_pcb->pcb_save.sv_xmm); 2041 return (0); 2042 } 2043 #endif /* CPU_DISABLE_SSE */ 2044 bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs); 2045 return (0); 2046 } 2047 2048 int 2049 fill_dbregs(struct lwp *lp, struct dbreg *dbregs) 2050 { 2051 if (lp == NULL) { 2052 dbregs->dr[0] = rdr0(); 2053 dbregs->dr[1] = rdr1(); 2054 dbregs->dr[2] = rdr2(); 2055 dbregs->dr[3] = rdr3(); 2056 dbregs->dr[4] = rdr4(); 2057 dbregs->dr[5] = rdr5(); 2058 dbregs->dr[6] = rdr6(); 2059 dbregs->dr[7] = rdr7(); 2060 } else { 2061 struct pcb *pcb; 2062 2063 pcb = lp->lwp_thread->td_pcb; 2064 dbregs->dr[0] = pcb->pcb_dr0; 2065 dbregs->dr[1] = pcb->pcb_dr1; 2066 dbregs->dr[2] = pcb->pcb_dr2; 2067 dbregs->dr[3] = pcb->pcb_dr3; 2068 dbregs->dr[4] = 0; 2069 dbregs->dr[5] = 0; 2070 dbregs->dr[6] = pcb->pcb_dr6; 2071 dbregs->dr[7] = pcb->pcb_dr7; 2072 } 2073 return (0); 2074 } 2075 2076 int 2077 set_dbregs(struct lwp *lp, 
struct dbreg *dbregs) 2078 { 2079 if (lp == NULL) { 2080 load_dr0(dbregs->dr[0]); 2081 load_dr1(dbregs->dr[1]); 2082 load_dr2(dbregs->dr[2]); 2083 load_dr3(dbregs->dr[3]); 2084 load_dr4(dbregs->dr[4]); 2085 load_dr5(dbregs->dr[5]); 2086 load_dr6(dbregs->dr[6]); 2087 load_dr7(dbregs->dr[7]); 2088 } else { 2089 struct pcb *pcb; 2090 struct ucred *ucred; 2091 int i; 2092 uint64_t mask1, mask2; 2093 2094 /* 2095 * Don't let an illegal value for dr7 get set. Specifically, 2096 * check for undefined settings. Setting these bit patterns 2097 * result in undefined behaviour and can lead to an unexpected 2098 * TRCTRAP. 2099 */ 2100 /* JG this loop looks unreadable */ 2101 /* Check 4 2-bit fields for invalid patterns. 2102 * These fields are R/Wi, for i = 0..3 2103 */ 2104 /* Is 10 in LENi allowed when running in compatibility mode? */ 2105 /* Pattern 10 in R/Wi might be used to indicate 2106 * breakpoint on I/O. Further analysis should be 2107 * carried to decide if it is safe and useful to 2108 * provide access to that capability 2109 */ 2110 for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4; 2111 i++, mask1 <<= 4, mask2 <<= 4) 2112 if ((dbregs->dr[7] & mask1) == mask2) 2113 return (EINVAL); 2114 2115 pcb = lp->lwp_thread->td_pcb; 2116 ucred = lp->lwp_proc->p_ucred; 2117 2118 /* 2119 * Don't let a process set a breakpoint that is not within the 2120 * process's address space. If a process could do this, it 2121 * could halt the system by setting a breakpoint in the kernel 2122 * (if ddb was enabled). Thus, we need to check to make sure 2123 * that no breakpoints are being enabled for addresses outside 2124 * process's address space, unless, perhaps, we were called by 2125 * uid 0. 2126 * 2127 * XXX - what about when the watched area of the user's 2128 * address space is written into from within the kernel 2129 * ... wouldn't that still cause a breakpoint to be generated 2130 * from within kernel mode? 
2131 */ 2132 2133 if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) { 2134 if (dbregs->dr[7] & 0x3) { 2135 /* dr0 is enabled */ 2136 if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS) 2137 return (EINVAL); 2138 } 2139 2140 if (dbregs->dr[7] & (0x3<<2)) { 2141 /* dr1 is enabled */ 2142 if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS) 2143 return (EINVAL); 2144 } 2145 2146 if (dbregs->dr[7] & (0x3<<4)) { 2147 /* dr2 is enabled */ 2148 if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS) 2149 return (EINVAL); 2150 } 2151 2152 if (dbregs->dr[7] & (0x3<<6)) { 2153 /* dr3 is enabled */ 2154 if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS) 2155 return (EINVAL); 2156 } 2157 } 2158 2159 pcb->pcb_dr0 = dbregs->dr[0]; 2160 pcb->pcb_dr1 = dbregs->dr[1]; 2161 pcb->pcb_dr2 = dbregs->dr[2]; 2162 pcb->pcb_dr3 = dbregs->dr[3]; 2163 pcb->pcb_dr6 = dbregs->dr[6]; 2164 pcb->pcb_dr7 = dbregs->dr[7]; 2165 2166 pcb->pcb_flags |= PCB_DBREGS; 2167 } 2168 2169 return (0); 2170 } 2171 2172 /* 2173 * Return > 0 if a hardware breakpoint has been hit, and the 2174 * breakpoint was in user space. Return 0, otherwise. 
2175 */ 2176 int 2177 user_dbreg_trap(void) 2178 { 2179 u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */ 2180 u_int64_t bp; /* breakpoint bits extracted from dr6 */ 2181 int nbp; /* number of breakpoints that triggered */ 2182 caddr_t addr[4]; /* breakpoint addresses */ 2183 int i; 2184 2185 dr7 = rdr7(); 2186 if ((dr7 & 0xff) == 0) { 2187 /* 2188 * all GE and LE bits in the dr7 register are zero, 2189 * thus the trap couldn't have been caused by the 2190 * hardware debug registers 2191 */ 2192 return 0; 2193 } 2194 2195 nbp = 0; 2196 dr6 = rdr6(); 2197 bp = dr6 & 0xf; 2198 2199 if (bp == 0) { 2200 /* 2201 * None of the breakpoint bits are set meaning this 2202 * trap was not caused by any of the debug registers 2203 */ 2204 return 0; 2205 } 2206 2207 /* 2208 * at least one of the breakpoints were hit, check to see 2209 * which ones and if any of them are user space addresses 2210 */ 2211 2212 if (bp & 0x01) { 2213 addr[nbp++] = (caddr_t)rdr0(); 2214 } 2215 if (bp & 0x02) { 2216 addr[nbp++] = (caddr_t)rdr1(); 2217 } 2218 if (bp & 0x04) { 2219 addr[nbp++] = (caddr_t)rdr2(); 2220 } 2221 if (bp & 0x08) { 2222 addr[nbp++] = (caddr_t)rdr3(); 2223 } 2224 2225 for (i=0; i<nbp; i++) { 2226 if (addr[i] < 2227 (caddr_t)VM_MAX_USER_ADDRESS) { 2228 /* 2229 * addr[i] is in user space 2230 */ 2231 return nbp; 2232 } 2233 } 2234 2235 /* 2236 * None of the breakpoints are in user space. 2237 */ 2238 return 0; 2239 } 2240 2241 2242 #ifndef DDB 2243 void 2244 Debugger(const char *msg) 2245 { 2246 kprintf("Debugger(\"%s\") called.\n", msg); 2247 } 2248 #endif /* no DDB */ 2249 2250 #ifdef DDB 2251 2252 /* 2253 * Provide inb() and outb() as functions. They are normally only 2254 * available as macros calling inlined functions, thus cannot be 2255 * called inside DDB. 2256 * 2257 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined. 
2258 */ 2259 2260 #undef inb 2261 #undef outb 2262 2263 /* silence compiler warnings */ 2264 u_char inb(u_int); 2265 void outb(u_int, u_char); 2266 2267 u_char 2268 inb(u_int port) 2269 { 2270 u_char data; 2271 /* 2272 * We use %%dx and not %1 here because i/o is done at %dx and not at 2273 * %edx, while gcc generates inferior code (movw instead of movl) 2274 * if we tell it to load (u_short) port. 2275 */ 2276 __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); 2277 return (data); 2278 } 2279 2280 void 2281 outb(u_int port, u_char data) 2282 { 2283 u_char al; 2284 /* 2285 * Use an unnecessary assignment to help gcc's register allocator. 2286 * This make a large difference for gcc-1.40 and a tiny difference 2287 * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for 2288 * best results. gcc-2.6.0 can't handle this. 2289 */ 2290 al = data; 2291 __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); 2292 } 2293 2294 #endif /* DDB */ 2295 2296 2297 2298 #include "opt_cpu.h" 2299 2300 2301 /* 2302 * initialize all the SMP locks 2303 */ 2304 2305 /* critical region when masking or unmasking interupts */ 2306 struct spinlock_deprecated imen_spinlock; 2307 2308 /* Make FAST_INTR() routines sequential */ 2309 struct spinlock_deprecated fast_intr_spinlock; 2310 2311 /* critical region for old style disable_intr/enable_intr */ 2312 struct spinlock_deprecated mpintr_spinlock; 2313 2314 /* critical region around INTR() routines */ 2315 struct spinlock_deprecated intr_spinlock; 2316 2317 /* lock region used by kernel profiling */ 2318 struct spinlock_deprecated mcount_spinlock; 2319 2320 /* locks com (tty) data/hardware accesses: a FASTINTR() */ 2321 struct spinlock_deprecated com_spinlock; 2322 2323 /* locks kernel kprintfs */ 2324 struct spinlock_deprecated cons_spinlock; 2325 2326 /* lock regions around the clock hardware */ 2327 struct spinlock_deprecated clock_spinlock; 2328 2329 /* lock around the MP rendezvous */ 2330 struct spinlock_deprecated 
smp_rv_spinlock; 2331 2332 static void 2333 init_locks(void) 2334 { 2335 /* 2336 * mp_lock = 0; BSP already owns the MP lock 2337 */ 2338 /* 2339 * Get the initial mp_lock with a count of 1 for the BSP. 2340 * This uses a LOGICAL cpu ID, ie BSP == 0. 2341 */ 2342 #ifdef SMP 2343 cpu_get_initial_mplock(); 2344 #endif 2345 /* DEPRECATED */ 2346 spin_lock_init(&mcount_spinlock); 2347 spin_lock_init(&fast_intr_spinlock); 2348 spin_lock_init(&intr_spinlock); 2349 spin_lock_init(&mpintr_spinlock); 2350 spin_lock_init(&imen_spinlock); 2351 spin_lock_init(&smp_rv_spinlock); 2352 spin_lock_init(&com_spinlock); 2353 spin_lock_init(&clock_spinlock); 2354 spin_lock_init(&cons_spinlock); 2355 2356 /* our token pool needs to work early */ 2357 lwkt_token_pool_init(); 2358 } 2359 2360