1 /*- 2 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 3 * Copyright (c) 1992 Terrence R. Lambert. 4 * Copyright (c) 2003 Peter Wemm. 5 * Copyright (c) 2008-2017 The DragonFly Project. 6 * All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * William Jolitz. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the University of 22 * California, Berkeley and its contributors. 23 * 4. Neither the name of the University nor the names of its contributors 24 * may be used to endorse or promote products derived from this software 25 * without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 37 * SUCH DAMAGE. 38 * 39 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 40 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $ 41 */ 42 43 //#include "use_npx.h" 44 #include "use_isa.h" 45 #include "opt_cpu.h" 46 #include "opt_ddb.h" 47 #include "opt_inet.h" 48 #include "opt_msgbuf.h" 49 #include "opt_swap.h" 50 51 #include <sys/param.h> 52 #include <sys/systm.h> 53 #include <sys/sysmsg.h> 54 #include <sys/signalvar.h> 55 #include <sys/kernel.h> 56 #include <sys/linker.h> 57 #include <sys/malloc.h> 58 #include <sys/proc.h> 59 #include <sys/priv.h> 60 #include <sys/buf.h> 61 #include <sys/reboot.h> 62 #include <sys/mbuf.h> 63 #include <sys/msgbuf.h> 64 #include <sys/sysent.h> 65 #include <sys/sysctl.h> 66 #include <sys/vmmeter.h> 67 #include <sys/bus.h> 68 #include <sys/usched.h> 69 #include <sys/reg.h> 70 #include <sys/sbuf.h> 71 #include <sys/ctype.h> 72 #include <sys/serialize.h> 73 #include <sys/systimer.h> 74 75 #include <vm/vm.h> 76 #include <vm/vm_param.h> 77 #include <sys/lock.h> 78 #include <vm/vm_kern.h> 79 #include <vm/vm_object.h> 80 #include <vm/vm_page.h> 81 #include <vm/vm_map.h> 82 #include <vm/vm_pager.h> 83 #include <vm/vm_extern.h> 84 85 #include <sys/thread2.h> 86 #include <sys/mplock2.h> 87 88 #include <sys/exec.h> 89 #include <sys/cons.h> 90 91 #include <sys/efi.h> 92 93 #include <ddb/ddb.h> 94 95 #include <machine/cpu.h> 96 #include <machine/clock.h> 97 #include <machine/specialreg.h> 98 #if 0 /* JG */ 
99 #include <machine/bootinfo.h> 100 #endif 101 #include <machine/md_var.h> 102 #include <machine/metadata.h> 103 #include <machine/pc/bios.h> 104 #include <machine/pcb_ext.h> 105 #include <machine/globaldata.h> /* CPU_prvspace */ 106 #include <machine/smp.h> 107 #include <machine/cputypes.h> 108 #include <machine/intr_machdep.h> 109 #include <machine/framebuffer.h> 110 111 #ifdef OLD_BUS_ARCH 112 #include <bus/isa/isa_device.h> 113 #endif 114 #include <machine_base/isa/isa_intr.h> 115 #include <bus/isa/rtc.h> 116 #include <sys/random.h> 117 #include <sys/ptrace.h> 118 #include <machine/sigframe.h> 119 120 #include <sys/machintr.h> 121 #include <machine_base/icu/icu_abi.h> 122 #include <machine_base/icu/elcr_var.h> 123 #include <machine_base/apic/lapic.h> 124 #include <machine_base/apic/ioapic.h> 125 #include <machine_base/apic/ioapic_abi.h> 126 #include <machine/mptable.h> 127 128 #define PHYSMAP_ENTRIES 10 129 #define MAXBUFSTRUCTSIZE ((size_t)512 * 1024 * 1024) 130 131 extern u_int64_t hammer_time(u_int64_t, u_int64_t); 132 133 extern void printcpuinfo(void); /* XXX header file */ 134 extern void identify_cpu(void); 135 extern void panicifcpuunsupported(void); 136 137 static void cpu_startup(void *); 138 static void pic_finish(void *); 139 static void cpu_finish(void *); 140 141 static void set_fpregs_xmm(struct save87 *, struct savexmm *); 142 static void fill_fpregs_xmm(struct savexmm *, struct save87 *); 143 static void init_locks(void); 144 145 extern void pcpu_timer_always(struct intrframe *); 146 147 SYSINIT(cpu, SI_BOOT2_START_CPU, SI_ORDER_FIRST, cpu_startup, NULL); 148 SYSINIT(pic_finish, SI_BOOT2_FINISH_PIC, SI_ORDER_FIRST, pic_finish, NULL); 149 SYSINIT(cpu_finish, SI_BOOT2_FINISH_CPU, SI_ORDER_FIRST, cpu_finish, NULL); 150 151 #ifdef DDB 152 extern vm_offset_t ksym_start, ksym_end; 153 #endif 154 155 struct privatespace CPU_prvspace_bsp __aligned(4096); 156 struct privatespace *CPU_prvspace[MAXCPU] = { &CPU_prvspace_bsp }; 157 158 vm_paddr_t 
efi_systbl_phys; 159 int _udatasel, _ucodesel, _ucode32sel; 160 u_long atdevbase; 161 int64_t tsc_offsets[MAXCPU]; 162 cpumask_t smp_idleinvl_mask; 163 cpumask_t smp_idleinvl_reqs; 164 165 /* MWAIT hint (EAX) or CPU_MWAIT_HINT_ */ 166 __read_mostly static int cpu_mwait_halt_global; 167 __read_mostly static int clock_debug1; 168 __read_mostly static int flame_poll_debug; 169 170 SYSCTL_INT(_debug, OID_AUTO, flame_poll_debug, 171 CTLFLAG_RW, &flame_poll_debug, 0, ""); 172 TUNABLE_INT("debug.flame_poll_debug", &flame_poll_debug); 173 174 #if defined(SWTCH_OPTIM_STATS) 175 extern int swtch_optim_stats; 176 SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats, 177 CTLFLAG_RD, &swtch_optim_stats, 0, ""); 178 SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, 179 CTLFLAG_RD, &tlb_flush_count, 0, ""); 180 #endif 181 SYSCTL_INT(_debug, OID_AUTO, clock_debug1, 182 CTLFLAG_RW, &clock_debug1, 0, ""); 183 SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_halt, 184 CTLFLAG_RD, &cpu_mwait_halt_global, 0, ""); 185 SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_spin, 186 CTLFLAG_RD, &cpu_mwait_spin, 0, "monitor/mwait target state"); 187 188 #define CPU_MWAIT_HAS_CX \ 189 ((cpu_feature2 & CPUID2_MON) && \ 190 (cpu_mwait_feature & CPUID_MWAIT_EXT)) 191 192 #define CPU_MWAIT_CX_NAMELEN 16 193 194 #define CPU_MWAIT_C1 1 195 #define CPU_MWAIT_C2 2 196 #define CPU_MWAIT_C3 3 197 #define CPU_MWAIT_CX_MAX 8 198 199 #define CPU_MWAIT_HINT_AUTO -1 /* C1 and C2 */ 200 #define CPU_MWAIT_HINT_AUTODEEP -2 /* C3+ */ 201 202 SYSCTL_NODE(_machdep, OID_AUTO, mwait, CTLFLAG_RW, 0, "MWAIT features"); 203 SYSCTL_NODE(_machdep_mwait, OID_AUTO, CX, CTLFLAG_RW, 0, "MWAIT Cx settings"); 204 205 struct cpu_mwait_cx { 206 int subcnt; 207 char name[4]; 208 struct sysctl_ctx_list sysctl_ctx; 209 struct sysctl_oid *sysctl_tree; 210 }; 211 static struct cpu_mwait_cx cpu_mwait_cx_info[CPU_MWAIT_CX_MAX]; 212 static char cpu_mwait_cx_supported[256]; 213 214 static int cpu_mwait_c1_hints_cnt; 215 static int cpu_mwait_hints_cnt; 216 static int 
*cpu_mwait_hints;

static int cpu_mwait_deep_hints_cnt;
static int *cpu_mwait_deep_hints;

#define CPU_IDLE_REPEAT_DEFAULT	750

static u_int cpu_idle_repeat = CPU_IDLE_REPEAT_DEFAULT;
static u_long cpu_idle_repeat_max = CPU_IDLE_REPEAT_DEFAULT;
static u_int cpu_mwait_repeat_shift = 1;

#define CPU_MWAIT_C3_PREAMBLE_BM_ARB	0x1
#define CPU_MWAIT_C3_PREAMBLE_BM_STS	0x2

/*
 * Extra work required before entering C3+ states; both bits are assumed
 * required until cpu_mwait_attach() clears them for CPUs that do not
 * need the bus-master workarounds.
 */
static int cpu_mwait_c3_preamble =
	CPU_MWAIT_C3_PREAMBLE_BM_ARB |
	CPU_MWAIT_C3_PREAMBLE_BM_STS;

SYSCTL_STRING(_machdep_mwait_CX, OID_AUTO, supported, CTLFLAG_RD,
    cpu_mwait_cx_supported, 0, "MWAIT supported C states");
SYSCTL_INT(_machdep_mwait_CX, OID_AUTO, c3_preamble, CTLFLAG_RD,
    &cpu_mwait_c3_preamble, 0, "C3+ preamble mask");

static int	cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS,
		    int *, boolean_t);
static int	cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int	cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int	cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS);

SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, idle, CTLTYPE_STRING|CTLFLAG_RW,
    NULL, 0, cpu_mwait_cx_idle_sysctl, "A", "");
SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, spin, CTLTYPE_STRING|CTLFLAG_RW,
    NULL, 0, cpu_mwait_cx_spin_sysctl, "A", "");
SYSCTL_UINT(_machdep_mwait_CX, OID_AUTO, repeat_shift, CTLFLAG_RW,
    &cpu_mwait_repeat_shift, 0, "");

long physmem = 0;		/* total physical memory, in pages */

u_long ebda_addr = 0;

int imcr_present = 0;

int naps = 0;	/* # of Applications processors */

u_int base_memory;

/*
 * sysctl hw.physmem: report total physical memory in bytes.  Read-only;
 * any new value supplied by userland is ignored.
 */
static int
sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
{
	u_long pmem = ctob(physmem);
	int error;

	error = sysctl_handle_long(oidp, &pmem, 0, req);

	return (error);
}

SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG|CTLFLAG_RD,
    0, 0, sysctl_hw_physmem, "LU",
    "Total system memory in bytes (number of pages * page size)");

/*
 * sysctl hw.usermem: physical memory not wired down by the kernel,
 * in bytes.  Read-only.
 */
static int
sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
{
	u_long usermem = ctob(physmem - vmstats.v_wire_count);
	int error;

	error = sysctl_handle_long(oidp, &usermem, 0, req);

	return (error);
}

SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_ULONG|CTLFLAG_RD,
    0, 0, sysctl_hw_usermem, "LU", "");

/*
 * sysctl hw.availpages: number of pages in [avail_start, avail_end).
 * Read-only.
 */
static int
sysctl_hw_availpages(SYSCTL_HANDLER_ARGS)
{
	int error;
	u_long availpages;

	availpages = x86_64_btop(avail_end - avail_start);
	error = sysctl_handle_long(oidp, &availpages, 0, req);

	return (error);
}

SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_ULONG|CTLFLAG_RD,
    0, 0, sysctl_hw_availpages, "LU", "");

vm_paddr_t Maxmem;
vm_paddr_t Realmem;

/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
vm_phystable_t phys_avail[VM_PHYSSEG_MAX + 1];
vm_phystable_t dump_avail[VM_PHYSSEG_MAX + 1];

/* must be 1 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END (NELEM(phys_avail) - 1)
#define DUMP_AVAIL_ARRAY_END (NELEM(dump_avail) - 1)

static vm_offset_t buffer_sva, buffer_eva;
vm_offset_t clean_sva, clean_eva;
static vm_offset_t pager_sva, pager_eva;
static struct trapframe proc0_tf;

static void cpu_implement_smap(void);

/*
 * Boot-time CPU startup (SI_BOOT2_START_CPU).
 *
 * Prints banner/CPU information, sizes the buffer cache (nbuf and the
 * swap buffer counts) subject to several caps, and carves the clean,
 * buffer and pager submaps out of kernel_map.  The sizing is done with
 * a classic two-pass valloc scheme: pass one measures, pass two assigns
 * addresses into the kmem_alloc()ed region.
 */
static void
cpu_startup(void *dummy)
{
	caddr_t v;
	vm_size_t size = 0;
	vm_offset_t firstaddr;

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	kprintf("%s", version);
	startrtclock();
	printcpuinfo();
	panicifcpuunsupported();
	/* Enable SMAP handling only if the CPU advertises the feature */
	if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
		cpu_implement_smap();

	kprintf("real memory = %ju (%ju MB)\n",
		(intmax_t)Realmem,
		(intmax_t)Realmem / 1024 / 1024);
	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		kprintf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx].phys_end != 0; ++indx) {
			vm_paddr_t size1;

			size1 = phys_avail[indx].phys_end -
				phys_avail[indx].phys_beg;

			kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
				(intmax_t)phys_avail[indx].phys_beg,
				(intmax_t)phys_avail[indx].phys_end - 1,
				(intmax_t)size1,
				(intmax_t)(size1 / PAGE_SIZE));
		}
	}

	/*
	 * Allocate space for system data structures.
	 * The first available kernel virtual address is in "v".
	 * As pages of kernel virtual memory are allocated, "v" is incremented.
	 * As pages of memory are allocated and cleared,
	 * "firstaddr" is incremented.
	 * An index into the kernel page table corresponding to the
	 * virtual memory address maintained in "v" is kept in "mapaddr".
	 */

	/*
	 * Make two passes.  The first pass calculates how much memory is
	 * needed and allocates it.  The second pass assigns virtual
	 * addresses to the various data structures.
	 */
	firstaddr = 0;
again:
	v = (caddr_t)firstaddr;

#define	valloc(name, type, num) \
	    (name) = (type *)v; v = (caddr_t)((name)+(num))
#define	valloclim(name, type, num, lim) \
	    (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))

	/*
	 * Calculate nbuf such that maxbufspace uses approximately 1/20
	 * of physical memory by default, with a minimum of 50 buffers.
	 *
	 * The calculation is made after discounting 128MB.
	 *
	 * NOTE: maxbufspace is (nbuf * NBUFCALCSIZE) (NBUFCALCSIZE ~= 16KB).
	 *	 nbuf = (kbytes / factor) would cover all of memory.
	 */
	if (nbuf == 0) {
		long factor = NBUFCALCSIZE / 1024;		/* KB/nbuf */
		long kbytes = physmem * (PAGE_SIZE / 1024);	/* physmem */

		nbuf = 50;
		if (kbytes > 128 * 1024)
			nbuf += (kbytes - 128 * 1024) / (factor * 20);
		if (maxbcache && nbuf > maxbcache / NBUFCALCSIZE)
			nbuf = maxbcache / NBUFCALCSIZE;
		/* Cap the struct buf array itself at MAXBUFSTRUCTSIZE bytes */
		if ((size_t)nbuf * sizeof(struct buf) > MAXBUFSTRUCTSIZE) {
			kprintf("Warning: nbuf capped at %ld due to the "
				"reasonability limit\n", nbuf);
			nbuf = MAXBUFSTRUCTSIZE / sizeof(struct buf);
		}
	}

	/*
	 * Do not allow the buffer_map to be more then 1/2 the size of the
	 * kernel_map.
	 */
	if (nbuf > (virtual_end - virtual_start +
		    virtual2_end - virtual2_start) / (MAXBSIZE * 2)) {
		nbuf = (virtual_end - virtual_start +
			virtual2_end - virtual2_start) / (MAXBSIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to kvm\n", nbuf);
	}

	/*
	 * Do not allow the buffer_map to use more than 50% of available
	 * physical-equivalent memory.  Since the VM pages which back
	 * individual buffers are typically wired, having too many bufs
	 * can prevent the system from paging properly.
	 */
	if (nbuf > physmem * PAGE_SIZE / (NBUFCALCSIZE * 2)) {
		nbuf = physmem * PAGE_SIZE / (NBUFCALCSIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to physmem\n", nbuf);
	}

	/*
	 * Do not allow the sizeof(struct buf) * nbuf to exceed 1/4 of
	 * the valloc space which is just the virtual_end - virtual_start
	 * section.  This is typically ~2GB regardless of the amount of
	 * memory, so we use 500MB as a metric.
	 *
	 * This is because we use valloc() to allocate the buf header array.
	 *
	 * NOTE: buffer space in bytes is limited by vfs.*bufspace sysctls.
	 */
	if (nbuf > (virtual_end - virtual_start) / (sizeof(struct buf) * 4)) {
		nbuf = (virtual_end - virtual_start) /
		       (sizeof(struct buf) * 4);
		kprintf("Warning: nbufs capped at %ld due to "
			"valloc considerations\n",
			nbuf);
	}

	/* Swap buffers: memory-backed clamped to [8, 512], kva to [16, 512] */
	nswbuf_mem = lmax(lmin(nbuf / 32, 512), 8);
#ifdef NSWBUF_MIN
	if (nswbuf_mem < NSWBUF_MIN)
		nswbuf_mem = NSWBUF_MIN;
#endif
	nswbuf_kva = lmax(lmin(nbuf / 4, 512), 16);
#ifdef NSWBUF_MIN
	if (nswbuf_kva < NSWBUF_MIN)
		nswbuf_kva = NSWBUF_MIN;
#endif

	valloc(swbuf_mem, struct buf, nswbuf_mem);
	valloc(swbuf_kva, struct buf, nswbuf_kva);
	valloc(buf, struct buf, nbuf);

	/*
	 * End of first pass, size has been calculated so allocate memory
	 */
	if (firstaddr == 0) {
		size = (vm_size_t)(v - firstaddr);
		firstaddr = kmem_alloc(kernel_map, round_page(size),
				       VM_SUBSYS_BUF);
		if (firstaddr == 0)
			panic("startup: no room for tables");
		goto again;
	}

	/*
	 * End of second pass, addresses have been assigned
	 *
	 * nbuf is an int, make sure we don't overflow the field.
	 *
	 * On 64-bit systems we always reserve maximal allocations for
	 * buffer cache buffers and there are no fragmentation issues,
	 * so the KVA segment does not have to be excessively oversized.
	 */
	if ((vm_size_t)(v - firstaddr) != size)
		panic("startup: table size inconsistency");

	kmem_suballoc(kernel_map, clean_map, &clean_sva, &clean_eva,
		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE) +
		      ((nswbuf_mem + nswbuf_kva) * MAXPHYS) + pager_map_size);
	kmem_suballoc(clean_map, buffer_map, &buffer_sva, &buffer_eva,
		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE));
	buffer_map->system_map = 1;
	kmem_suballoc(clean_map, pager_map, &pager_sva, &pager_eva,
		      ((vm_offset_t)(nswbuf_mem + nswbuf_kva) * MAXPHYS) +
		      pager_map_size);
	pager_map->system_map = 1;
	kprintf("avail memory = %ju (%ju MB)\n",
		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages),
		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages) /
		1024 / 1024);
}

/*
 * Per-cpu idle statistics, cache-line aligned to avoid false sharing.
 */
struct cpu_idle_stat {
	int	hint;			/* current MWAIT hint */
	int	reserved;
	u_long	halt;			/* # of halt-based idles */
	u_long	spin;			/* # of spin-based idles */
	u_long	repeat;
	u_long	repeat_last;
	u_long	repeat_delta;
	u_long	mwait_cx[CPU_MWAIT_CX_MAX]; /* per-Cx entry counts */
} __cachealign;

#define CPU_IDLE_STAT_HALT	-1
#define CPU_IDLE_STAT_SPIN	-2

static struct cpu_idle_stat	cpu_idle_stats[MAXCPU];

/*
 * Sysctl handler for idle counters.  arg2 selects the counter:
 * CPU_IDLE_STAT_HALT, CPU_IDLE_STAT_SPIN or a Cx index.  Reads return
 * the sum over all cpus; writes zero every cpu's counter and store the
 * written value on cpu0.
 */
static int
sysctl_cpu_idle_cnt(SYSCTL_HANDLER_ARGS)
{
	int idx = arg2, cpu, error;
	u_long val = 0;

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].halt;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].spin;
	} else {
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].mwait_cx[idx];
	}

	error = sysctl_handle_quad(oidp, &val, 0, req);
	if (error || req->newptr == NULL)
		return error;

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].halt = 0;
		cpu_idle_stats[0].halt = val;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].spin = 0;
		cpu_idle_stats[0].spin = val;
	} else {
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].mwait_cx[idx] = 0;
		cpu_idle_stats[0].mwait_cx[idx] = val;
	}
	return 0;
}

/*
 * Probe MONITOR/MWAIT C-state support, build the machdep.mwait sysctl
 * tree, and construct the shallow (C1..C2) and deep (C1..Cmax) hint
 * tables used by the idle loop.  No-op if the CPU lacks extended MWAIT.
 */
static void
cpu_mwait_attach(void)
{
	struct sbuf sb;
	int hint_idx, i;

	if (!CPU_MWAIT_HAS_CX)
		return;

	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    (CPUID_TO_FAMILY(cpu_id) > 0xf ||
	     (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
	      CPUID_TO_MODEL(cpu_id) >= 0xf))) {
		int bm_sts = 1;

		/*
		 * Pentium dual-core, Core 2 and beyond do not need any
		 * additional activities to enter deep C-state, i.e. C3(+).
		 */
		cpu_mwait_cx_no_bmarb();

		TUNABLE_INT_FETCH("machdep.cpu.mwait.bm_sts", &bm_sts);
		if (!bm_sts)
			cpu_mwait_cx_no_bmsts();
	}

	sbuf_new(&sb, cpu_mwait_cx_supported,
	    sizeof(cpu_mwait_cx_supported), SBUF_FIXEDLEN);

	for (i = 0; i < CPU_MWAIT_CX_MAX; ++i) {
		struct cpu_mwait_cx *cx = &cpu_mwait_cx_info[i];
		int sub;

		ksnprintf(cx->name, sizeof(cx->name), "C%d", i);

		sysctl_ctx_init(&cx->sysctl_ctx);
		cx->sysctl_tree = SYSCTL_ADD_NODE(&cx->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait), OID_AUTO,
		    cx->name, CTLFLAG_RW, NULL, "Cx control/info");
		if (cx->sysctl_tree == NULL)
			continue;

		cx->subcnt = CPUID_MWAIT_CX_SUBCNT(cpu_mwait_extemu, i);
		SYSCTL_ADD_INT(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "subcnt", CTLFLAG_RD, &cx->subcnt, 0,
		    "sub-state count");
		SYSCTL_ADD_PROC(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "entered", (CTLTYPE_QUAD | CTLFLAG_RW), 0,
		    i, sysctl_cpu_idle_cnt, "Q", "# of times entered");

		for (sub = 0; sub < cx->subcnt; ++sub)
			sbuf_printf(&sb, "C%d/%d ", i, sub);
	}
	sbuf_trim(&sb);
	sbuf_finish(&sb);

	/*
	 * Non-deep C-states
	 */
	cpu_mwait_c1_hints_cnt = cpu_mwait_cx_info[CPU_MWAIT_C1].subcnt;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i)
		cpu_mwait_hints_cnt += cpu_mwait_cx_info[i].subcnt;
	cpu_mwait_hints = kmalloc(sizeof(int) * cpu_mwait_hints_cnt,
	    M_DEVBUF, M_WAITOK);

	hint_idx = 0;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i) {
		int j, subcnt;

		subcnt = cpu_mwait_cx_info[i].subcnt;
		for (j = 0; j < subcnt; ++j) {
			KASSERT(hint_idx < cpu_mwait_hints_cnt,
			    ("invalid mwait hint index %d", hint_idx));
			cpu_mwait_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
			++hint_idx;
		}
	}
	KASSERT(hint_idx == cpu_mwait_hints_cnt,
	    ("mwait hint count %d != index %d",
	     cpu_mwait_hints_cnt, hint_idx));

	if (bootverbose) {
		kprintf("MWAIT hints (%d C1 hints):\n", cpu_mwait_c1_hints_cnt);
		for (i = 0; i < cpu_mwait_hints_cnt; ++i) {
			int hint = cpu_mwait_hints[i];

			kprintf("  C%d/%d hint 0x%04x\n",
			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
			    hint);
		}
	}

	/*
	 * Deep C-states
	 */
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i)
		cpu_mwait_deep_hints_cnt += cpu_mwait_cx_info[i].subcnt;
	cpu_mwait_deep_hints = kmalloc(sizeof(int) * cpu_mwait_deep_hints_cnt,
	    M_DEVBUF, M_WAITOK);

	hint_idx = 0;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i) {
		int j, subcnt;

		subcnt = cpu_mwait_cx_info[i].subcnt;
		for (j = 0; j < subcnt; ++j) {
			KASSERT(hint_idx < cpu_mwait_deep_hints_cnt,
			    ("invalid mwait deep hint index %d", hint_idx));
			cpu_mwait_deep_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
			++hint_idx;
		}
	}
	KASSERT(hint_idx == cpu_mwait_deep_hints_cnt,
	    ("mwait deep hint count %d != index %d",
	     cpu_mwait_deep_hints_cnt, hint_idx));

	if (bootverbose) {
		kprintf("MWAIT deep hints:\n");
		for (i = 0; i < cpu_mwait_deep_hints_cnt; ++i) {
			int hint = cpu_mwait_deep_hints[i];

			kprintf("  C%d/%d hint 0x%04x\n",
			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
			    hint);
		}
	}
	cpu_idle_repeat_max = 256 * cpu_mwait_deep_hints_cnt;

	/* Per-cpu idle C-state selection sysctls */
	for (i = 0; i < ncpus; ++i) {
		char name[16];

		ksnprintf(name, sizeof(name), "idle%d", i);
		SYSCTL_ADD_PROC(NULL,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait_CX), OID_AUTO,
		    name, (CTLTYPE_STRING | CTLFLAG_RW), &cpu_idle_stats[i],
		    0, cpu_mwait_cx_pcpu_idle_sysctl, "A", "");
	}
}

/*
 * Final CPU configuration (SI_BOOT2_FINISH_CPU).
 */
static void
cpu_finish(void *dummy __unused)
{
	cpu_setregs();
	cpu_mwait_attach();
}

/*
 * Final interrupt-controller configuration (SI_BOOT2_FINISH_PIC).
 */
static void
pic_finish(void *dummy __unused)
{
	/* Log ELCR information */
	elcr_dump();

	/* Log MPTABLE information */
	mptable_pci_int_dump();

	/* Finalize PCI */
	MachIntrABI.finalize();
}

/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
void
sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = lp->lwp_proc;
	struct trapframe *regs;
	struct sigacts *psp = p->p_sigacts;
	struct sigframe sf, *sfp;
	int oonstack;
	char *sp;

	regs = lp->lwp_md.md_regs;
	oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;

	/* Save user context */
	bzero(&sf, sizeof(struct sigframe));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = lp->lwp_sigstk;
	sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
	/* mc_rdi must be the first trapframe field for the bcopy below */
	KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0);
	/* gcc errors out on optimized bcopy */
	_bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe));

	/* Make the size of the saved context visible to userland */
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);

	/* Allocate and validate space for the signal handler context. */
	if ((lp->lwp_flags & LWP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size -
		    sizeof(struct sigframe);
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	} else {
		/* We take red zone into account */
		sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
	}

	/*
	 * XXX AVX needs 64-byte alignment but sigframe has other fields and
	 * the embedded ucontext is not at the front, so aligning this won't
	 * help us.  Fortunately we bcopy in/out of the sigframe, so the
	 * kernel is ok.
	 *
	 * The problem though is if userland winds up trying to use the
	 * context directly.
	 */
	sfp = (struct sigframe *)((intptr_t)sp & ~(intptr_t)0xF);

	/* Translate the signal is appropriate */
	if (p->p_sysent->sv_sigtbl) {
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
	}

	/*
	 * Build the argument list for the signal handler.
	 *
	 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
	 */
	regs->tf_rdi = sig;				/* argument 1 */
	regs->tf_rdx = (register_t)&sfp->sf_uc;		/* argument 3 */

	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/*
		 * Signal handler installed with SA_SIGINFO.
		 *
		 * action(signo, siginfo, ucontext)
		 */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* fill siginfo structure */
		sf.sf_si.si_signo = sig;
		sf.sf_si.si_pid = psp->ps_frominfo[sig].pid;
		sf.sf_si.si_uid = psp->ps_frominfo[sig].uid;
		sf.sf_si.si_code = code;
		sf.sf_si.si_addr = (void *)regs->tf_addr;
	} else {
		/*
		 * Old FreeBSD-style arguments.
		 *
		 * handler (signo, code, [uc], addr)
		 */
		regs->tf_rsi = (register_t)code;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_handler = catcher;
	}

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
#if 0 /* JG */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}
#endif

	/*
	 * Save the FPU state and reinit the FP unit
	 */
	npxpush(&sf.sf_uc.uc_mcontext);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
		/*
		 * Something is wrong with the stack pointer.
		 * ...Kill the process.
		 */
		sigexit(lp, SIGILL);
	}

	regs->tf_rsp = (register_t)sfp;
	regs->tf_rip = trunc_page64(PS_STRINGS - *(p->p_sysent->sv_szsigcode));
	regs->tf_rip -= SZSIGCODE_EXTRA_BYTES;

	/*
	 * x86 abi specifies that the direction flag must be cleared
	 * on function entry
	 */
	regs->tf_rflags &= ~(PSL_T | PSL_D);

	/*
	 * 64 bit mode has a code and stack selector but
	 * no data or extra selector.  %fs and %gs are not
	 * stored in-context.
	 */
	regs->tf_cs = _ucodesel;
	regs->tf_ss = _udatasel;
	clear_quickret();
}

/*
 * Sanitize the trapframe for a virtual kernel passing control to a custom
 * VM context.  Remove any items that would otherwise create a privilage
 * issue.
 *
 * XXX at the moment we allow userland to set the resume flag.  Is this a
 * bad idea?
 */
int
cpu_sanitize_frame(struct trapframe *frame)
{
	frame->tf_cs = _ucodesel;
	frame->tf_ss = _udatasel;
	/* XXX VM (8086) mode not supported? */
	frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP);
	frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I;

	return(0);
}

/*
 * Sanitize the tls so loading the descriptor does not blow up
 * on us.  For x86_64 we don't have to do anything.
 */
int
cpu_sanitize_tls(struct savetls *tls)
{
	return(0);
}

/*
 * sigreturn(ucontext_t *sigcntxp)
 *
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig. Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */

/*
 * EFL_SECURE is true when the proposed rflags differs from the current
 * trapframe rflags only in bits userland is permitted to change
 * (PSL_USERCHANGE).  CS_SECURE is true when the code selector requests
 * user privilege (RPL == SEL_UPL).
 */
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)

int
sys_sigreturn(struct sysmsg *sysmsg, const struct sigreturn_args *uap)
{
	struct lwp *lp = curthread->td_lwp;
	struct trapframe *regs;
	ucontext_t uc;
	ucontext_t *ucp;
	register_t rflags;
	int cs;
	int error;

	/*
	 * We have to copy the information into kernel space so userland
	 * can't modify it while we are sniffing it.
	 */
	regs = lp->lwp_md.md_regs;
	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error)
		return (error);
	ucp = &uc;
	rflags = ucp->uc_mcontext.mc_rflags;

	/* VM (8086) mode not supported */
	rflags &= ~PSL_VM_UNSUPP;

#if 0 /* JG */
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (lp->lwp_thread->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* go back to user mode if both flags are set */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(lp, SIGBUS, 0);

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = tf->tf_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
		tf->tf_gs = _udatasel;
	} else
#endif
	{
		/*
		 * Don't allow users to change privileged or reserved flags.
		 */
		/*
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
			kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags);
			return(EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		cs = ucp->uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			kprintf("sigreturn: cs = 0x%x\n", cs);
			trapsignal(lp, SIGBUS, T_PROTFLT);
			return(EINVAL);
		}
		/* gcc errors out on optimized bcopy */
		_bcopy(&ucp->uc_mcontext.mc_rdi, regs,
		       sizeof(struct trapframe));
	}

	/*
	 * Restore the FPU state from the frame
	 */
	crit_enter();
	npxpop(&ucp->uc_mcontext);

	/* Track whether the handler ran on the alternate signal stack */
	if (ucp->uc_mcontext.mc_onstack & 1)
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	else
		lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;

	/* Restore the signal mask, never allowing SIGKILL/SIGSTOP to mask */
	lp->lwp_sigmask = ucp->uc_sigmask;
	SIG_CANTMASK(lp->lwp_sigmask);
	clear_quickret();
	crit_exit();
	return(EJUSTRETURN);
}

/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */
void
cpu_boot(int howto)
{
}

/*
 * Shutdown the CPU as much as possible
 */
void
cpu_halt(void)
{
	for (;;)
		__asm__ __volatile("hlt");
}

/*
 * cpu_idle() represents the idle LWKT.  You cannot return from this function
 * (unless you want to blow things up!).  Instead we look for runnable threads
 * and loop or halt as appropriate.  Giant is not held on entry to the thread.
 *
 * The main loop is entered with a critical section held, we must release
 * the critical section before doing anything else.  lwkt_switch() will
 * check for pending interrupts due to entering and exiting its own
 * critical section.
 *
 * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up.
 *	 However, there are cases where the idlethread will be entered with
 *	 the possibility that no IPI will occur and in such cases
 *	 lwkt_switch() sets TDF_IDLE_NOHLT.
 *
 * NOTE: cpu_idle_repeat determines how many entries into the idle thread
 *	 must occur before it starts using ACPI halt.
 *
 * NOTE: Value overridden in hammer_time().
 */
static int cpu_idle_hlt = 2;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_repeat, CTLFLAG_RW,
    &cpu_idle_repeat, 0, "Idle entries before acpi hlt");

SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_hltcnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_HALT, sysctl_cpu_idle_cnt, "Q", "Idle loop entry halts");
SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_spincnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_SPIN, sysctl_cpu_idle_cnt, "Q", "Idle loop entry spins");

/*
 * Default halt hook: enable interrupts and halt in a single sequence.
 */
static void
cpu_idle_default_hook(void)
{
	/*
	 * We must guarantee that hlt is exactly the instruction
	 * following the sti, so that a pending interrupt cannot slip
	 * in between and leave us halted with work to do.
	 */
	__asm __volatile("sti; hlt");
}

/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = cpu_idle_default_hook;

/*
 * Choose an MWAIT hint for the idle halt based on accumulated idle
 * statistics.  A fixed stat->hint (>= 0) is used verbatim; otherwise a
 * deeper C-state hint is selected the longer this cpu has remained idle
 * (stat->repeat*).  Also bumps the per-Cx usage counter for the chosen
 * hint.
 */
static __inline int
cpu_mwait_cx_hint(struct cpu_idle_stat *stat)
{
	int hint, cx_idx;
	u_int idx;

	hint = stat->hint;
	if (hint >= 0)
		goto done;

	idx = (stat->repeat + stat->repeat_last + stat->repeat_delta) >>
	    cpu_mwait_repeat_shift;
	if (idx >= cpu_mwait_c1_hints_cnt) {
		/* Step up faster, once we walked through all C1 states */
		stat->repeat_delta += 1 << (cpu_mwait_repeat_shift + 1);
	}
	if (hint == CPU_MWAIT_HINT_AUTODEEP) {
		if (idx >= cpu_mwait_deep_hints_cnt)
			idx = cpu_mwait_deep_hints_cnt - 1;
		hint = cpu_mwait_deep_hints[idx];
	} else {
		if (idx >= cpu_mwait_hints_cnt)
			idx = cpu_mwait_hints_cnt - 1;
		hint = cpu_mwait_hints[idx];
	}
done:
	cx_idx = MWAIT_EAX_TO_CX(hint);
	if (cx_idx >= 0 && cx_idx < CPU_MWAIT_CX_MAX)
		stat->mwait_cx[cx_idx]++;
	return hint;
}

void
cpu_idle(void)
{
	globaldata_t gd = mycpu;
	struct cpu_idle_stat *stat = &cpu_idle_stats[gd->gd_cpuid];
	struct thread *td __debugvar = gd->gd_curthread;
	int reqflags;

	stat->repeat = stat->repeat_last = cpu_idle_repeat_max;

	crit_exit();
	KKASSERT(td->td_critcount == 0);

	for (;;) {
		/*
		 * See if there are any LWKTs ready to go.
		 */
		lwkt_switch();

		/*
		 * When halting inside a cli we must check for reqflags
		 * races, particularly [re]schedule requests.  Running
		 * splz() does the job.
		 *
		 * cpu_idle_hlt:
		 *	0	Never halt, just spin
		 *
		 *	1	Always use MONITOR/MWAIT if avail, HLT
		 *		otherwise.
		 *
		 *		Better default for modern (Haswell+) Intel
		 *		cpus.
		 *
		 *	2	Use HLT/MONITOR/MWAIT up to a point and then
		 *		use the ACPI halt (default).  This is a hybrid
		 *		approach.  See machdep.cpu_idle_repeat.
		 *
		 *		Better default for modern AMD cpus and older
		 *		Intel cpus.
		 *
		 *	3	Always use the ACPI halt.  This typically
		 *		eats the least amount of power but the cpu
		 *		will be slow waking up.  Slows down e.g.
		 *		compiles and other pipe/event oriented stuff.
		 *
		 *		Usually the best default for AMD cpus.
		 *
		 *	4	Always use HLT.
		 *
		 *	5	Always spin.
		 *
		 * NOTE: Interrupts are enabled and we are not in a critical
		 *	 section.
		 *
		 * NOTE: Preemptions do not reset gd_idle_repeat.   Also we
		 *	 don't bother capping gd_idle_repeat, it is ok if
		 *	 it overflows (we do make it unsigned, however).
		 *
		 * Implement optimized invltlb operations when halted
		 * in idle.  By setting the bit in smp_idleinvl_mask
		 * we inform other cpus that they can set _reqs to
		 * request an invltlb.  Currently the code to do that
		 * sets the bits in _reqs anyway, but then check _mask
		 * to determine if they can assume the invltlb will execute.
		 *
		 * A critical section is required to ensure that interrupts
		 * do not fully run until after we've had a chance to execute
		 * the request.
		 */
		if (gd->gd_idle_repeat == 0) {
			stat->repeat = (stat->repeat + stat->repeat_last) >> 1;
			if (stat->repeat > cpu_idle_repeat_max)
				stat->repeat = cpu_idle_repeat_max;
			stat->repeat_last = 0;
			stat->repeat_delta = 0;
		}
		++stat->repeat_last;

		/*
		 * General idle thread halt code
		 *
		 * IBRS NOTES - IBRS is a SPECTRE mitigation.  When going
		 *		idle, disable IBRS to reduce hyperthread
		 *		overhead.
		 */
		++gd->gd_idle_repeat;

		switch(cpu_idle_hlt) {
		default:
		case 0:
			/*
			 * Always spin
			 */
			;
do_spin:
			splz();
			__asm __volatile("sti");
			stat->spin++;
			crit_enter_gd(gd);
			crit_exit_gd(gd);
			break;
		case 2:
			/*
			 * Use MONITOR/MWAIT (or HLT) for a few cycles,
			 * then start using the ACPI halt code if we
			 * continue to be idle.
			 */
			if (gd->gd_idle_repeat >= cpu_idle_repeat)
				goto do_acpi;
			/* FALL THROUGH */
		case 1:
			/*
			 * Always use MONITOR/MWAIT (will use HLT if
			 * MONITOR/MWAIT not available).
			 */
			if (cpu_mi_feature & CPU_MI_MONITOR) {
				splz(); /* XXX */
				reqflags = gd->gd_reqflags;
				if (reqflags & RQF_IDLECHECK_WK_MASK)
					goto do_spin;
				crit_enter_gd(gd);
				ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask, gd->gd_cpuid);
				/*
				 * IBRS/STIBP
				 */
				if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[1] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
				}
				cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
						  cpu_mwait_cx_hint(stat), 0);
				if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[0] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
				}
				stat->halt++;
				ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask, gd->gd_cpuid);
				if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
							      gd->gd_cpuid)) {
					cpu_invltlb();
					cpu_mfence();
				}
				crit_exit_gd(gd);
				break;
			}
			/* FALLTHROUGH */
		case 4:
			/*
			 * Use HLT
			 */
			__asm __volatile("cli");
			splz();
			crit_enter_gd(gd);
			if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
				ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask,
						     gd->gd_cpuid);
				if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL,
					      pscpu->trampoline.tr_pcb_spec_ctrl[1] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
				}
				cpu_idle_default_hook();
				if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[0] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
				}
				ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
						       gd->gd_cpuid);
				if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
							      gd->gd_cpuid)) {
					cpu_invltlb();
					cpu_mfence();
				}
			}
			__asm __volatile("sti");
			stat->halt++;
			crit_exit_gd(gd);
			break;
		case 3:
			/*
			 * Use ACPI halt
			 */
			;
do_acpi:
			__asm __volatile("cli");
			splz();
			crit_enter_gd(gd);
			if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
				ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask,
						     gd->gd_cpuid);
				if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[1] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
				}
				cpu_idle_hook();
				if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[0] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
				}
				ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
						       gd->gd_cpuid);
				if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
							      gd->gd_cpuid)) {
					cpu_invltlb();
					cpu_mfence();
				}
			}
			__asm __volatile("sti");
			stat->halt++;
			crit_exit_gd(gd);
			break;
		}
	}
}

/*
 * Called from deep ACPI via cpu_idle_hook() (see above) to actually halt
 * the cpu in C1.  ACPI might use other halt methods for deeper states
 * and not reach here.
 *
 * For now we always use HLT as we are not sure what ACPI may have actually
 * done.  MONITOR/MWAIT might not be appropriate.
 *
 * NOTE: MONITOR/MWAIT does not appear to throttle AMD cpus, while HLT
 *	 does.
 *	 On Intel, MONITOR/MWAIT does appear to throttle the cpu.
 */
void
cpu_idle_halt(void)
{
	globaldata_t gd;

	gd = mycpu;
#if 0
	/* DISABLED FOR NOW */
	struct cpu_idle_stat *stat;
	int reqflags;


	if ((cpu_idle_hlt == 1 || cpu_idle_hlt == 2) &&
	    (cpu_mi_feature & CPU_MI_MONITOR) &&
	    cpu_vendor_id != CPU_VENDOR_AMD) {
		/*
		 * Use MONITOR/MWAIT
		 *
		 * (NOTE: On ryzen, MWAIT does not throttle clocks, so we
		 *	  have to use HLT)
		 */
		stat = &cpu_idle_stats[gd->gd_cpuid];
		reqflags = gd->gd_reqflags;
		if ((reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
			__asm __volatile("sti");
			cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
			    cpu_mwait_cx_hint(stat), 0);
		} else {
			__asm __volatile("sti; pause");
		}
	} else
#endif
	{
		/*
		 * Use HLT.  Only halt if no work is pending; otherwise
		 * just re-enable interrupts and pause so the caller can
		 * pick the work up.
		 */
		if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0)
			__asm __volatile("sti; hlt");
		else
			__asm __volatile("sti; pause");
	}
}


/*
 * Called in a loop indirectly via Xcpustop.  Waits for this cpu's bit in
 * started_cpus to be set again, using MONITOR/MWAIT on the mask word when
 * available and plain HLT otherwise.
 */
void
cpu_smp_stopped(void)
{
	globaldata_t gd = mycpu;
	volatile __uint64_t *ptr;
	__uint64_t ovalue;

	ptr = CPUMASK_ADDR(started_cpus, gd->gd_cpuid);
	ovalue = *ptr;
	if ((ovalue & CPUMASK_SIMPLE(gd->gd_cpuid & 63)) == 0) {
		if (cpu_mi_feature & CPU_MI_MONITOR) {
			if (cpu_mwait_hints) {
				/* deepest configured C-state hint */
				cpu_mmw_pause_long(__DEVOLATILE(void *, ptr),
						   ovalue,
						   cpu_mwait_hints[
						       cpu_mwait_hints_cnt - 1],
						   0);
			} else {
				cpu_mmw_pause_long(__DEVOLATILE(void *, ptr),
						   ovalue, 0, 0);
			}
		} else {
			cpu_halt();	/* depend on lapic timer */
		}
	}
}

/*
 * This routine is called if a spinlock has been held through the
 * exponential backoff period and is seriously contested.  On a real cpu
 * we let it spin.
 */
void
cpu_spinlock_contested(void)
{
	cpu_pause();
}

/*
 * Clear registers on exec
 */
void
exec_setregs(u_long entry, u_long stack, u_long ps_strings)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct pcb *pcb = td->td_pcb;
	struct trapframe *regs = lp->lwp_md.md_regs;

	user_ldt_free(pcb);

	clear_quickret();
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = entry;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; /* align the stack */
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_rbx = ps_strings;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0; /* JG set bit 10? */
		if (pcb == td->td_pcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		pcb->pcb_flags &= ~PCB_DBREGS;
	}

	/*
	 * Initialize the math emulator (if any) for the current process.
	 * Actually, just clear the bit that says that the emulator has
	 * been initialized.  Initialization is delayed until the process
	 * traps to the emulator (if it is done at all) mainly because
	 * emulators don't provide an entry point for initialization.
	 */
	pcb->pcb_flags &= ~FP_SOFTFP;

	/*
	 * NOTE: do not set CR0_TS here.  npxinit() must do it after clearing
	 *	 gd_npxthread.  Otherwise a preemptive interrupt thread
	 *	 may panic in npxdna().
	 */
	crit_enter();
	load_cr0(rcr0() | CR0_MP);

	/*
	 * NOTE: The MSR values must be correct so we can return to
	 *	 userland.  gd_user_fs/gs must be correct so the switch
	 *	 code knows what the current MSR values are.
	 */
	pcb->pcb_fsbase = 0;	/* Values loaded from PCB on switch */
	pcb->pcb_gsbase = 0;
	mdcpu->gd_user_fs = 0;	/* Cache of current MSR values */
	mdcpu->gd_user_gs = 0;
	wrmsr(MSR_FSBASE, 0);	/* Set MSR values for return to userland */
	wrmsr(MSR_KGSBASE, 0);

	/* Initialize the npx (if any) for the current process. */
	npxinit();
	crit_exit();

	pcb->pcb_ds = _udatasel;
	pcb->pcb_es = _udatasel;
	pcb->pcb_fs = _udatasel;
	pcb->pcb_gs = _udatasel;
}

/*
 * Establish the baseline CR0 configuration for this cpu and load a user
 * %gs selector.
 */
void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	cr0 |= CR0_NE;			/* Done by npxinit() */
	cr0 |= CR0_MP | CR0_TS;		/* Done at every execve() too. */
	cr0 |= CR0_WP | CR0_AM;
	load_cr0(cr0);
	load_gs(_udatasel);
}

/*
 * Sysctl handler for machdep.adjkerntz.  A successful write also pushes
 * the new value out to the real-time clock via resettodr().
 */
static int
sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
{
	int error;
	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
		req);
	if (!error && req->newptr)
		resettodr();
	return (error);
}

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
    &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
    CTLFLAG_RW, &disable_rtc_set, 0, "");

#if 0 /* JG */
SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
    CTLFLAG_RD, &bootinfo, bootinfo, "");
#endif

SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
    CTLFLAG_RW, &wall_cmos_clock, 0, "");

/*
 * Export the raw EFI memory map passed in by the loader.  The 32-bit
 * size of the map is stored immediately before the header, hence the
 * "(uint32_t *)efihdr - 1" access.
 */
static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");

/*
 * Initialize x86 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

int _default_ldt;
struct user_segment_descriptor gdt[NGDT * MAXCPU];	/* global descriptor table */
struct gate_descriptor idt_arr[MAXCPU][NIDT];
#if 0 /* JG */
union descriptor ldt[NLDT];		/* local descriptor table */
#endif

/* table descriptors - used to load tables by cpu */
struct region_descriptor r_gdt;
struct region_descriptor r_idt_arr[MAXCPU];

/* JG proc0paddr is a virtual address */
void *proc0paddr;
/* JG alignment?
 */
char proc0paddr_buff[LWKT_THREAD_STACK];


/* software prototypes -- in more palatable form */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	0x0,			/* segment base address  */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0,			/* long */
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GCODE_SEL	1 Code Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_KPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	1,			/* long */
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GDATA_SEL	2 Data Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_KPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	1,			/* long */
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUCODE32_SEL	3 32 bit Code Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0,			/* long */
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUDATA_SEL	4 32/64 bit Data Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0,			/* long */
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUCODE_SEL	5 64 bit Code Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	1,			/* long */
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GPROC0_SEL	6 Proc 0 Tss Descriptor */
{
	0x0,			/* segment base address */
	sizeof(struct x86_64tss)-1,/* length - all address space */
	SDT_SYSTSS,		/* segment type */
	SEL_KPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0,			/* long */
	0,			/* unused - default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* Actually, the TSS is a system descriptor which is double size */
{	0x0,			/* segment base address  */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0,			/* long */
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GUGS32_SEL	8 32 bit GS Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0,			/* long */
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
};

/*
 * Install an interrupt gate for vector idx into every cpu's IDT.
 * func is the handler entry point, typ/dpl the gate type and privilege
 * level, and ist selects the interrupt stack table entry (0 = none).
 */
void
setidt_global(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	int cpu;

	for (cpu = 0; cpu < MAXCPU; ++cpu) {
		struct gate_descriptor *ip = &idt_arr[cpu][idx];

		ip->gd_looffset = (uintptr_t)func;
		ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
		ip->gd_ist = ist;
		ip->gd_xx = 0;
		ip->gd_type = typ;
		ip->gd_dpl = dpl;
		ip->gd_p = 1;
		ip->gd_hioffset = ((uintptr_t)func)>>16 ;
	}
}

/*
 * Install an interrupt gate for vector idx into a single cpu's IDT.
 * Same parameters as setidt_global() plus the target cpu.
 */
void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist, int cpu)
{
	struct gate_descriptor *ip;

	KASSERT(cpu >= 0 && cpu < ncpus, ("invalid cpu %d", cpu));

	ip = &idt_arr[cpu][idx];
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
}

#define	IDTVEC(name)	__CONCAT(X,name)

/* Exception and syscall entry points (assembler stubs) */
extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);

/* Per-vector stubs for the 256 reserved/unexpected vectors */
extern inthand_t
	IDTVEC(rsvd00), IDTVEC(rsvd01), IDTVEC(rsvd02), IDTVEC(rsvd03),
	IDTVEC(rsvd04), IDTVEC(rsvd05), IDTVEC(rsvd06), IDTVEC(rsvd07),
	IDTVEC(rsvd08), IDTVEC(rsvd09), IDTVEC(rsvd0a), IDTVEC(rsvd0b),
	IDTVEC(rsvd0c), IDTVEC(rsvd0d), IDTVEC(rsvd0e), IDTVEC(rsvd0f),
	IDTVEC(rsvd10), IDTVEC(rsvd11), IDTVEC(rsvd12), IDTVEC(rsvd13),
	IDTVEC(rsvd14), IDTVEC(rsvd15), IDTVEC(rsvd16), IDTVEC(rsvd17),
	IDTVEC(rsvd18), IDTVEC(rsvd19), IDTVEC(rsvd1a), IDTVEC(rsvd1b),
	IDTVEC(rsvd1c), IDTVEC(rsvd1d), IDTVEC(rsvd1e), IDTVEC(rsvd1f),
	IDTVEC(rsvd20), IDTVEC(rsvd21), IDTVEC(rsvd22), IDTVEC(rsvd23),
	IDTVEC(rsvd24), IDTVEC(rsvd25), IDTVEC(rsvd26), IDTVEC(rsvd27),
	IDTVEC(rsvd28), IDTVEC(rsvd29), IDTVEC(rsvd2a), IDTVEC(rsvd2b),
	IDTVEC(rsvd2c), IDTVEC(rsvd2d), IDTVEC(rsvd2e), IDTVEC(rsvd2f),
	IDTVEC(rsvd30), IDTVEC(rsvd31), IDTVEC(rsvd32), IDTVEC(rsvd33),
	IDTVEC(rsvd34), IDTVEC(rsvd35), IDTVEC(rsvd36), IDTVEC(rsvd37),
	IDTVEC(rsvd38), IDTVEC(rsvd39), IDTVEC(rsvd3a), IDTVEC(rsvd3b),
	IDTVEC(rsvd3c), IDTVEC(rsvd3d), IDTVEC(rsvd3e), IDTVEC(rsvd3f),
	IDTVEC(rsvd40), IDTVEC(rsvd41), IDTVEC(rsvd42), IDTVEC(rsvd43),
	IDTVEC(rsvd44), IDTVEC(rsvd45), IDTVEC(rsvd46), IDTVEC(rsvd47),
	IDTVEC(rsvd48), IDTVEC(rsvd49), IDTVEC(rsvd4a), IDTVEC(rsvd4b),
	IDTVEC(rsvd4c), IDTVEC(rsvd4d), IDTVEC(rsvd4e), IDTVEC(rsvd4f),
	IDTVEC(rsvd50), IDTVEC(rsvd51), IDTVEC(rsvd52), IDTVEC(rsvd53),
	IDTVEC(rsvd54), IDTVEC(rsvd55), IDTVEC(rsvd56), IDTVEC(rsvd57),
	IDTVEC(rsvd58), IDTVEC(rsvd59), IDTVEC(rsvd5a), IDTVEC(rsvd5b),
	IDTVEC(rsvd5c), IDTVEC(rsvd5d), IDTVEC(rsvd5e), IDTVEC(rsvd5f),
	IDTVEC(rsvd60), IDTVEC(rsvd61), IDTVEC(rsvd62), IDTVEC(rsvd63),
	IDTVEC(rsvd64), IDTVEC(rsvd65), IDTVEC(rsvd66), IDTVEC(rsvd67),
	IDTVEC(rsvd68), IDTVEC(rsvd69), IDTVEC(rsvd6a), IDTVEC(rsvd6b),
	IDTVEC(rsvd6c), IDTVEC(rsvd6d), IDTVEC(rsvd6e), IDTVEC(rsvd6f),
	IDTVEC(rsvd70), IDTVEC(rsvd71), IDTVEC(rsvd72), IDTVEC(rsvd73),
	IDTVEC(rsvd74), IDTVEC(rsvd75), IDTVEC(rsvd76), IDTVEC(rsvd77),
	IDTVEC(rsvd78), IDTVEC(rsvd79), IDTVEC(rsvd7a), IDTVEC(rsvd7b),
	IDTVEC(rsvd7c), IDTVEC(rsvd7d), IDTVEC(rsvd7e), IDTVEC(rsvd7f),
	IDTVEC(rsvd80), IDTVEC(rsvd81), IDTVEC(rsvd82), IDTVEC(rsvd83),
	IDTVEC(rsvd84), IDTVEC(rsvd85), IDTVEC(rsvd86), IDTVEC(rsvd87),
	IDTVEC(rsvd88), IDTVEC(rsvd89), IDTVEC(rsvd8a), IDTVEC(rsvd8b),
	IDTVEC(rsvd8c), IDTVEC(rsvd8d), IDTVEC(rsvd8e), IDTVEC(rsvd8f),
	IDTVEC(rsvd90), IDTVEC(rsvd91), IDTVEC(rsvd92), IDTVEC(rsvd93),
	IDTVEC(rsvd94), IDTVEC(rsvd95), IDTVEC(rsvd96), IDTVEC(rsvd97),
	IDTVEC(rsvd98), IDTVEC(rsvd99), IDTVEC(rsvd9a), IDTVEC(rsvd9b),
	IDTVEC(rsvd9c), IDTVEC(rsvd9d), IDTVEC(rsvd9e), IDTVEC(rsvd9f),
	IDTVEC(rsvda0), IDTVEC(rsvda1), IDTVEC(rsvda2), IDTVEC(rsvda3),
	IDTVEC(rsvda4), IDTVEC(rsvda5), IDTVEC(rsvda6), IDTVEC(rsvda7),
	IDTVEC(rsvda8), IDTVEC(rsvda9), IDTVEC(rsvdaa), IDTVEC(rsvdab),
	IDTVEC(rsvdac), IDTVEC(rsvdad), IDTVEC(rsvdae), IDTVEC(rsvdaf),
	IDTVEC(rsvdb0), IDTVEC(rsvdb1), IDTVEC(rsvdb2), IDTVEC(rsvdb3),
	IDTVEC(rsvdb4), IDTVEC(rsvdb5), IDTVEC(rsvdb6), IDTVEC(rsvdb7),
	IDTVEC(rsvdb8), IDTVEC(rsvdb9), IDTVEC(rsvdba), IDTVEC(rsvdbb),
	IDTVEC(rsvdbc), IDTVEC(rsvdbd), IDTVEC(rsvdbe), IDTVEC(rsvdbf),
	IDTVEC(rsvdc0), IDTVEC(rsvdc1), IDTVEC(rsvdc2), IDTVEC(rsvdc3),
	IDTVEC(rsvdc4), IDTVEC(rsvdc5), IDTVEC(rsvdc6), IDTVEC(rsvdc7),
	IDTVEC(rsvdc8), IDTVEC(rsvdc9), IDTVEC(rsvdca), IDTVEC(rsvdcb),
	IDTVEC(rsvdcc), IDTVEC(rsvdcd), IDTVEC(rsvdce), IDTVEC(rsvdcf),
	IDTVEC(rsvdd0), IDTVEC(rsvdd1), IDTVEC(rsvdd2), IDTVEC(rsvdd3),
	IDTVEC(rsvdd4), IDTVEC(rsvdd5), IDTVEC(rsvdd6), IDTVEC(rsvdd7),
	IDTVEC(rsvdd8), IDTVEC(rsvdd9), IDTVEC(rsvdda), IDTVEC(rsvddb),
	IDTVEC(rsvddc), IDTVEC(rsvddd), IDTVEC(rsvdde), IDTVEC(rsvddf),
	IDTVEC(rsvde0), IDTVEC(rsvde1), IDTVEC(rsvde2), IDTVEC(rsvde3),
	IDTVEC(rsvde4), IDTVEC(rsvde5), IDTVEC(rsvde6), IDTVEC(rsvde7),
	IDTVEC(rsvde8), IDTVEC(rsvde9), IDTVEC(rsvdea), IDTVEC(rsvdeb),
	IDTVEC(rsvdec), IDTVEC(rsvded), IDTVEC(rsvdee), IDTVEC(rsvdef),
	IDTVEC(rsvdf0), IDTVEC(rsvdf1), IDTVEC(rsvdf2), IDTVEC(rsvdf3),
	IDTVEC(rsvdf4), IDTVEC(rsvdf5), IDTVEC(rsvdf6), IDTVEC(rsvdf7),
	IDTVEC(rsvdf8), IDTVEC(rsvdf9), IDTVEC(rsvdfa), IDTVEC(rsvdfb),
	IDTVEC(rsvdfc), IDTVEC(rsvdfd), IDTVEC(rsvdfe), IDTVEC(rsvdff);

/* Table of the 256 reserved-vector entry points, indexed by vector */
inthand_t *rsvdary[NIDT] = {
	&IDTVEC(rsvd00), &IDTVEC(rsvd01), &IDTVEC(rsvd02), &IDTVEC(rsvd03),
	&IDTVEC(rsvd04), &IDTVEC(rsvd05), &IDTVEC(rsvd06), &IDTVEC(rsvd07),
	&IDTVEC(rsvd08), &IDTVEC(rsvd09), &IDTVEC(rsvd0a), &IDTVEC(rsvd0b),
	&IDTVEC(rsvd0c), &IDTVEC(rsvd0d), &IDTVEC(rsvd0e), &IDTVEC(rsvd0f),
	&IDTVEC(rsvd10), &IDTVEC(rsvd11), &IDTVEC(rsvd12), &IDTVEC(rsvd13),
	&IDTVEC(rsvd14), &IDTVEC(rsvd15), &IDTVEC(rsvd16), &IDTVEC(rsvd17),
	&IDTVEC(rsvd18), &IDTVEC(rsvd19), &IDTVEC(rsvd1a), &IDTVEC(rsvd1b),
	&IDTVEC(rsvd1c), &IDTVEC(rsvd1d), &IDTVEC(rsvd1e), &IDTVEC(rsvd1f),
	&IDTVEC(rsvd20), &IDTVEC(rsvd21), &IDTVEC(rsvd22), &IDTVEC(rsvd23),
	&IDTVEC(rsvd24), &IDTVEC(rsvd25), &IDTVEC(rsvd26), &IDTVEC(rsvd27),
	&IDTVEC(rsvd28), &IDTVEC(rsvd29), &IDTVEC(rsvd2a), &IDTVEC(rsvd2b),
	&IDTVEC(rsvd2c), &IDTVEC(rsvd2d), &IDTVEC(rsvd2e), &IDTVEC(rsvd2f),
	&IDTVEC(rsvd30), &IDTVEC(rsvd31), &IDTVEC(rsvd32), &IDTVEC(rsvd33),
	&IDTVEC(rsvd34), &IDTVEC(rsvd35), &IDTVEC(rsvd36), &IDTVEC(rsvd37),
	&IDTVEC(rsvd38), &IDTVEC(rsvd39), &IDTVEC(rsvd3a), &IDTVEC(rsvd3b),
	&IDTVEC(rsvd3c), &IDTVEC(rsvd3d), &IDTVEC(rsvd3e), &IDTVEC(rsvd3f),
	&IDTVEC(rsvd40), &IDTVEC(rsvd41), &IDTVEC(rsvd42), &IDTVEC(rsvd43),
	&IDTVEC(rsvd44), &IDTVEC(rsvd45), &IDTVEC(rsvd46), &IDTVEC(rsvd47),
	&IDTVEC(rsvd48), &IDTVEC(rsvd49), &IDTVEC(rsvd4a), &IDTVEC(rsvd4b),
	&IDTVEC(rsvd4c), &IDTVEC(rsvd4d), &IDTVEC(rsvd4e), &IDTVEC(rsvd4f),
	&IDTVEC(rsvd50), &IDTVEC(rsvd51), &IDTVEC(rsvd52), &IDTVEC(rsvd53),
	&IDTVEC(rsvd54), &IDTVEC(rsvd55), &IDTVEC(rsvd56), &IDTVEC(rsvd57),
	&IDTVEC(rsvd58), &IDTVEC(rsvd59), &IDTVEC(rsvd5a), &IDTVEC(rsvd5b),
	&IDTVEC(rsvd5c), &IDTVEC(rsvd5d), &IDTVEC(rsvd5e), &IDTVEC(rsvd5f),
	&IDTVEC(rsvd60), &IDTVEC(rsvd61), &IDTVEC(rsvd62), &IDTVEC(rsvd63),
	&IDTVEC(rsvd64), &IDTVEC(rsvd65), &IDTVEC(rsvd66), &IDTVEC(rsvd67),
	&IDTVEC(rsvd68), &IDTVEC(rsvd69), &IDTVEC(rsvd6a), &IDTVEC(rsvd6b),
	&IDTVEC(rsvd6c), &IDTVEC(rsvd6d), &IDTVEC(rsvd6e), &IDTVEC(rsvd6f),
	&IDTVEC(rsvd70), &IDTVEC(rsvd71), &IDTVEC(rsvd72), &IDTVEC(rsvd73),
	&IDTVEC(rsvd74), &IDTVEC(rsvd75), &IDTVEC(rsvd76), &IDTVEC(rsvd77),
	&IDTVEC(rsvd78), &IDTVEC(rsvd79), &IDTVEC(rsvd7a), &IDTVEC(rsvd7b),
	&IDTVEC(rsvd7c), &IDTVEC(rsvd7d), &IDTVEC(rsvd7e), &IDTVEC(rsvd7f),
	&IDTVEC(rsvd80), &IDTVEC(rsvd81), &IDTVEC(rsvd82), &IDTVEC(rsvd83),
	&IDTVEC(rsvd84), &IDTVEC(rsvd85), &IDTVEC(rsvd86), &IDTVEC(rsvd87),
	&IDTVEC(rsvd88), &IDTVEC(rsvd89), &IDTVEC(rsvd8a), &IDTVEC(rsvd8b),
	&IDTVEC(rsvd8c), &IDTVEC(rsvd8d), &IDTVEC(rsvd8e), &IDTVEC(rsvd8f),
	&IDTVEC(rsvd90), &IDTVEC(rsvd91), &IDTVEC(rsvd92), &IDTVEC(rsvd93),
	&IDTVEC(rsvd94), &IDTVEC(rsvd95), &IDTVEC(rsvd96), &IDTVEC(rsvd97),
	&IDTVEC(rsvd98), &IDTVEC(rsvd99), &IDTVEC(rsvd9a), &IDTVEC(rsvd9b),
	&IDTVEC(rsvd9c), &IDTVEC(rsvd9d), &IDTVEC(rsvd9e), &IDTVEC(rsvd9f),
	&IDTVEC(rsvda0), &IDTVEC(rsvda1), &IDTVEC(rsvda2), &IDTVEC(rsvda3),
	&IDTVEC(rsvda4), &IDTVEC(rsvda5), &IDTVEC(rsvda6), &IDTVEC(rsvda7),
	&IDTVEC(rsvda8), &IDTVEC(rsvda9), &IDTVEC(rsvdaa), &IDTVEC(rsvdab),
	&IDTVEC(rsvdac), &IDTVEC(rsvdad), &IDTVEC(rsvdae), &IDTVEC(rsvdaf),
	&IDTVEC(rsvdb0), &IDTVEC(rsvdb1), &IDTVEC(rsvdb2), &IDTVEC(rsvdb3),
	&IDTVEC(rsvdb4), &IDTVEC(rsvdb5), &IDTVEC(rsvdb6), &IDTVEC(rsvdb7),
	&IDTVEC(rsvdb8), &IDTVEC(rsvdb9), &IDTVEC(rsvdba), &IDTVEC(rsvdbb),
	&IDTVEC(rsvdbc), &IDTVEC(rsvdbd), &IDTVEC(rsvdbe), &IDTVEC(rsvdbf),
	&IDTVEC(rsvdc0), &IDTVEC(rsvdc1), &IDTVEC(rsvdc2), &IDTVEC(rsvdc3),
	&IDTVEC(rsvdc4), &IDTVEC(rsvdc5), &IDTVEC(rsvdc6), &IDTVEC(rsvdc7),
	&IDTVEC(rsvdc8), &IDTVEC(rsvdc9), &IDTVEC(rsvdca), &IDTVEC(rsvdcb),
	&IDTVEC(rsvdcc), &IDTVEC(rsvdcd), &IDTVEC(rsvdce), &IDTVEC(rsvdcf),
	&IDTVEC(rsvdd0), &IDTVEC(rsvdd1), &IDTVEC(rsvdd2), &IDTVEC(rsvdd3),
	&IDTVEC(rsvdd4), &IDTVEC(rsvdd5), &IDTVEC(rsvdd6), &IDTVEC(rsvdd7),
	&IDTVEC(rsvdd8), &IDTVEC(rsvdd9), &IDTVEC(rsvdda), &IDTVEC(rsvddb),
	&IDTVEC(rsvddc), &IDTVEC(rsvddd), &IDTVEC(rsvdde), &IDTVEC(rsvddf),
	&IDTVEC(rsvde0), &IDTVEC(rsvde1), &IDTVEC(rsvde2), &IDTVEC(rsvde3),
	&IDTVEC(rsvde4), &IDTVEC(rsvde5), &IDTVEC(rsvde6), &IDTVEC(rsvde7),
	&IDTVEC(rsvde8), &IDTVEC(rsvde9), &IDTVEC(rsvdea), &IDTVEC(rsvdeb),
	&IDTVEC(rsvdec), &IDTVEC(rsvded), &IDTVEC(rsvdee), &IDTVEC(rsvdef),
	&IDTVEC(rsvdf0), &IDTVEC(rsvdf1), &IDTVEC(rsvdf2), &IDTVEC(rsvdf3),
	&IDTVEC(rsvdf4), &IDTVEC(rsvdf5), &IDTVEC(rsvdf6), &IDTVEC(rsvdf7),
	&IDTVEC(rsvdf8), &IDTVEC(rsvdf9), &IDTVEC(rsvdfa), &IDTVEC(rsvdfb),
	&IDTVEC(rsvdfc), &IDTVEC(rsvdfd), &IDTVEC(rsvdfe), &IDTVEC(rsvdff)
};

/*
 * Convert a hardware user segment descriptor to its software
 * (soft_segment_descriptor) representation.
 */
void
sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
{
	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

/*
 * Convert a software segment descriptor to the hardware user segment
 * descriptor format (inverse of sdtossd()).
 */
void
ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;
}

/*
 * Convert a software segment descriptor to the hardware system segment
 * descriptor format (used for the TSS; wider base field).
 */
void
ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
}

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * If we cannot accurately determine the physical memory map, then use
 * value from the 0xE801 call, and failing that, the RTC.
1950 * 1951 * Total memory size may be set by the kernel environment variable 1952 * hw.physmem or the compile-time define MAXMEM. 1953 * 1954 * Memory is aligned to PHYSMAP_ALIGN which must be a multiple 1955 * of PAGE_SIZE. This also greatly reduces the memory test time 1956 * which would otherwise be excessive on machines with > 8G of ram. 1957 * 1958 * XXX first should be vm_paddr_t. 1959 */ 1960 1961 #define PHYSMAP_ALIGN (vm_paddr_t)(128 * 1024) 1962 #define PHYSMAP_ALIGN_MASK (vm_paddr_t)(PHYSMAP_ALIGN - 1) 1963 #define PHYSMAP_SIZE VM_PHYSSEG_MAX 1964 1965 vm_paddr_t physmap[PHYSMAP_SIZE]; 1966 struct bios_smap *smapbase, *smap, *smapend; 1967 struct efi_map_header *efihdrbase; 1968 u_int32_t smapsize; 1969 1970 #define PHYSMAP_HANDWAVE (vm_paddr_t)(2 * 1024 * 1024) 1971 #define PHYSMAP_HANDWAVE_MASK (PHYSMAP_HANDWAVE - 1) 1972 1973 static void 1974 add_smap_entries(int *physmap_idx) 1975 { 1976 int i; 1977 1978 smapsize = *((u_int32_t *)smapbase - 1); 1979 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); 1980 1981 for (smap = smapbase; smap < smapend; smap++) { 1982 if (boothowto & RB_VERBOSE) 1983 kprintf("SMAP type=%02x base=%016lx len=%016lx\n", 1984 smap->type, smap->base, smap->length); 1985 1986 if (smap->type != SMAP_TYPE_MEMORY) 1987 continue; 1988 1989 if (smap->length == 0) 1990 continue; 1991 1992 for (i = 0; i <= *physmap_idx; i += 2) { 1993 if (smap->base < physmap[i + 1]) { 1994 if (boothowto & RB_VERBOSE) { 1995 kprintf("Overlapping or non-monotonic " 1996 "memory region, ignoring " 1997 "second region\n"); 1998 } 1999 break; 2000 } 2001 } 2002 if (i <= *physmap_idx) 2003 continue; 2004 2005 Realmem += smap->length; 2006 2007 /* 2008 * NOTE: This little bit of code initially expands 2009 * physmap[1] as well as later entries. 
2010 */ 2011 if (smap->base == physmap[*physmap_idx + 1]) { 2012 physmap[*physmap_idx + 1] += smap->length; 2013 continue; 2014 } 2015 2016 *physmap_idx += 2; 2017 if (*physmap_idx == PHYSMAP_SIZE) { 2018 kprintf("Too many segments in the physical " 2019 "address map, giving up\n"); 2020 break; 2021 } 2022 physmap[*physmap_idx] = smap->base; 2023 physmap[*physmap_idx + 1] = smap->base + smap->length; 2024 } 2025 } 2026 2027 static void 2028 add_efi_map_entries(int *physmap_idx) 2029 { 2030 struct efi_md *map, *p; 2031 const char *type; 2032 size_t efisz; 2033 int i, ndesc; 2034 2035 static const char *types[] = { 2036 "Reserved", 2037 "LoaderCode", 2038 "LoaderData", 2039 "BootServicesCode", 2040 "BootServicesData", 2041 "RuntimeServicesCode", 2042 "RuntimeServicesData", 2043 "ConventionalMemory", 2044 "UnusableMemory", 2045 "ACPIReclaimMemory", 2046 "ACPIMemoryNVS", 2047 "MemoryMappedIO", 2048 "MemoryMappedIOPortSpace", 2049 "PalCode" 2050 }; 2051 2052 /* 2053 * Memory map data provided by UEFI via the GetMemoryMap 2054 * Boot Services API. 
2055 */ 2056 efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf; 2057 map = (struct efi_md *)((uint8_t *)efihdrbase + efisz); 2058 2059 if (efihdrbase->descriptor_size == 0) 2060 return; 2061 ndesc = efihdrbase->memory_size / efihdrbase->descriptor_size; 2062 2063 if (boothowto & RB_VERBOSE) 2064 kprintf("%23s %12s %12s %8s %4s\n", 2065 "Type", "Physical", "Virtual", "#Pages", "Attr"); 2066 2067 for (i = 0, p = map; i < ndesc; i++, 2068 p = efi_next_descriptor(p, efihdrbase->descriptor_size)) { 2069 if (boothowto & RB_VERBOSE) { 2070 if (p->md_type <= EFI_MD_TYPE_PALCODE) 2071 type = types[p->md_type]; 2072 else 2073 type = "<INVALID>"; 2074 kprintf("%23s %012lx %12p %08lx ", type, p->md_phys, 2075 p->md_virt, p->md_pages); 2076 if (p->md_attr & EFI_MD_ATTR_UC) 2077 kprintf("UC "); 2078 if (p->md_attr & EFI_MD_ATTR_WC) 2079 kprintf("WC "); 2080 if (p->md_attr & EFI_MD_ATTR_WT) 2081 kprintf("WT "); 2082 if (p->md_attr & EFI_MD_ATTR_WB) 2083 kprintf("WB "); 2084 if (p->md_attr & EFI_MD_ATTR_UCE) 2085 kprintf("UCE "); 2086 if (p->md_attr & EFI_MD_ATTR_WP) 2087 kprintf("WP "); 2088 if (p->md_attr & EFI_MD_ATTR_RP) 2089 kprintf("RP "); 2090 if (p->md_attr & EFI_MD_ATTR_XP) 2091 kprintf("XP "); 2092 if (p->md_attr & EFI_MD_ATTR_RT) 2093 kprintf("RUNTIME"); 2094 kprintf("\n"); 2095 } 2096 2097 switch (p->md_type) { 2098 case EFI_MD_TYPE_CODE: 2099 case EFI_MD_TYPE_DATA: 2100 case EFI_MD_TYPE_BS_CODE: 2101 case EFI_MD_TYPE_BS_DATA: 2102 case EFI_MD_TYPE_FREE: 2103 /* 2104 * We're allowed to use any entry with these types. 2105 */ 2106 break; 2107 default: 2108 continue; 2109 } 2110 2111 Realmem += p->md_pages * PAGE_SIZE; 2112 2113 /* 2114 * NOTE: This little bit of code initially expands 2115 * physmap[1] as well as later entries. 
2116 */ 2117 if (p->md_phys == physmap[*physmap_idx + 1]) { 2118 physmap[*physmap_idx + 1] += p->md_pages * PAGE_SIZE; 2119 continue; 2120 } 2121 2122 *physmap_idx += 2; 2123 if (*physmap_idx == PHYSMAP_SIZE) { 2124 kprintf("Too many segments in the physical " 2125 "address map, giving up\n"); 2126 break; 2127 } 2128 physmap[*physmap_idx] = p->md_phys; 2129 physmap[*physmap_idx + 1] = p->md_phys + p->md_pages * PAGE_SIZE; 2130 } 2131 } 2132 2133 struct fb_info efi_fb_info; 2134 static int have_efi_framebuffer = 0; 2135 2136 static void 2137 efi_fb_init_vaddr(int direct_map) 2138 { 2139 uint64_t sz; 2140 vm_offset_t addr, v; 2141 2142 v = efi_fb_info.vaddr; 2143 sz = efi_fb_info.stride * efi_fb_info.height; 2144 2145 if (direct_map) { 2146 addr = PHYS_TO_DMAP(efi_fb_info.paddr); 2147 if (addr >= DMAP_MIN_ADDRESS && addr + sz <= DMapMaxAddress) 2148 efi_fb_info.vaddr = addr; 2149 } else { 2150 efi_fb_info.vaddr = 2151 (vm_offset_t)pmap_mapdev_attr(efi_fb_info.paddr, 2152 sz, 2153 PAT_WRITE_COMBINING); 2154 } 2155 } 2156 2157 static u_int 2158 efifb_color_depth(struct efi_fb *efifb) 2159 { 2160 uint32_t mask; 2161 u_int depth; 2162 2163 mask = efifb->fb_mask_red | efifb->fb_mask_green | 2164 efifb->fb_mask_blue | efifb->fb_mask_reserved; 2165 if (mask == 0) 2166 return (0); 2167 for (depth = 1; mask != 1; depth++) 2168 mask >>= 1; 2169 return (depth); 2170 } 2171 2172 int 2173 probe_efi_fb(int early) 2174 { 2175 struct efi_fb *efifb; 2176 caddr_t kmdp; 2177 u_int depth; 2178 2179 if (have_efi_framebuffer) { 2180 if (!early && 2181 (efi_fb_info.vaddr == 0 || 2182 efi_fb_info.vaddr == PHYS_TO_DMAP(efi_fb_info.paddr))) 2183 efi_fb_init_vaddr(0); 2184 return 0; 2185 } 2186 2187 kmdp = preload_search_by_type("elf kernel"); 2188 if (kmdp == NULL) 2189 kmdp = preload_search_by_type("elf64 kernel"); 2190 efifb = (struct efi_fb *)preload_search_info(kmdp, 2191 MODINFO_METADATA | MODINFOMD_EFI_FB); 2192 if (efifb == NULL) 2193 return 1; 2194 2195 depth = 
efifb_color_depth(efifb);
        /*
         * Our bootloader should already notice, when we won't be able to
         * use the UEFI framebuffer.
         */
        if (depth != 24 && depth != 32)
                return 1;

        have_efi_framebuffer = 1;

        efi_fb_info.is_vga_boot_display = 1;
        efi_fb_info.width = efifb->fb_width;
        efi_fb_info.height = efifb->fb_height;
        efi_fb_info.depth = depth;
        /* stride converted from pixels to bytes */
        efi_fb_info.stride = efifb->fb_stride * (depth / 8);
        efi_fb_info.paddr = efifb->fb_addr;
        if (early) {
                efi_fb_info.vaddr = 0;
        } else {
                efi_fb_init_vaddr(0);
        }
        efi_fb_info.fbops.fb_set_par = NULL;
        efi_fb_info.fbops.fb_blank = NULL;
        efi_fb_info.fbops.fb_debug_enter = NULL;
        efi_fb_info.device = NULL;

        return 0;
}

/* SYSINIT hook: retry the framebuffer probe once pmap services exist */
static void
efifb_startup(void *arg)
{
        probe_efi_fb(0);
}

SYSINIT(efi_fb_info, SI_BOOT1_POST, SI_ORDER_FIRST, efifb_startup, NULL);

/*
 * Build physmap[]/phys_avail[]/dump_avail[] from the loader-supplied
 * memory map, clamp to hw.physmem / MAXMEM / DMAP size, spot-test each
 * segment, and reserve space for the kernel message buffer.
 * 'first' is the first physical address available above the kernel.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
        int off, physmap_idx, pa_indx, da_indx;
        int i, j;
        vm_paddr_t pa;
        vm_paddr_t msgbuf_size;
        u_long physmem_tunable;
        pt_entry_t *pte;
        quad_t dcons_addr, dcons_size;

        bzero(physmap, sizeof(physmap));
        physmap_idx = 0;

        /*
         * get memory map from INT 15:E820, kindly supplied by the loader.
         *
         * subr_module.c says:
         * "Consumer may safely assume that size value precedes data."
         * ie: an int32_t immediately precedes smap.
         */
        efihdrbase = (struct efi_map_header *)preload_search_info(kmdp,
            MODINFO_METADATA | MODINFOMD_EFI_MAP);
        smapbase = (struct bios_smap *)preload_search_info(kmdp,
            MODINFO_METADATA | MODINFOMD_SMAP);
        if (smapbase == NULL && efihdrbase == NULL)
                panic("No BIOS smap or EFI map info from loader!");

        /* Prefer the EFI map when the loader supplied one */
        if (efihdrbase == NULL)
                add_smap_entries(&physmap_idx);
        else
                add_efi_map_entries(&physmap_idx);

        base_memory = physmap[1] / 1024;
        /* make hole for AP bootstrap code */
        physmap[1] = mp_bootaddress(base_memory);

        /* Save EBDA address, if any */
        ebda_addr = (u_long)(*(u_short *)(KERNBASE + 0x40e));
        ebda_addr <<= 4;

        /*
         * Maxmem isn't the "maximum memory", it's one larger than the
         * highest page of the physical address space.  It should be
         * called something like "Maxphyspage".  We may adjust this
         * based on ``hw.physmem'' and the results of the memory test.
         */
        Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
        Maxmem = MAXMEM / 4;
#endif

        if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
                Maxmem = atop(physmem_tunable);

        /*
         * Don't allow MAXMEM or hw.physmem to extend the amount of memory
         * in the system.
         */
        if (Maxmem > atop(physmap[physmap_idx + 1]))
                Maxmem = atop(physmap[physmap_idx + 1]);

        /*
         * Blowing out the DMAP will blow up the system.
         */
        if (Maxmem > atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS)) {
                kprintf("Limiting Maxmem due to DMAP size\n");
                Maxmem = atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS);
        }

        if (atop(physmap[physmap_idx + 1]) != Maxmem &&
            (boothowto & RB_VERBOSE)) {
                kprintf("Physical memory use set to %ldK\n", Maxmem * 4);
        }

        /*
         * Call pmap initialization to make new kernel address space
         *
         * Mask off page 0.
         */
        pmap_bootstrap(&first);
        physmap[0] = PAGE_SIZE;

        /*
         * Align the physmap to PHYSMAP_ALIGN and cut out anything
         * exceeding Maxmem.
         */
        for (i = j = 0; i <= physmap_idx; i += 2) {
                if (physmap[i+1] > ptoa(Maxmem))
                        physmap[i+1] = ptoa(Maxmem);
                physmap[i] = (physmap[i] + PHYSMAP_ALIGN_MASK) &
                             ~PHYSMAP_ALIGN_MASK;
                physmap[i+1] = physmap[i+1] & ~PHYSMAP_ALIGN_MASK;

                /* compact away segments emptied by the alignment above */
                physmap[j] = physmap[i];
                physmap[j+1] = physmap[i+1];

                if (physmap[i] < physmap[i+1])
                        j += 2;
        }
        physmap_idx = j - 2;

        /*
         * Align anything else used in the validation loop.
         *
         * Also make sure that our 2MB kernel text+data+bss mappings
         * do not overlap potentially allocatable space.
         */
        first = (first + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;

        /*
         * Size up each available chunk of physical memory.
         */
        pa_indx = 0;
        da_indx = 0;
        phys_avail[pa_indx].phys_beg = physmap[0];
        phys_avail[pa_indx].phys_end = physmap[0];
        dump_avail[da_indx].phys_beg = 0;
        dump_avail[da_indx].phys_end = physmap[0];
        pte = CMAP1;

        /*
         * Get dcons buffer address
         */
        if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 ||
            kgetenv_quad("dcons.size", &dcons_size) == 0)
                dcons_addr = 0;

        /*
         * Validate the physical memory.  The physical memory segments
         * have already been aligned to PHYSMAP_ALIGN which is a multiple
         * of PAGE_SIZE.
         *
         * We no longer perform an exhaustive memory test.  Instead we
         * simply test the first and last word in each physmap[]
         * segment.
         */
        for (i = 0; i <= physmap_idx; i += 2) {
                vm_paddr_t end;
                vm_paddr_t incr;

                end = physmap[i + 1];

                for (pa = physmap[i]; pa < end; pa += incr) {
                        int page_bad, full;
                        volatile uint64_t *ptr = (uint64_t *)CADDR1;
                        uint64_t tmp;

                        full = FALSE;

                        /*
                         * Calculate incr.  Just test the first and
                         * last page in each physmap[] segment.
                         */
                        if (pa == end - PAGE_SIZE)
                                incr = PAGE_SIZE;
                        else
                                incr = end - pa - PAGE_SIZE;

                        /*
                         * Make sure we don't skip blacked out areas.
                         */
                        if (pa < 0x200000 && 0x200000 < end) {
                                incr = 0x200000 - pa;
                        }
                        if (dcons_addr > 0 &&
                            pa < dcons_addr &&
                            dcons_addr < end) {
                                incr = dcons_addr - pa;
                        }

                        /*
                         * Block out kernel memory as not available.
                         */
                        if (pa >= 0x200000 && pa < first) {
                                incr = first - pa;
                                if (pa + incr > end)
                                        incr = end - pa;
                                goto do_dump_avail;
                        }

                        /*
                         * Block out the dcons buffer if it exists.
                         */
                        if (dcons_addr > 0 &&
                            pa >= trunc_page(dcons_addr) &&
                            pa < dcons_addr + dcons_size) {
                                incr = dcons_addr + dcons_size - pa;
                                incr = (incr + PAGE_MASK) &
                                       ~(vm_paddr_t)PAGE_MASK;
                                if (pa + incr > end)
                                        incr = end - pa;
                                goto do_dump_avail;
                        }

                        page_bad = FALSE;

                        /*
                         * Map the page non-cacheable for the memory
                         * test.
                         */
                        *pte = pa |
                            kernel_pmap->pmap_bits[PG_V_IDX] |
                            kernel_pmap->pmap_bits[PG_RW_IDX] |
                            kernel_pmap->pmap_bits[PG_N_IDX];
                        cpu_invlpg(__DEVOLATILE(void *, ptr));
                        cpu_mfence();

                        /*
                         * Save original value for restoration later.
                         */
                        tmp = *ptr;

                        /*
                         * Test for alternating 1's and 0's
                         */
                        *ptr = 0xaaaaaaaaaaaaaaaaLLU;
                        cpu_mfence();
                        if (*ptr != 0xaaaaaaaaaaaaaaaaLLU)
                                page_bad = TRUE;
                        /*
                         * Test for alternating 0's and 1's
                         */
                        *ptr = 0x5555555555555555LLU;
                        cpu_mfence();
                        if (*ptr != 0x5555555555555555LLU)
                                page_bad = TRUE;
                        /*
                         * Test for all 1's
                         */
                        *ptr = 0xffffffffffffffffLLU;
                        cpu_mfence();
                        if (*ptr != 0xffffffffffffffffLLU)
                                page_bad = TRUE;
                        /*
                         * Test for all 0's
                         */
                        *ptr = 0x0;
                        cpu_mfence();
                        if (*ptr != 0x0)
                                page_bad = TRUE;

                        /*
                         * Restore original value.
                         */
                        *ptr = tmp;

                        /*
                         * Adjust array of valid/good pages.
                         */
                        if (page_bad == TRUE) {
                                /* step one page and retest the remainder */
                                incr = PAGE_SIZE;
                                continue;
                        }

                        /*
                         * Collapse page address into phys_avail[].  Do a
                         * continuation of the current phys_avail[] index
                         * when possible.
                         */
                        if (phys_avail[pa_indx].phys_end == pa) {
                                /*
                                 * Continuation
                                 */
                                phys_avail[pa_indx].phys_end += incr;
                        } else if (phys_avail[pa_indx].phys_beg ==
                                   phys_avail[pa_indx].phys_end) {
                                /*
                                 * Current phys_avail is completely empty,
                                 * reuse the index.
                                 */
                                phys_avail[pa_indx].phys_beg = pa;
                                phys_avail[pa_indx].phys_end = pa + incr;
                        } else {
                                /*
                                 * Allocate next phys_avail index.
                                 */
                                ++pa_indx;
                                if (pa_indx == PHYS_AVAIL_ARRAY_END) {
                                        kprintf(
                "Too many holes in the physical address space, giving up\n");
                                        --pa_indx;
                                        full = TRUE;
                                        goto do_dump_avail;
                                }
                                phys_avail[pa_indx].phys_beg = pa;
                                phys_avail[pa_indx].phys_end = pa + incr;
                        }
                        physmem += incr / PAGE_SIZE;

                        /*
                         * pa available for dumping
                         */
do_dump_avail:
                        if (dump_avail[da_indx].phys_end == pa) {
                                dump_avail[da_indx].phys_end += incr;
                        } else {
                                ++da_indx;
                                if (da_indx == DUMP_AVAIL_ARRAY_END) {
                                        --da_indx;
                                        goto do_next;
                                }
                                dump_avail[da_indx].phys_beg = pa;
                                dump_avail[da_indx].phys_end = pa + incr;
                        }
do_next:
                        if (full)
                                break;
                }
        }
        *pte = 0;
        cpu_invltlb();
        cpu_mfence();

        /*
         * The last chunk must contain at least one page plus the message
         * buffer to avoid complicating other code (message buffer address
         * calculation, etc.).
         */
        msgbuf_size = (MSGBUF_SIZE + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;

        while (phys_avail[pa_indx].phys_beg + PHYSMAP_ALIGN + msgbuf_size >=
               phys_avail[pa_indx].phys_end) {
                physmem -= atop(phys_avail[pa_indx].phys_end -
                                phys_avail[pa_indx].phys_beg);
                phys_avail[pa_indx].phys_beg = 0;
                phys_avail[pa_indx].phys_end = 0;
                --pa_indx;
        }

        Maxmem = atop(phys_avail[pa_indx].phys_end);

        /* Trim off space for the message buffer. */
        phys_avail[pa_indx].phys_end -= msgbuf_size;

        avail_end = phys_avail[pa_indx].phys_end;

        /* Map the message buffer. */
        for (off = 0; off < msgbuf_size; off += PAGE_SIZE) {
                pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
        }

        /*
         * Try to get EFI framebuffer working as early as possible.
         *
         * WARN: Some BIOSes do not list the EFI framebuffer memory, causing
         * the pmap probe code to create a DMAP that does not cover its
         * physical address space, efi_fb_init_vaddr(1) might not return
         * an initialized framebuffer base pointer.  In this situation the
         * later efi_fb_init_vaddr(0) call will deal with it.
         */
        if (have_efi_framebuffer)
                efi_fb_init_vaddr(1);
}

struct machintr_abi MachIntrABI;

/*
 * IDT VECTORS:
 *      0       Divide by zero
 *      1       Debug
 *      2       NMI
 *      3       BreakPoint
 *      4       OverFlow
 *      5       Bound-Range
 *      6       Invalid OpCode
 *      7       Device Not Available (x87)
 *      8       Double-Fault
 *      9       Coprocessor Segment overrun (unsupported, reserved)
 *      10      Invalid-TSS
 *      11      Segment not present
 *      12      Stack
 *      13      General Protection
 *      14      Page Fault
 *      15      Reserved
 *      16      x87 FP Exception pending
 *      17      Alignment Check
 *      18      Machine Check
 *      19      SIMD floating point
 *      20-31   reserved
 *      32-255  INTn/external sources
 */
/*
 * Early machine-dependent boot entry, called from locore with the loader
 * module pointer and the first free physical address.  Sets up per-cpu
 * data, GDT/IDT/TSS, syscall MSRs and memory, and returns the address of
 * proc0's pcb (the kernel stack locore should switch to).
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
        caddr_t kmdp;
        int gsel_tss, x, cpu;
#if 0 /* JG */
        int metadata_missing, off;
#endif
        struct mdglobaldata *gd;
        struct privatespace *ps;
        u_int64_t msr;

        /*
         * Prevent lowering of the ipl if we call tsleep() early.
         */
        gd = &CPU_prvspace[0]->mdglobaldata;
        ps = (struct privatespace *)gd;
        bzero(gd, sizeof(*gd));
        bzero(&ps->common_tss, sizeof(ps->common_tss));

        /*
         * Note: on both UP and SMP curthread must be set non-NULL
         * early in the boot sequence because the system assumes
         * that 'curthread' is never NULL.
         */

        gd->mi.gd_curthread = &thread0;
        thread0.td_gd = &gd->mi;

        atdevbase = ISA_HOLE_START + PTOV_OFFSET;

#if 0 /* JG */
        metadata_missing = 0;
        if (bootinfo.bi_modulep) {
                preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
                preload_bootstrap_relocate(KERNBASE);
        } else {
                metadata_missing = 1;
        }
        if (bootinfo.bi_envp)
                kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
#endif

        /* Locate the loader metadata and pull boot parameters out of it */
        preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET);
        preload_bootstrap_relocate(PTOV_OFFSET);
        kmdp = preload_search_by_type("elf kernel");
        if (kmdp == NULL)
                kmdp = preload_search_by_type("elf64 kernel");
        boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
        kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET;
#ifdef DDB
        ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
        ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
#endif
        efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

        if (boothowto & RB_VERBOSE)
                bootverbose++;

        /*
         * Default MachIntrABI to ICU
         */
        MachIntrABI = MachIntrABI_ICU;

        /*
         * start with one cpu.  Note: with one cpu, ncpus_fit_mask remain 0.
         */
        ncpus = 1;
        ncpus_fit = 1;
        /* Init basic tunables, hz etc */
        init_param1();

        /*
         * make gdt memory segments
         */
        gdt_segs[GPROC0_SEL].ssd_base =
                (uintptr_t) &CPU_prvspace[0]->common_tss;

        gd->mi.gd_prvspace = CPU_prvspace[0];

        /* GPROC0_SEL is the TSS (a system descriptor, two GDT slots) */
        for (x = 0; x < NGDT; x++) {
                if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
                        ssdtosd(&gdt_segs[x], &gdt[x]);
        }
        ssdtosyssd(&gdt_segs[GPROC0_SEL],
            (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

        r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
        r_gdt.rd_base = (long) gdt;
        lgdt(&r_gdt);

        wrmsr(MSR_FSBASE, 0);           /* User value */
        wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi);
        wrmsr(MSR_KGSBASE, 0);          /* User value while in the kernel */

        mi_gdinit(&gd->mi, 0);
        cpu_gdinit(gd, 0);
        proc0paddr = proc0paddr_buff;
        mi_proc0init(&gd->mi, proc0paddr);
        safepri = TDPRI_MAX;

        /* spinlocks and the BGL */
        init_locks();

        /* exceptions: fill the whole IDT with reserved gates first */
        for (x = 0; x < NIDT; x++)
                setidt_global(x, rsvdary[x], SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_DE, &IDTVEC(div),  SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_DB, &IDTVEC(dbg),  SDT_SYSIGT, SEL_KPL, 2);
        setidt_global(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 1);
        setidt_global(IDT_BP, &IDTVEC(bpt),  SDT_SYSIGT, SEL_UPL, 0);
        setidt_global(IDT_OF, &IDTVEC(ofl),  SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_BR, &IDTVEC(bnd),  SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_UD, &IDTVEC(ill),  SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_NM, &IDTVEC(dna),  SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
        setidt_global(IDT_FPUGP, &IDTVEC(fpusegm),  SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_TS, &IDTVEC(tss),  SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_NP, &IDTVEC(missing),  SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_SS, &IDTVEC(stk),  SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_GP, &IDTVEC(prot),  SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_PF, &IDTVEC(page),  SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_MF, &IDTVEC(fpu),  SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_MC, &IDTVEC(mchk),  SDT_SYSIGT, SEL_KPL, 0);
        setidt_global(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);

        for (cpu = 0; cpu < MAXCPU; ++cpu) {
                r_idt_arr[cpu].rd_limit = sizeof(idt_arr[cpu]) - 1;
                r_idt_arr[cpu].rd_base = (long) &idt_arr[cpu][0];
        }

        lidt(&r_idt_arr[0]);

        /*
         * Initialize the console before we print anything out.
         */
        cninit();

#if 0 /* JG */
        if (metadata_missing)
                kprintf("WARNING: loader(8) metadata is missing!\n");
#endif

#if NISA >0
        elcr_probe();
        isa_defaultirq();
#endif
        rand_initialize();

        /*
         * Initialize IRQ mapping
         *
         * NOTE:
         * SHOULD be after elcr_probe()
         */
        MachIntrABI_ICU.initmap();
        MachIntrABI_IOAPIC.initmap();

#ifdef DDB
        kdb_init();
        if (boothowto & RB_KDB)
                Debugger("Boot flags requested debugger");
#endif

        identify_cpu();         /* Final stage of CPU initialization */
        initializecpu(0);       /* Initialize CPU registers */

        /*
         * On modern Intel cpus, haswell or later, cpu_idle_hlt=1 is better
         * because the cpu does significant power management in MWAIT
         * (also suggested is to set sysctl machdep.mwait.CX.idle=AUTODEEP).
         *
         * On many AMD cpus cpu_idle_hlt=3 is better, because the cpu does
         * significant power management only when using ACPI halt mode.
         * (However, on Ryzen, mode 4 (HLT) also does power management).
         *
         * On older AMD or Intel cpus, cpu_idle_hlt=2 is better because ACPI
         * is needed to reduce power consumption, but wakeup times are often
         * too long.
         */
        if (cpu_vendor_id == CPU_VENDOR_INTEL &&
            CPUID_TO_MODEL(cpu_id) >= 0x3C) {   /* Haswell or later */
                cpu_idle_hlt = 1;
        }
        if (cpu_vendor_id == CPU_VENDOR_AMD) {
                if (CPUID_TO_FAMILY(cpu_id) >= 0x17) {
                        /* Ryzen or later */
                        cpu_idle_hlt = 3;
                } else if (CPUID_TO_FAMILY(cpu_id) >= 0x14) {
                        /* Bobcat or later */
                        cpu_idle_hlt = 3;
                }
        }

        TUNABLE_INT_FETCH("hw.apic_io_enable", &ioapic_enable); /* for compat */
        TUNABLE_INT_FETCH("hw.ioapic_enable", &ioapic_enable);
        TUNABLE_INT_FETCH("hw.lapic_enable", &lapic_enable);
        TUNABLE_INT_FETCH("machdep.cpu_idle_hlt", &cpu_idle_hlt);

        /*
         * By default always enable the ioapic.  Certain virtual machines
         * may not work with the I/O apic enabled and can be specified in
         * the case statement below.  On the other hand, if the ioapic is
         * disabled for virtual machines which DO work with the I/O apic,
         * the virtual machine can implode if we disable the I/O apic.
         *
         * For now enable the ioapic for all guests.
         *
         * NOTE: This must be done after identify_cpu(), which sets
         *       'cpu_feature2'.
         */
        if (ioapic_enable < 0) {
                ioapic_enable = 1;
                switch(vmm_guest) {
                case VMM_GUEST_NONE:    /* should be enabled on real HW */
                case VMM_GUEST_KVM:     /* must be enabled or VM implodes */
                        ioapic_enable = 1;
                        break;
                default:                /* enable by default for other VMs */
                        ioapic_enable = 1;
                        break;
                }
        }

        /*
         * TSS entry point for interrupts, traps, and exceptions
         * (sans NMI).  This will always go to near the top of the pcpu
         * trampoline area.  Hardware-pushed data will be copied into
         * the trap-frame on entry, and (if necessary) returned to the
         * trampoline on exit.
         *
         * We store some pcb data for the trampoline code above the
         * stack the cpu hw pushes into, and arrange things so the
         * address of tr_pcb_rsp is the same as the desired top of
         * stack.
         */
        ps->common_tss.tss_rsp0 = (register_t)&ps->trampoline.tr_pcb_rsp;
        ps->trampoline.tr_pcb_rsp = ps->common_tss.tss_rsp0;
        ps->trampoline.tr_pcb_gs_kernel = (register_t)gd;
        ps->trampoline.tr_pcb_cr3 = KPML4phys; /* adj to user cr3 live */
        ps->dbltramp.tr_pcb_gs_kernel = (register_t)gd;
        ps->dbltramp.tr_pcb_cr3 = KPML4phys;
        ps->dbgtramp.tr_pcb_gs_kernel = (register_t)gd;
        ps->dbgtramp.tr_pcb_cr3 = KPML4phys;

        /* double fault stack */
        ps->common_tss.tss_ist1 = (register_t)&ps->dbltramp.tr_pcb_rsp;
        /* #DB debugger needs its own stack */
        ps->common_tss.tss_ist2 = (register_t)&ps->dbgtramp.tr_pcb_rsp;

        /* Set the IO permission bitmap (empty due to tss seg limit) */
        ps->common_tss.tss_iobase = sizeof(struct x86_64tss);

        gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
        gd->gd_tss_gdt = &gdt[GPROC0_SEL];
        gd->gd_common_tssd = *gd->gd_tss_gdt;
        ltr(gsel_tss);

        /* Set up the fast syscall stuff */
        msr = rdmsr(MSR_EFER) | EFER_SCE;
        wrmsr(MSR_EFER, msr);
        wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
        wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
        msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
              ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
        wrmsr(MSR_STAR, msr);
        wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL|PSL_AC);

        getmemsize(kmdp, physfree);
        init_param2(physmem);

        /* now running on new page tables, configured, and u/iom is accessible */

        /* Map the message buffer. */
#if 0 /* JG */
        for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
                pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
#endif

        msgbufinit(msgbufp, MSGBUF_SIZE);


        /* transfer to user mode */

        _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
        _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
        _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);

        load_ds(_udatasel);
        load_es(_udatasel);
        load_fs(_udatasel);

        /* setup proc 0's pcb */
        thread0.td_pcb->pcb_flags = 0;
        thread0.td_pcb->pcb_cr3 = KPML4phys;
        thread0.td_pcb->pcb_cr3_iso = 0;
        thread0.td_pcb->pcb_ext = NULL;
        lwp0.lwp_md.md_regs = &proc0_tf;        /* XXX needed? */

        /* Location of kernel stack for locore */
        return ((u_int64_t)thread0.td_pcb);
}

/*
 * Initialize machine-dependant portions of the global data structure.
 * Note that the global data area and cpu0's idlestack in the private
 * data space were allocated in locore.
 *
 * Note: the idlethread's cpl is 0
 *
 * WARNING!  Called from early boot, 'mycpu' may not work yet.
 */
void
cpu_gdinit(struct mdglobaldata *gd, int cpu)
{
        /* cpu0's curthread was already pointed at thread0 by hammer_time */
        if (cpu)
                gd->mi.gd_curthread = &gd->mi.gd_idlethread;

        lwkt_init_thread(&gd->mi.gd_idlethread,
                        gd->mi.gd_prvspace->idlestack,
                        sizeof(gd->mi.gd_prvspace->idlestack),
                        0, &gd->mi);
        lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
        gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
        /* prime the idle thread's stack so its first switch-in "returns" */
        gd->mi.gd_idlethread.td_sp -= sizeof(void *);
        *(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
}

/*
 * We only have to check for DMAP bounds, the globaldata space is
 * actually part of the kernel_map so we don't have to waste time
 * checking CPU_prvspace[*].
 */
int
is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
{
#if 0
        if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
            eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
                return (TRUE);
        }
#endif
        if (saddr >= DMAP_MIN_ADDRESS && eaddr <= DMAP_MAX_ADDRESS)
                return (TRUE);
        return (FALSE);
}

/* Return the globaldata structure for the given cpu id (must be valid). */
struct globaldata *
globaldata_find(int cpu)
{
        KKASSERT(cpu >= 0 && cpu < ncpus);
        return(&CPU_prvspace[cpu]->mdglobaldata.mi);
}

/*
 * This path should be safe from the SYSRET issue because only stopped threads
 * can have their %rip adjusted this way (and all heavy weight thread switches
 * clear QUICKREF and thus do not use SYSRET).  However, the code path is
 * convoluted so add a safety by forcing %rip to be cannonical.
 */
int
ptrace_set_pc(struct lwp *lp, unsigned long addr)
{
        /* sign-extend bit 47 so the stored %rip is always canonical */
        if (addr & 0x0000800000000000LLU)
                lp->lwp_md.md_regs->tf_rip = addr | 0xFFFF000000000000LLU;
        else
                lp->lwp_md.md_regs->tf_rip = addr & 0x0000FFFFFFFFFFFFLLU;
        return (0);
}

/* Arm the trap flag so the lwp traps after executing one instruction. */
int
ptrace_single_step(struct lwp *lp)
{
        lp->lwp_md.md_regs->tf_rflags |= PSL_T;
        return (0);
}

/*
 * Copy the lwp's trapframe registers into *regs.
 * NOTE: relies on struct reg and the trapframe sharing the same register
 * layout starting at tf_rdi / r_rdi, so one bcopy covers all of them.
 */
int
fill_regs(struct lwp *lp, struct reg *regs)
{
        struct trapframe *tp;

        if ((tp = lp->lwp_md.md_regs) == NULL)
                return EINVAL;
        bcopy(&tp->tf_rdi, &regs->r_rdi, sizeof(*regs));
        return (0);
}

/*
 * Install *regs into the lwp's trapframe, refusing rflags/%cs values that
 * would elevate privilege (EFL_SECURE / CS_SECURE checks).
 */
int
set_regs(struct lwp *lp, struct reg *regs)
{
        struct trapframe *tp;

        tp = lp->lwp_md.md_regs;
        if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
            !CS_SECURE(regs->r_cs))
                return (EINVAL);
        bcopy(&regs->r_rdi, &tp->tf_rdi, sizeof(*regs));
        /* force a heavy-weight return path; see SYSRET note above */
        clear_quickret();
        return (0);
}

/* Convert an FXSAVE (xmm) area into the legacy save87 layout. */
static void
fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
{
        struct env87 *penv_87 = &sv_87->sv_env;
        struct envxmm *penv_xmm =
&sv_xmm->sv_env; 3023 int i; 3024 3025 /* FPU control/status */ 3026 penv_87->en_cw = penv_xmm->en_cw; 3027 penv_87->en_sw = penv_xmm->en_sw; 3028 penv_87->en_tw = penv_xmm->en_tw; 3029 penv_87->en_fip = penv_xmm->en_fip; 3030 penv_87->en_fcs = penv_xmm->en_fcs; 3031 penv_87->en_opcode = penv_xmm->en_opcode; 3032 penv_87->en_foo = penv_xmm->en_foo; 3033 penv_87->en_fos = penv_xmm->en_fos; 3034 3035 /* FPU registers */ 3036 for (i = 0; i < 8; ++i) 3037 sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc; 3038 } 3039 3040 static void 3041 set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm) 3042 { 3043 struct env87 *penv_87 = &sv_87->sv_env; 3044 struct envxmm *penv_xmm = &sv_xmm->sv_env; 3045 int i; 3046 3047 /* FPU control/status */ 3048 penv_xmm->en_cw = penv_87->en_cw; 3049 penv_xmm->en_sw = penv_87->en_sw; 3050 penv_xmm->en_tw = penv_87->en_tw; 3051 penv_xmm->en_fip = penv_87->en_fip; 3052 penv_xmm->en_fcs = penv_87->en_fcs; 3053 penv_xmm->en_opcode = penv_87->en_opcode; 3054 penv_xmm->en_foo = penv_87->en_foo; 3055 penv_xmm->en_fos = penv_87->en_fos; 3056 3057 /* FPU registers */ 3058 for (i = 0; i < 8; ++i) 3059 sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i]; 3060 } 3061 3062 int 3063 fill_fpregs(struct lwp *lp, struct fpreg *fpregs) 3064 { 3065 if (lp->lwp_thread == NULL || lp->lwp_thread->td_pcb == NULL) 3066 return EINVAL; 3067 if (cpu_fxsr) { 3068 fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm, 3069 (struct save87 *)fpregs); 3070 return (0); 3071 } 3072 bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs); 3073 return (0); 3074 } 3075 3076 int 3077 set_fpregs(struct lwp *lp, struct fpreg *fpregs) 3078 { 3079 if (cpu_fxsr) { 3080 set_fpregs_xmm((struct save87 *)fpregs, 3081 &lp->lwp_thread->td_pcb->pcb_save.sv_xmm); 3082 return (0); 3083 } 3084 bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs); 3085 return (0); 3086 } 3087 3088 int 3089 fill_dbregs(struct lwp *lp, struct dbreg *dbregs) 3090 { 3091 struct pcb *pcb; 
3092 3093 if (lp == NULL) { 3094 dbregs->dr[0] = rdr0(); 3095 dbregs->dr[1] = rdr1(); 3096 dbregs->dr[2] = rdr2(); 3097 dbregs->dr[3] = rdr3(); 3098 dbregs->dr[4] = rdr4(); 3099 dbregs->dr[5] = rdr5(); 3100 dbregs->dr[6] = rdr6(); 3101 dbregs->dr[7] = rdr7(); 3102 return (0); 3103 } 3104 if (lp->lwp_thread == NULL || (pcb = lp->lwp_thread->td_pcb) == NULL) 3105 return EINVAL; 3106 dbregs->dr[0] = pcb->pcb_dr0; 3107 dbregs->dr[1] = pcb->pcb_dr1; 3108 dbregs->dr[2] = pcb->pcb_dr2; 3109 dbregs->dr[3] = pcb->pcb_dr3; 3110 dbregs->dr[4] = 0; 3111 dbregs->dr[5] = 0; 3112 dbregs->dr[6] = pcb->pcb_dr6; 3113 dbregs->dr[7] = pcb->pcb_dr7; 3114 return (0); 3115 } 3116 3117 int 3118 set_dbregs(struct lwp *lp, struct dbreg *dbregs) 3119 { 3120 if (lp == NULL) { 3121 load_dr0(dbregs->dr[0]); 3122 load_dr1(dbregs->dr[1]); 3123 load_dr2(dbregs->dr[2]); 3124 load_dr3(dbregs->dr[3]); 3125 load_dr4(dbregs->dr[4]); 3126 load_dr5(dbregs->dr[5]); 3127 load_dr6(dbregs->dr[6]); 3128 load_dr7(dbregs->dr[7]); 3129 } else { 3130 struct pcb *pcb; 3131 struct ucred *ucred; 3132 int i; 3133 uint64_t mask1, mask2; 3134 3135 /* 3136 * Don't let an illegal value for dr7 get set. Specifically, 3137 * check for undefined settings. Setting these bit patterns 3138 * result in undefined behaviour and can lead to an unexpected 3139 * TRCTRAP. 3140 */ 3141 /* JG this loop looks unreadable */ 3142 /* Check 4 2-bit fields for invalid patterns. 3143 * These fields are R/Wi, for i = 0..3 3144 */ 3145 /* Is 10 in LENi allowed when running in compatibility mode? */ 3146 /* Pattern 10 in R/Wi might be used to indicate 3147 * breakpoint on I/O. 
Further analysis should be 3148 * carried to decide if it is safe and useful to 3149 * provide access to that capability 3150 */ 3151 for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4; 3152 i++, mask1 <<= 4, mask2 <<= 4) 3153 if ((dbregs->dr[7] & mask1) == mask2) 3154 return (EINVAL); 3155 3156 pcb = lp->lwp_thread->td_pcb; 3157 ucred = lp->lwp_proc->p_ucred; 3158 3159 /* 3160 * Don't let a process set a breakpoint that is not within the 3161 * process's address space. If a process could do this, it 3162 * could halt the system by setting a breakpoint in the kernel 3163 * (if ddb was enabled). Thus, we need to check to make sure 3164 * that no breakpoints are being enabled for addresses outside 3165 * process's address space, unless, perhaps, we were called by 3166 * uid 0. 3167 * 3168 * XXX - what about when the watched area of the user's 3169 * address space is written into from within the kernel 3170 * ... wouldn't that still cause a breakpoint to be generated 3171 * from within kernel mode? 
3172 */ 3173 3174 if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) { 3175 if (dbregs->dr[7] & 0x3) { 3176 /* dr0 is enabled */ 3177 if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS) 3178 return (EINVAL); 3179 } 3180 3181 if (dbregs->dr[7] & (0x3<<2)) { 3182 /* dr1 is enabled */ 3183 if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS) 3184 return (EINVAL); 3185 } 3186 3187 if (dbregs->dr[7] & (0x3<<4)) { 3188 /* dr2 is enabled */ 3189 if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS) 3190 return (EINVAL); 3191 } 3192 3193 if (dbregs->dr[7] & (0x3<<6)) { 3194 /* dr3 is enabled */ 3195 if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS) 3196 return (EINVAL); 3197 } 3198 } 3199 3200 pcb->pcb_dr0 = dbregs->dr[0]; 3201 pcb->pcb_dr1 = dbregs->dr[1]; 3202 pcb->pcb_dr2 = dbregs->dr[2]; 3203 pcb->pcb_dr3 = dbregs->dr[3]; 3204 pcb->pcb_dr6 = dbregs->dr[6]; 3205 pcb->pcb_dr7 = dbregs->dr[7]; 3206 3207 pcb->pcb_flags |= PCB_DBREGS; 3208 } 3209 3210 return (0); 3211 } 3212 3213 /* 3214 * Return > 0 if a hardware breakpoint has been hit, and the 3215 * breakpoint was in user space. Return 0, otherwise. 
3216 */ 3217 int 3218 user_dbreg_trap(void) 3219 { 3220 u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */ 3221 u_int64_t bp; /* breakpoint bits extracted from dr6 */ 3222 int nbp; /* number of breakpoints that triggered */ 3223 caddr_t addr[4]; /* breakpoint addresses */ 3224 int i; 3225 3226 dr7 = rdr7(); 3227 if ((dr7 & 0xff) == 0) { 3228 /* 3229 * all GE and LE bits in the dr7 register are zero, 3230 * thus the trap couldn't have been caused by the 3231 * hardware debug registers 3232 */ 3233 return 0; 3234 } 3235 3236 nbp = 0; 3237 dr6 = rdr6(); 3238 bp = dr6 & 0xf; 3239 3240 if (bp == 0) { 3241 /* 3242 * None of the breakpoint bits are set meaning this 3243 * trap was not caused by any of the debug registers 3244 */ 3245 return 0; 3246 } 3247 3248 /* 3249 * at least one of the breakpoints were hit, check to see 3250 * which ones and if any of them are user space addresses 3251 */ 3252 3253 if (bp & 0x01) { 3254 addr[nbp++] = (caddr_t)rdr0(); 3255 } 3256 if (bp & 0x02) { 3257 addr[nbp++] = (caddr_t)rdr1(); 3258 } 3259 if (bp & 0x04) { 3260 addr[nbp++] = (caddr_t)rdr2(); 3261 } 3262 if (bp & 0x08) { 3263 addr[nbp++] = (caddr_t)rdr3(); 3264 } 3265 3266 for (i = 0; i < nbp; i++) { 3267 if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) { 3268 /* 3269 * addr[i] is in user space 3270 */ 3271 return nbp; 3272 } 3273 } 3274 3275 /* 3276 * None of the breakpoints are in user space. 3277 */ 3278 return 0; 3279 } 3280 3281 3282 #ifndef DDB 3283 void 3284 Debugger(const char *msg) 3285 { 3286 kprintf("Debugger(\"%s\") called.\n", msg); 3287 } 3288 #endif /* no DDB */ 3289 3290 #ifdef DDB 3291 3292 /* 3293 * Provide inb() and outb() as functions. They are normally only 3294 * available as macros calling inlined functions, thus cannot be 3295 * called inside DDB. 3296 * 3297 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined. 
3298 */ 3299 3300 #undef inb 3301 #undef outb 3302 3303 /* silence compiler warnings */ 3304 u_char inb(u_int); 3305 void outb(u_int, u_char); 3306 3307 u_char 3308 inb(u_int port) 3309 { 3310 u_char data; 3311 /* 3312 * We use %%dx and not %1 here because i/o is done at %dx and not at 3313 * %edx, while gcc generates inferior code (movw instead of movl) 3314 * if we tell it to load (u_short) port. 3315 */ 3316 __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); 3317 return (data); 3318 } 3319 3320 void 3321 outb(u_int port, u_char data) 3322 { 3323 u_char al; 3324 /* 3325 * Use an unnecessary assignment to help gcc's register allocator. 3326 * This make a large difference for gcc-1.40 and a tiny difference 3327 * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for 3328 * best results. gcc-2.6.0 can't handle this. 3329 */ 3330 al = data; 3331 __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); 3332 } 3333 3334 #endif /* DDB */ 3335 3336 3337 3338 /* 3339 * initialize all the SMP locks 3340 */ 3341 3342 /* critical region when masking or unmasking interupts */ 3343 struct spinlock_deprecated imen_spinlock; 3344 3345 /* locks com (tty) data/hardware accesses: a FASTINTR() */ 3346 struct spinlock_deprecated com_spinlock; 3347 3348 /* lock regions around the clock hardware */ 3349 struct spinlock_deprecated clock_spinlock; 3350 3351 static void 3352 init_locks(void) 3353 { 3354 /* 3355 * Get the initial mplock with a count of 1 for the BSP. 3356 * This uses a LOGICAL cpu ID, ie BSP == 0. 
3357 */ 3358 cpu_get_initial_mplock(); 3359 /* DEPRECATED */ 3360 spin_init_deprecated(&imen_spinlock); 3361 spin_init_deprecated(&com_spinlock); 3362 spin_init_deprecated(&clock_spinlock); 3363 3364 /* our token pool needs to work early */ 3365 lwkt_token_pool_init(); 3366 } 3367 3368 boolean_t 3369 cpu_mwait_hint_valid(uint32_t hint) 3370 { 3371 int cx_idx, sub; 3372 3373 cx_idx = MWAIT_EAX_TO_CX(hint); 3374 if (cx_idx >= CPU_MWAIT_CX_MAX) 3375 return FALSE; 3376 3377 sub = MWAIT_EAX_TO_CX_SUB(hint); 3378 if (sub >= cpu_mwait_cx_info[cx_idx].subcnt) 3379 return FALSE; 3380 3381 return TRUE; 3382 } 3383 3384 void 3385 cpu_mwait_cx_no_bmsts(void) 3386 { 3387 atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_STS); 3388 } 3389 3390 void 3391 cpu_mwait_cx_no_bmarb(void) 3392 { 3393 atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_ARB); 3394 } 3395 3396 static int 3397 cpu_mwait_cx_hint2name(int hint, char *name, int namelen, boolean_t allow_auto) 3398 { 3399 int old_cx_idx, sub = 0; 3400 3401 if (hint >= 0) { 3402 old_cx_idx = MWAIT_EAX_TO_CX(hint); 3403 sub = MWAIT_EAX_TO_CX_SUB(hint); 3404 } else if (hint == CPU_MWAIT_HINT_AUTO) { 3405 old_cx_idx = allow_auto ? CPU_MWAIT_C2 : CPU_MWAIT_CX_MAX; 3406 } else if (hint == CPU_MWAIT_HINT_AUTODEEP) { 3407 old_cx_idx = allow_auto ? 
CPU_MWAIT_C3 : CPU_MWAIT_CX_MAX; 3408 } else { 3409 old_cx_idx = CPU_MWAIT_CX_MAX; 3410 } 3411 3412 if (!CPU_MWAIT_HAS_CX) 3413 strlcpy(name, "NONE", namelen); 3414 else if (allow_auto && hint == CPU_MWAIT_HINT_AUTO) 3415 strlcpy(name, "AUTO", namelen); 3416 else if (allow_auto && hint == CPU_MWAIT_HINT_AUTODEEP) 3417 strlcpy(name, "AUTODEEP", namelen); 3418 else if (old_cx_idx >= CPU_MWAIT_CX_MAX || 3419 sub >= cpu_mwait_cx_info[old_cx_idx].subcnt) 3420 strlcpy(name, "INVALID", namelen); 3421 else 3422 ksnprintf(name, namelen, "C%d/%d", old_cx_idx, sub); 3423 3424 return old_cx_idx; 3425 } 3426 3427 static int 3428 cpu_mwait_cx_name2hint(char *name, int *hint0, boolean_t allow_auto) 3429 { 3430 int cx_idx, sub, hint; 3431 char *ptr, *start; 3432 3433 if (allow_auto && strcmp(name, "AUTO") == 0) { 3434 hint = CPU_MWAIT_HINT_AUTO; 3435 cx_idx = CPU_MWAIT_C2; 3436 goto done; 3437 } 3438 if (allow_auto && strcmp(name, "AUTODEEP") == 0) { 3439 hint = CPU_MWAIT_HINT_AUTODEEP; 3440 cx_idx = CPU_MWAIT_C3; 3441 goto done; 3442 } 3443 3444 if (strlen(name) < 4 || toupper(name[0]) != 'C') 3445 return -1; 3446 start = &name[1]; 3447 ptr = NULL; 3448 3449 cx_idx = strtol(start, &ptr, 10); 3450 if (ptr == start || *ptr != '/') 3451 return -1; 3452 if (cx_idx < 0 || cx_idx >= CPU_MWAIT_CX_MAX) 3453 return -1; 3454 3455 start = ptr + 1; 3456 ptr = NULL; 3457 3458 sub = strtol(start, &ptr, 10); 3459 if (*ptr != '\0') 3460 return -1; 3461 if (sub < 0 || sub >= cpu_mwait_cx_info[cx_idx].subcnt) 3462 return -1; 3463 3464 hint = MWAIT_EAX_HINT(cx_idx, sub); 3465 done: 3466 *hint0 = hint; 3467 return cx_idx; 3468 } 3469 3470 static int 3471 cpu_mwait_cx_transit(int old_cx_idx, int cx_idx) 3472 { 3473 if (cx_idx >= CPU_MWAIT_C3 && cpu_mwait_c3_preamble) 3474 return EOPNOTSUPP; 3475 if (old_cx_idx < CPU_MWAIT_C3 && cx_idx >= CPU_MWAIT_C3) { 3476 int error; 3477 3478 error = cputimer_intr_powersave_addreq(); 3479 if (error) 3480 return error; 3481 } else if (old_cx_idx >= CPU_MWAIT_C3 && 
cx_idx < CPU_MWAIT_C3) { 3482 cputimer_intr_powersave_remreq(); 3483 } 3484 return 0; 3485 } 3486 3487 static int 3488 cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS, int *hint0, 3489 boolean_t allow_auto) 3490 { 3491 int error, cx_idx, old_cx_idx, hint; 3492 char name[CPU_MWAIT_CX_NAMELEN]; 3493 3494 hint = *hint0; 3495 old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), 3496 allow_auto); 3497 3498 error = sysctl_handle_string(oidp, name, sizeof(name), req); 3499 if (error != 0 || req->newptr == NULL) 3500 return error; 3501 3502 if (!CPU_MWAIT_HAS_CX) 3503 return EOPNOTSUPP; 3504 3505 cx_idx = cpu_mwait_cx_name2hint(name, &hint, allow_auto); 3506 if (cx_idx < 0) 3507 return EINVAL; 3508 3509 error = cpu_mwait_cx_transit(old_cx_idx, cx_idx); 3510 if (error) 3511 return error; 3512 3513 *hint0 = hint; 3514 return 0; 3515 } 3516 3517 static int 3518 cpu_mwait_cx_setname(struct cpu_idle_stat *stat, const char *cx_name) 3519 { 3520 int error, cx_idx, old_cx_idx, hint; 3521 char name[CPU_MWAIT_CX_NAMELEN]; 3522 3523 KASSERT(CPU_MWAIT_HAS_CX, ("cpu does not support mwait CX extension")); 3524 3525 hint = stat->hint; 3526 old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE); 3527 3528 strlcpy(name, cx_name, sizeof(name)); 3529 cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE); 3530 if (cx_idx < 0) 3531 return EINVAL; 3532 3533 error = cpu_mwait_cx_transit(old_cx_idx, cx_idx); 3534 if (error) 3535 return error; 3536 3537 stat->hint = hint; 3538 return 0; 3539 } 3540 3541 static int 3542 cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS) 3543 { 3544 int hint = cpu_mwait_halt_global; 3545 int error, cx_idx, cpu; 3546 char name[CPU_MWAIT_CX_NAMELEN], cx_name[CPU_MWAIT_CX_NAMELEN]; 3547 3548 cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE); 3549 3550 error = sysctl_handle_string(oidp, name, sizeof(name), req); 3551 if (error != 0 || req->newptr == NULL) 3552 return error; 3553 3554 if (!CPU_MWAIT_HAS_CX) 3555 return EOPNOTSUPP; 3556 3557 /* Save 
name for later per-cpu CX configuration */ 3558 strlcpy(cx_name, name, sizeof(cx_name)); 3559 3560 cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE); 3561 if (cx_idx < 0) 3562 return EINVAL; 3563 3564 /* Change per-cpu CX configuration */ 3565 for (cpu = 0; cpu < ncpus; ++cpu) { 3566 error = cpu_mwait_cx_setname(&cpu_idle_stats[cpu], cx_name); 3567 if (error) 3568 return error; 3569 } 3570 3571 cpu_mwait_halt_global = hint; 3572 return 0; 3573 } 3574 3575 static int 3576 cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS) 3577 { 3578 struct cpu_idle_stat *stat = arg1; 3579 int error; 3580 3581 error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req, 3582 &stat->hint, TRUE); 3583 return error; 3584 } 3585 3586 static int 3587 cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS) 3588 { 3589 int error; 3590 3591 error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req, 3592 &cpu_mwait_spin, FALSE); 3593 return error; 3594 } 3595 3596 /* 3597 * This manual debugging code is called unconditionally from Xtimer 3598 * (the per-cpu timer interrupt) whether the current thread is in a 3599 * critical section or not) and can be useful in tracking down lockups. 
3600 * 3601 * NOTE: MANUAL DEBUG CODE 3602 */ 3603 #if 0 3604 static int saveticks[SMP_MAXCPU]; 3605 static int savecounts[SMP_MAXCPU]; 3606 #endif 3607 static tsc_uclock_t last_tsc[SMP_MAXCPU]; 3608 3609 void 3610 pcpu_timer_always(struct intrframe *frame) 3611 { 3612 globaldata_t gd; 3613 thread_t td; 3614 char *top; 3615 char *bot; 3616 char *rbp; 3617 char *rip; 3618 int n; 3619 tsc_uclock_t tsc; 3620 3621 if (flame_poll_debug == 0) 3622 return; 3623 gd = mycpu; 3624 tsc = rdtsc() - last_tsc[gd->gd_cpuid]; 3625 if (tsc_frequency == 0 || tsc < tsc_frequency) 3626 return; 3627 last_tsc[gd->gd_cpuid] = rdtsc(); 3628 3629 td = gd->gd_curthread; 3630 if (td == NULL) 3631 return; 3632 bot = (char *)td->td_kstack + PAGE_SIZE; /* skip guard */ 3633 top = (char *)td->td_kstack + td->td_kstack_size; 3634 if (bot >= top) 3635 return; 3636 3637 rip = (char *)(intptr_t)frame->if_rip; 3638 kprintf("POLL%02d %016lx", gd->gd_cpuid, (intptr_t)rip); 3639 rbp = (char *)(intptr_t)frame->if_rbp; 3640 3641 for (n = 1; n < 8; ++n) { 3642 if (rbp < bot || rbp > top - 8 || ((intptr_t)rbp & 7)) 3643 break; 3644 kprintf("<-%016lx", (intptr_t)*(char **)(rbp + 8)); 3645 if (*(char **)rbp <= rbp) 3646 break; 3647 rbp = *(char **)rbp; 3648 } 3649 kprintf("\n"); 3650 cpu_sfence(); 3651 } 3652 3653 SET_DECLARE(smap_open, char); 3654 SET_DECLARE(smap_close, char); 3655 3656 static void 3657 cpu_implement_smap(void) 3658 { 3659 char **scan; 3660 3661 for (scan = SET_BEGIN(smap_open); /* nop -> stac */ 3662 scan < SET_LIMIT(smap_open); ++scan) { 3663 (*scan)[0] = 0x0F; 3664 (*scan)[1] = 0x01; 3665 (*scan)[2] = 0xCB; 3666 } 3667 for (scan = SET_BEGIN(smap_close); /* nop -> clac */ 3668 scan < SET_LIMIT(smap_close); ++scan) { 3669 (*scan)[0] = 0x0F; 3670 (*scan)[1] = 0x01; 3671 (*scan)[2] = 0xCA; 3672 } 3673 } 3674 3675 /* 3676 * From a hard interrupt 3677 */ 3678 int 3679 cpu_interrupt_running(struct thread *td) 3680 { 3681 struct mdglobaldata *gd = mdcpu; 3682 3683 if (clock_debug1 > 0) { 3684 
--clock_debug1; 3685 kprintf("%d %016lx %016lx %016lx\n", 3686 ((td->td_flags & TDF_INTTHREAD) != 0), 3687 gd->gd_ipending[0], 3688 gd->gd_ipending[1], 3689 gd->gd_ipending[2]); 3690 if (td->td_flags & TDF_CLKTHREAD) { 3691 kprintf("CLKTD %s PREEMPT %s\n", 3692 td->td_comm, 3693 (td->td_preempted ? 3694 td->td_preempted->td_comm : "")); 3695 } else { 3696 kprintf("NORTD %s\n", td->td_comm); 3697 } 3698 } 3699 if ((td->td_flags & TDF_INTTHREAD) || 3700 gd->gd_ipending[0] || 3701 gd->gd_ipending[1] || 3702 gd->gd_ipending[2]) { 3703 return 1; 3704 } else { 3705 return 0; 3706 } 3707 } 3708