/*-
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 2008-2017 The DragonFly Project.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
 */

//#include "use_npx.h"
#include "use_isa.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_msgbuf.h"
#include "opt_swap.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmsg.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/priv.h>
#include <sys/buf.h>
#include <sys/reboot.h>
#include <sys/mbuf.h>
#include <sys/msgbuf.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/bus.h>
#include <sys/usched.h>
#include <sys/reg.h>
#include <sys/sbuf.h>
#include <sys/ctype.h>
#include <sys/serialize.h>
#include <sys/systimer.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>

#include <sys/exec.h>
#include <sys/cons.h>

#include <sys/efi.h>

#include <ddb/ddb.h>

#include <machine/cpu.h>
#include <machine/clock.h>
#include <machine/specialreg.h>
#if 0 /* JG */
#include <machine/bootinfo.h>
#endif
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/pcb_ext.h>
#include <machine/globaldata.h>		/* CPU_prvspace */
#include <machine/smp.h>
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <machine/framebuffer.h>

#ifdef OLD_BUS_ARCH
#include <bus/isa/isa_device.h>
#endif
#include <machine_base/isa/isa_intr.h>
#include <bus/isa/rtc.h>
#include <sys/random.h>
#include <sys/ptrace.h>
#include <machine/sigframe.h>

#include <sys/machintr.h>
#include <machine_base/icu/icu_abi.h>
#include <machine_base/icu/elcr_var.h>
#include <machine_base/apic/lapic.h>
#include <machine_base/apic/ioapic.h>
#include <machine_base/apic/ioapic_abi.h>
#include <machine/mptable.h>

#define PHYSMAP_ENTRIES		10
#define MAXBUFSTRUCTSIZE	((size_t)512 * 1024 * 1024)

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

extern void printcpuinfo(void);	/* XXX header file */
extern void identify_cpu(void);
extern void panicifcpuunsupported(void);

static void cpu_startup(void *);
static void pic_finish(void *);
static void cpu_finish(void *);

static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
static void init_locks(void);

extern void pcpu_timer_always(struct intrframe *);

SYSINIT(cpu, SI_BOOT2_START_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
SYSINIT(pic_finish, SI_BOOT2_FINISH_PIC, SI_ORDER_FIRST, pic_finish, NULL);
SYSINIT(cpu_finish, SI_BOOT2_FINISH_CPU, SI_ORDER_FIRST, cpu_finish, NULL);

#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif

struct privatespace CPU_prvspace_bsp __aligned(4096);
struct privatespace *CPU_prvspace[MAXCPU] = { &CPU_prvspace_bsp };

vm_paddr_t efi_systbl_phys;
int	_udatasel, _ucodesel, _ucode32sel;
u_long	atdevbase;
int64_t
tsc_offsets[MAXCPU];
cpumask_t smp_idleinvl_mask;
cpumask_t smp_idleinvl_reqs;

/* MWAIT hint (EAX) or CPU_MWAIT_HINT_ */
__read_mostly static int cpu_mwait_halt_global;
__read_mostly static int clock_debug1;
__read_mostly static int flame_poll_debug;

SYSCTL_INT(_debug, OID_AUTO, flame_poll_debug,
	CTLFLAG_RW, &flame_poll_debug, 0, "");
TUNABLE_INT("debug.flame_poll_debug", &flame_poll_debug);

#if defined(SWTCH_OPTIM_STATS)
extern int swtch_optim_stats;
SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
	CTLFLAG_RD, &swtch_optim_stats, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
	CTLFLAG_RD, &tlb_flush_count, 0, "");
#endif
SYSCTL_INT(_debug, OID_AUTO, clock_debug1,
	CTLFLAG_RW, &clock_debug1, 0, "");
SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_halt,
	CTLFLAG_RD, &cpu_mwait_halt_global, 0, "");
SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_spin,
	CTLFLAG_RD, &cpu_mwait_spin, 0, "monitor/mwait target state");

#define CPU_MWAIT_HAS_CX	\
	((cpu_feature2 & CPUID2_MON) && \
	 (cpu_mwait_feature & CPUID_MWAIT_EXT))

#define CPU_MWAIT_CX_NAMELEN	16

#define CPU_MWAIT_C1		1
#define CPU_MWAIT_C2		2
#define CPU_MWAIT_C3		3
#define CPU_MWAIT_CX_MAX	8

#define CPU_MWAIT_HINT_AUTO	-1	/* C1 and C2 */
#define CPU_MWAIT_HINT_AUTODEEP	-2	/* C3+ */

SYSCTL_NODE(_machdep, OID_AUTO, mwait, CTLFLAG_RW, 0, "MWAIT features");
SYSCTL_NODE(_machdep_mwait, OID_AUTO, CX, CTLFLAG_RW, 0, "MWAIT Cx settings");

struct cpu_mwait_cx {
	int			subcnt;
	char			name[4];
	struct sysctl_ctx_list	sysctl_ctx;
	struct sysctl_oid	*sysctl_tree;
};
static struct cpu_mwait_cx	cpu_mwait_cx_info[CPU_MWAIT_CX_MAX];
static char			cpu_mwait_cx_supported[256];

static int			cpu_mwait_c1_hints_cnt;
static int			cpu_mwait_hints_cnt;
static int			*cpu_mwait_hints;

static int			cpu_mwait_deep_hints_cnt;
static int			*cpu_mwait_deep_hints;

#define CPU_IDLE_REPEAT_DEFAULT	750

static u_int cpu_idle_repeat = CPU_IDLE_REPEAT_DEFAULT;
static u_long cpu_idle_repeat_max = CPU_IDLE_REPEAT_DEFAULT;
static u_int cpu_mwait_repeat_shift = 1;

#define CPU_MWAIT_C3_PREAMBLE_BM_ARB	0x1
#define CPU_MWAIT_C3_PREAMBLE_BM_STS	0x2

static int cpu_mwait_c3_preamble =
	CPU_MWAIT_C3_PREAMBLE_BM_ARB |
	CPU_MWAIT_C3_PREAMBLE_BM_STS;

SYSCTL_STRING(_machdep_mwait_CX, OID_AUTO, supported, CTLFLAG_RD,
	cpu_mwait_cx_supported, 0, "MWAIT supported C states");
SYSCTL_INT(_machdep_mwait_CX, OID_AUTO, c3_preamble, CTLFLAG_RD,
	&cpu_mwait_c3_preamble, 0, "C3+ preamble mask");

static int	cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS,
		    int *, boolean_t);
static int	cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int	cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int	cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS);

SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, idle, CTLTYPE_STRING|CTLFLAG_RW,
	NULL, 0, cpu_mwait_cx_idle_sysctl, "A", "");
SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, spin, CTLTYPE_STRING|CTLFLAG_RW,
	NULL, 0, cpu_mwait_cx_spin_sysctl, "A", "");
SYSCTL_UINT(_machdep_mwait_CX, OID_AUTO, repeat_shift, CTLFLAG_RW,
	&cpu_mwait_repeat_shift, 0, "");

long physmem = 0;

u_long ebda_addr = 0;

int imcr_present = 0;

int naps = 0;			/* # of Application processors */

u_int base_memory;

static int
sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
{
	u_long pmem = ctob(physmem);
	int error;

	error = sysctl_handle_long(oidp, &pmem, 0, req);

	return (error);
}

SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG|CTLFLAG_RD,
	0, 0, sysctl_hw_physmem, "LU",
	"Total system memory in bytes (number of pages * page size)");

static int
sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
{
	u_long usermem = ctob(physmem - vmstats.v_wire_count);
	int error;

	error = sysctl_handle_long(oidp, &usermem, 0, req);

	return (error);
}

SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_ULONG|CTLFLAG_RD,
	0, 0, sysctl_hw_usermem, "LU", "");

static int
sysctl_hw_availpages(SYSCTL_HANDLER_ARGS)
{
	int error;
	u_long availpages;

	availpages = x86_64_btop(avail_end - avail_start);
	error = sysctl_handle_long(oidp, &availpages, 0, req);

	return (error);
}

SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_ULONG|CTLFLAG_RD,
	0, 0, sysctl_hw_availpages, "LU", "");

vm_paddr_t Maxmem;
vm_paddr_t Realmem;

/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
vm_phystable_t phys_avail[VM_PHYSSEG_MAX + 1];
vm_phystable_t dump_avail[VM_PHYSSEG_MAX + 1];

/* must be 1 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END	(NELEM(phys_avail) - 1)
#define DUMP_AVAIL_ARRAY_END	(NELEM(dump_avail) - 1)

static vm_offset_t buffer_sva, buffer_eva;
vm_offset_t clean_sva, clean_eva;
static vm_offset_t pager_sva, pager_eva;
static struct trapframe proc0_tf;

static void cpu_implement_smap(void);

static void
cpu_startup(void *dummy)
{
	caddr_t v;
	vm_size_t size = 0;
	vm_offset_t firstaddr;

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	kprintf("%s", version);
	startrtclock();
	printcpuinfo();
	panicifcpuunsupported();
	if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
		cpu_implement_smap();

	kprintf("real memory = %ju (%ju MB)\n",
		(intmax_t)Realmem,
		(intmax_t)Realmem / 1024 / 1024);
	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		kprintf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx].phys_end != 0; ++indx) {
			vm_paddr_t size1;

			size1 = phys_avail[indx].phys_end -
				phys_avail[indx].phys_beg;

			kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
				(intmax_t)phys_avail[indx].phys_beg,
				(intmax_t)phys_avail[indx].phys_end - 1,
				(intmax_t)size1,
				(intmax_t)(size1 / PAGE_SIZE));
		}
	}

	/*
	 * Allocate space for system data structures.
	 * The first available kernel virtual address is in "v".
	 * As pages of kernel virtual memory are allocated, "v" is incremented.
	 * As pages of memory are allocated and cleared,
	 * "firstaddr" is incremented.
	 * An index into the kernel page table corresponding to the
	 * virtual memory address maintained in "v" is kept in "mapaddr".
	 */

	/*
	 * Make two passes.  The first pass calculates how much memory is
	 * needed and allocates it.  The second pass assigns virtual
	 * addresses to the various data structures.
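	 *
	 * A sketch of the pattern (illustrative only; valloc() is the
	 * macro defined just below, and only one of the allocations is
	 * shown):
	 *
	 *	firstaddr = 0;			pass 1: "v" only advances
	 *	v = (caddr_t)firstaddr;
	 *	valloc(buf, struct buf, nbuf);	v += nbuf * sizeof(struct buf)
	 *	size = v - firstaddr;		total bytes required
	 *	firstaddr = kmem_alloc(...);	back the range with real KVA
	 *	goto again;			pass 2: the same walk now
	 *					assigns the final addresses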
	 */
	firstaddr = 0;
again:
	v = (caddr_t)firstaddr;

#define	valloc(name, type, num) \
	    (name) = (type *)v; v = (caddr_t)((name)+(num))
#define	valloclim(name, type, num, lim) \
	    (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))

	/*
	 * Calculate nbuf such that maxbufspace uses approximately 1/20
	 * of physical memory by default, with a minimum of 50 buffers.
	 *
	 * The calculation is made after discounting 128MB.
	 *
	 * NOTE: maxbufspace is (nbuf * NBUFCALCSIZE) (NBUFCALCSIZE ~= 16KB).
	 *	 nbuf = (kbytes / factor) would cover all of memory.
	 */
	if (nbuf == 0) {
		long factor = NBUFCALCSIZE / 1024;		/* KB/nbuf */
		long kbytes = physmem * (PAGE_SIZE / 1024);	/* physmem */

		nbuf = 50;
		if (kbytes > 128 * 1024)
			nbuf += (kbytes - 128 * 1024) / (factor * 20);
		if (maxbcache && nbuf > maxbcache / NBUFCALCSIZE)
			nbuf = maxbcache / NBUFCALCSIZE;
		if ((size_t)nbuf * sizeof(struct buf) > MAXBUFSTRUCTSIZE) {
			kprintf("Warning: nbuf capped at %ld due to the "
				"reasonability limit\n", nbuf);
			nbuf = MAXBUFSTRUCTSIZE / sizeof(struct buf);
		}
	}

	/*
	 * Do not allow the buffer_map to be more than 1/2 the size of the
	 * kernel_map.
	 */
	if (nbuf > (virtual_end - virtual_start +
		    virtual2_end - virtual2_start) / (MAXBSIZE * 2)) {
		nbuf = (virtual_end - virtual_start +
			virtual2_end - virtual2_start) / (MAXBSIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to kvm\n", nbuf);
	}

	/*
	 * Do not allow the buffer_map to use more than 50% of available
	 * physical-equivalent memory.  Since the VM pages which back
	 * individual buffers are typically wired, having too many bufs
	 * can prevent the system from paging properly.
	 */
	if (nbuf > physmem * PAGE_SIZE / (NBUFCALCSIZE * 2)) {
		nbuf = physmem * PAGE_SIZE / (NBUFCALCSIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to physmem\n", nbuf);
	}

	/*
	 * Do not allow the sizeof(struct buf) * nbuf to exceed 1/4 of
	 * the valloc space which is just the virtual_end - virtual_start
	 * section.  This is typically ~2GB regardless of the amount of
	 * memory, so we use 500MB as a metric.
	 *
	 * This is because we use valloc() to allocate the buf header array.
	 *
	 * NOTE: buffer space in bytes is limited by vfs.*bufspace sysctls.
	 */
	if (nbuf > (virtual_end - virtual_start) / (sizeof(struct buf) * 4)) {
		nbuf = (virtual_end - virtual_start) /
		       (sizeof(struct buf) * 4);
		kprintf("Warning: nbufs capped at %ld due to "
			"valloc considerations\n",
			nbuf);
	}

	nswbuf_mem = lmax(lmin(nbuf / 32, 512), 8);
#ifdef NSWBUF_MIN
	if (nswbuf_mem < NSWBUF_MIN)
		nswbuf_mem = NSWBUF_MIN;
#endif
	nswbuf_kva = lmax(lmin(nbuf / 4, 512), 16);
#ifdef NSWBUF_MIN
	if (nswbuf_kva < NSWBUF_MIN)
		nswbuf_kva = NSWBUF_MIN;
#endif

	valloc(swbuf_mem, struct buf, nswbuf_mem);
	valloc(swbuf_kva, struct buf, nswbuf_kva);
	valloc(buf, struct buf, nbuf);

	/*
	 * End of first pass, size has been calculated so allocate memory
	 */
	if (firstaddr == 0) {
		size = (vm_size_t)(v - firstaddr);
		firstaddr = kmem_alloc(kernel_map, round_page(size),
				       VM_SUBSYS_BUF);
		if (firstaddr == 0)
			panic("startup: no room for tables");
		goto again;
	}

	/*
	 * End of second pass, addresses have been assigned
	 *
	 * nbuf is an int, make sure we don't overflow the field.
	 *
	 * On 64-bit systems we always reserve maximal allocations for
	 * buffer cache buffers and there are no fragmentation issues,
	 * so the KVA segment does not have to be excessively oversized.
	 */
	if ((vm_size_t)(v - firstaddr) != size)
		panic("startup: table size inconsistency");

	kmem_suballoc(kernel_map, clean_map, &clean_sva, &clean_eva,
		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE) +
		      ((nswbuf_mem + nswbuf_kva) * MAXPHYS) + pager_map_size);
	kmem_suballoc(clean_map, buffer_map, &buffer_sva, &buffer_eva,
		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE));
	buffer_map->system_map = 1;
	kmem_suballoc(clean_map, pager_map, &pager_sva, &pager_eva,
		      ((vm_offset_t)(nswbuf_mem + nswbuf_kva) * MAXPHYS) +
		      pager_map_size);
	pager_map->system_map = 1;
	kprintf("avail memory = %ju (%ju MB)\n",
		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages),
		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages) /
		1024 / 1024);
}

struct cpu_idle_stat {
	int	hint;
	int	reserved;
	u_long	halt;
	u_long	spin;
	u_long	repeat;
	u_long	repeat_last;
	u_long	repeat_delta;
	u_long	mwait_cx[CPU_MWAIT_CX_MAX];
} __cachealign;

#define CPU_IDLE_STAT_HALT	-1
#define CPU_IDLE_STAT_SPIN	-2

static struct cpu_idle_stat	cpu_idle_stats[MAXCPU];

static int
sysctl_cpu_idle_cnt(SYSCTL_HANDLER_ARGS)
{
	int idx = arg2, cpu, error;
	u_long val = 0;

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].halt;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].spin;
	} else {
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].mwait_cx[idx];
	}

	error = sysctl_handle_quad(oidp, &val, 0, req);
	if (error || req->newptr == NULL)
		return error;

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].halt = 0;
		cpu_idle_stats[0].halt = val;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].spin = 0;
		cpu_idle_stats[0].spin = val;
	} else {
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].mwait_cx[idx] = 0;
		cpu_idle_stats[0].mwait_cx[idx] = val;
	}
	return 0;
}

static void
cpu_mwait_attach(void)
{
	struct sbuf sb;
	int hint_idx, i;

	if (!CPU_MWAIT_HAS_CX)
		return;

	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    (CPUID_TO_FAMILY(cpu_id) > 0xf ||
	     (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
	      CPUID_TO_MODEL(cpu_id) >= 0xf))) {
		int bm_sts = 1;

		/*
		 * Pentium dual-core, Core 2 and beyond do not need any
		 * additional activities to enter deep C-state, i.e. C3(+).
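		 *
		 * (The BM_ARB preamble requirement is dropped
		 * unconditionally below; the machdep.cpu.mwait.bm_sts
		 * loader tunable can waive the BM_STS check as well,
		 * e.g. with a loader.conf entry such as
		 *
		 *	machdep.cpu.mwait.bm_sts="0"
		 *
		 * which makes the TUNABLE_INT_FETCH() below call
		 * cpu_mwait_cx_no_bmsts().)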
		 */
		cpu_mwait_cx_no_bmarb();

		TUNABLE_INT_FETCH("machdep.cpu.mwait.bm_sts", &bm_sts);
		if (!bm_sts)
			cpu_mwait_cx_no_bmsts();
	}

	sbuf_new(&sb, cpu_mwait_cx_supported,
	    sizeof(cpu_mwait_cx_supported), SBUF_FIXEDLEN);

	for (i = 0; i < CPU_MWAIT_CX_MAX; ++i) {
		struct cpu_mwait_cx *cx = &cpu_mwait_cx_info[i];
		int sub;

		ksnprintf(cx->name, sizeof(cx->name), "C%d", i);

		sysctl_ctx_init(&cx->sysctl_ctx);
		cx->sysctl_tree = SYSCTL_ADD_NODE(&cx->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait), OID_AUTO,
		    cx->name, CTLFLAG_RW, NULL, "Cx control/info");
		if (cx->sysctl_tree == NULL)
			continue;

		cx->subcnt = CPUID_MWAIT_CX_SUBCNT(cpu_mwait_extemu, i);
		SYSCTL_ADD_INT(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "subcnt", CTLFLAG_RD, &cx->subcnt, 0,
		    "sub-state count");
		SYSCTL_ADD_PROC(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "entered", (CTLTYPE_QUAD | CTLFLAG_RW), 0,
		    i, sysctl_cpu_idle_cnt, "Q", "# of times entered");

		for (sub = 0; sub < cx->subcnt; ++sub)
			sbuf_printf(&sb, "C%d/%d ", i, sub);
	}
	sbuf_trim(&sb);
	sbuf_finish(&sb);

	/*
	 * Non-deep C-states
	 */
	cpu_mwait_c1_hints_cnt = cpu_mwait_cx_info[CPU_MWAIT_C1].subcnt;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i)
		cpu_mwait_hints_cnt += cpu_mwait_cx_info[i].subcnt;
	cpu_mwait_hints = kmalloc(sizeof(int) * cpu_mwait_hints_cnt,
	    M_DEVBUF, M_WAITOK);

	hint_idx = 0;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i) {
		int j, subcnt;

		subcnt = cpu_mwait_cx_info[i].subcnt;
		for (j = 0; j < subcnt; ++j) {
			KASSERT(hint_idx < cpu_mwait_hints_cnt,
			    ("invalid mwait hint index %d", hint_idx));
			cpu_mwait_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
			++hint_idx;
		}
	}
	KASSERT(hint_idx == cpu_mwait_hints_cnt,
	    ("mwait hint count %d != index %d",
	     cpu_mwait_hints_cnt, hint_idx));

	if (bootverbose) {
		kprintf("MWAIT hints (%d C1 hints):\n",
		    cpu_mwait_c1_hints_cnt);
		for (i = 0; i < cpu_mwait_hints_cnt; ++i) {
			int hint = cpu_mwait_hints[i];

			kprintf("  C%d/%d hint 0x%04x\n",
			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
			    hint);
		}
	}

	/*
	 * Deep C-states
	 */
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i)
		cpu_mwait_deep_hints_cnt += cpu_mwait_cx_info[i].subcnt;
	cpu_mwait_deep_hints = kmalloc(sizeof(int) * cpu_mwait_deep_hints_cnt,
	    M_DEVBUF, M_WAITOK);

	hint_idx = 0;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i) {
		int j, subcnt;

		subcnt = cpu_mwait_cx_info[i].subcnt;
		for (j = 0; j < subcnt; ++j) {
			KASSERT(hint_idx < cpu_mwait_deep_hints_cnt,
			    ("invalid mwait deep hint index %d", hint_idx));
			cpu_mwait_deep_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
			++hint_idx;
		}
	}
	KASSERT(hint_idx == cpu_mwait_deep_hints_cnt,
	    ("mwait deep hint count %d != index %d",
	     cpu_mwait_deep_hints_cnt, hint_idx));

	if (bootverbose) {
		kprintf("MWAIT deep hints:\n");
		for (i = 0; i < cpu_mwait_deep_hints_cnt; ++i) {
			int hint = cpu_mwait_deep_hints[i];

			kprintf("  C%d/%d hint 0x%04x\n",
			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
			    hint);
		}
	}
	cpu_idle_repeat_max = 256 * cpu_mwait_deep_hints_cnt;

	for (i = 0; i < ncpus; ++i) {
		char name[16];

		ksnprintf(name, sizeof(name), "idle%d", i);
		SYSCTL_ADD_PROC(NULL,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait_CX), OID_AUTO,
		    name, (CTLTYPE_STRING | CTLFLAG_RW), &cpu_idle_stats[i],
		    0, cpu_mwait_cx_pcpu_idle_sysctl, "A", "");
	}
}

static void
cpu_finish(void *dummy __unused)
{
	cpu_setregs();
	cpu_mwait_attach();
}

static void
pic_finish(void *dummy __unused)
{
	/* Log ELCR information */
	elcr_dump();

	/* Log MPTABLE information */
	mptable_pci_int_dump();

	/* Finalize PCI */
	MachIntrABI.finalize();
}

/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
void
sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = lp->lwp_proc;
	struct trapframe *regs;
	struct sigacts *psp = p->p_sigacts;
	struct sigframe sf, *sfp;
	int oonstack;
	char *sp;

	regs = lp->lwp_md.md_regs;
	oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;

	/* Save user context */
	bzero(&sf, sizeof(struct sigframe));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = lp->lwp_sigstk;
	sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
	KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0);
	/* gcc errors out on optimized bcopy */
	_bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe));

	/* Make the size of the saved context visible to userland */
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);

	/* Allocate and validate space for the signal handler context. */
	if ((lp->lwp_flags & LWP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size -
		     sizeof(struct sigframe);
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	} else {
		/* We take red zone into account */
		sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
	}

	/*
	 * XXX AVX needs 64-byte alignment but sigframe has other fields and
	 * the embedded ucontext is not at the front, so aligning this won't
	 * help us.  Fortunately we bcopy in/out of the sigframe, so the
	 * kernel is ok.
	 *
	 * The problem though is if userland winds up trying to use the
	 * context directly.
	 */
	sfp = (struct sigframe *)((intptr_t)sp & ~(intptr_t)0xF);

	/* Translate the signal if appropriate */
	if (p->p_sysent->sv_sigtbl) {
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
	}

	/*
	 * Build the argument list for the signal handler.
	 *
	 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
	 */
	regs->tf_rdi = sig;				/* argument 1 */
	regs->tf_rdx = (register_t)&sfp->sf_uc;		/* argument 3 */

	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/*
		 * Signal handler installed with SA_SIGINFO.
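		 * (In userland C terms the handler is entered roughly as
		 *	void action(int signo, siginfo_t *info, void *ucp);
		 * a sketch only; the exact types are per <signal.h>.)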
		 *
		 * action(signo, siginfo, ucontext)
		 */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* fill siginfo structure */
		sf.sf_si.si_signo = sig;
		sf.sf_si.si_pid = psp->ps_frominfo[sig].pid;
		sf.sf_si.si_uid = psp->ps_frominfo[sig].uid;
		sf.sf_si.si_code = code;
		sf.sf_si.si_addr = (void *)regs->tf_addr;
	} else {
		/*
		 * Old FreeBSD-style arguments.
		 *
		 * handler (signo, code, [uc], addr)
		 */
		regs->tf_rsi = (register_t)code;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_handler = catcher;
	}

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
#if 0 /* JG */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 =
		    &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}
#endif

	/*
	 * Save the FPU state and reinit the FP unit
	 */
	npxpush(&sf.sf_uc.uc_mcontext);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
		/*
		 * Something is wrong with the stack pointer.
		 * ...Kill the process.
		 */
		sigexit(lp, SIGILL);
	}

	regs->tf_rsp = (register_t)sfp;
	regs->tf_rip = trunc_page64(PS_STRINGS - *(p->p_sysent->sv_szsigcode));
	regs->tf_rip -= SZSIGCODE_EXTRA_BYTES;

	/*
	 * x86 abi specifies that the direction flag must be cleared
	 * on function entry
	 */
	regs->tf_rflags &= ~(PSL_T | PSL_D);

	/*
	 * 64 bit mode has a code and stack selector but
	 * no data or extra selector.  %fs and %gs are not
	 * stored in-context.
	 */
	regs->tf_cs = _ucodesel;
	regs->tf_ss = _udatasel;
	clear_quickret();
}

/*
 * Sanitize the trapframe for a virtual kernel passing control to a custom
 * VM context.  Remove any items that would otherwise create a privilege
 * issue.
 *
 * XXX at the moment we allow userland to set the resume flag.  Is this a
 * bad idea?
 */
int
cpu_sanitize_frame(struct trapframe *frame)
{
	frame->tf_cs = _ucodesel;
	frame->tf_ss = _udatasel;
	/* XXX VM (8086) mode not supported? */
	frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP);
	frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I;

	return(0);
}

/*
 * Sanitize the tls so loading the descriptor does not blow up
 * on us.  For x86_64 we don't have to do anything.
 */
int
cpu_sanitize_tls(struct savetls *tls)
{
	return(0);
}

/*
 * sigreturn(ucontext_t *sigcntxp)
 *
 * System call to clean up state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)

int
sys_sigreturn(struct sysmsg *sysmsg, const struct sigreturn_args *uap)
{
	struct lwp *lp = curthread->td_lwp;
	struct trapframe *regs;
	ucontext_t uc;
	ucontext_t *ucp;
	register_t rflags;
	int cs;
	int error;

	/*
	 * We have to copy the information into kernel space so userland
	 * can't modify it while we are sniffing it.
	 */
	regs = lp->lwp_md.md_regs;
	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error)
		return (error);
	ucp = &uc;
	rflags = ucp->uc_mcontext.mc_rflags;

	/* VM (8086) mode not supported */
	rflags &= ~PSL_VM_UNSUPP;

#if 0 /* JG */
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (lp->lwp_thread->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* go back to user mode if both flags are set */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(lp, SIGBUS, 0);

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = tf->tf_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
		tf->tf_gs = _udatasel;
	} else
#endif
	{
		/*
		 * Don't allow users to change privileged or reserved flags.
		 */
		/*
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
			kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags);
			return(EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		cs = ucp->uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			kprintf("sigreturn: cs = 0x%x\n", cs);
			trapsignal(lp, SIGBUS, T_PROTFLT);
			return(EINVAL);
		}
		/* gcc errors out on optimized bcopy */
		_bcopy(&ucp->uc_mcontext.mc_rdi, regs,
		       sizeof(struct trapframe));
	}

	/*
	 * Restore the FPU state from the frame
	 */
	crit_enter();
	npxpop(&ucp->uc_mcontext);

	if (ucp->uc_mcontext.mc_onstack & 1)
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	else
		lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;

	lp->lwp_sigmask = ucp->uc_sigmask;
	SIG_CANTMASK(lp->lwp_sigmask);
	clear_quickret();
	crit_exit();
	return(EJUSTRETURN);
}

/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */
void
cpu_boot(int howto)
{
}

/*
 * Shut down the CPU as much as possible
 */
void
cpu_halt(void)
{
	for (;;)
		__asm__ __volatile("hlt");
}

/*
 * cpu_idle() represents the idle LWKT.  You cannot return from this function
 * (unless you want to blow things up!).  Instead we look for runnable threads
 * and loop or halt as appropriate.  Giant is not held on entry to the thread.
 *
 * The main loop is entered with a critical section held, we must release
 * the critical section before doing anything else.  lwkt_switch() will
 * check for pending interrupts due to entering and exiting its own
 * critical section.
 *
 * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up.
 *	 However, there are cases where the idlethread will be entered with
 *	 the possibility that no IPI will occur and in such cases
 *	 lwkt_switch() sets TDF_IDLE_NOHLT.
 *
 * NOTE: cpu_idle_repeat determines how many entries into the idle thread
 *	 must occur before it starts using ACPI halt.
 *
 * NOTE: Value overridden in hammer_time().
 */
static int	cpu_idle_hlt = 2;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_repeat, CTLFLAG_RW,
    &cpu_idle_repeat, 0, "Idle entries before acpi hlt");

SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_hltcnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_HALT, sysctl_cpu_idle_cnt, "Q", "Idle loop entry halts");
SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_spincnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_SPIN, sysctl_cpu_idle_cnt, "Q", "Idle loop entry spins");

static void
cpu_idle_default_hook(void)
{
	/*
	 * We must guarantee that hlt is exactly the instruction
	 * following the sti.
	 */
	__asm __volatile("sti; hlt");
}

/* Other subsystems (e.g., ACPI) can hook this later.
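 *
 * A hooking sketch (hypothetical driver code; the function name here is
 * assumed, not an API defined in this file):
 *
 *	static void acpi_cst_c1_halt(void);
 *	...
 *	cpu_idle_hook = acpi_cst_c1_halt;
 *
 * The hook is entered with interrupts disabled and is expected to
 * re-enable them, as cpu_idle_default_hook() does via "sti; hlt".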
 */
void (*cpu_idle_hook)(void) = cpu_idle_default_hook;

static __inline int
cpu_mwait_cx_hint(struct cpu_idle_stat *stat)
{
	int hint, cx_idx;
	u_int idx;

	hint = stat->hint;
	if (hint >= 0)
		goto done;

	idx = (stat->repeat + stat->repeat_last + stat->repeat_delta) >>
	    cpu_mwait_repeat_shift;
	if (idx >= cpu_mwait_c1_hints_cnt) {
		/* Step up faster, once we walked through all C1 states */
		stat->repeat_delta += 1 << (cpu_mwait_repeat_shift + 1);
	}
	if (hint == CPU_MWAIT_HINT_AUTODEEP) {
		if (idx >= cpu_mwait_deep_hints_cnt)
			idx = cpu_mwait_deep_hints_cnt - 1;
		hint = cpu_mwait_deep_hints[idx];
	} else {
		if (idx >= cpu_mwait_hints_cnt)
			idx = cpu_mwait_hints_cnt - 1;
		hint = cpu_mwait_hints[idx];
	}
done:
	cx_idx = MWAIT_EAX_TO_CX(hint);
	if (cx_idx >= 0 && cx_idx < CPU_MWAIT_CX_MAX)
		stat->mwait_cx[cx_idx]++;
	return hint;
}

void
cpu_idle(void)
{
	globaldata_t gd = mycpu;
	struct cpu_idle_stat *stat = &cpu_idle_stats[gd->gd_cpuid];
	struct thread *td __debugvar = gd->gd_curthread;
	int reqflags;

	stat->repeat = stat->repeat_last = cpu_idle_repeat_max;

	crit_exit();
	KKASSERT(td->td_critcount == 0);

	for (;;) {
		/*
		 * See if there are any LWKTs ready to go.
		 */
		lwkt_switch();

		/*
		 * When halting inside a cli we must check for reqflags
		 * races, particularly [re]schedule requests.  Running
		 * splz() does the job.
		 *
		 * cpu_idle_hlt:
		 *	0	Never halt, just spin
		 *
		 *	1	Always use MONITOR/MWAIT if avail, HLT
		 *		otherwise.
		 *
		 *		Better default for modern (Haswell+) Intel
		 *		cpus.
		 *
		 *	2	Use HLT/MONITOR/MWAIT up to a point and then
		 *		use the ACPI halt (default).  This is a hybrid
		 *		approach.  See machdep.cpu_idle_repeat.
		 *
		 *		Better default for modern AMD cpus and older
		 *		Intel cpus.
		 *
		 *	3	Always use the ACPI halt.  This typically
		 *		eats the least amount of power but the cpu
		 *		will be slow waking up.  Slows down e.g.
		 *		compiles and other pipe/event oriented stuff.
		 *
		 *		Usually the best default for AMD cpus.
		 *
		 *	4	Always use HLT.
		 *
		 *	5	Always spin.
		 *
		 * NOTE: Interrupts are enabled and we are not in a critical
		 *	 section.
		 *
		 * NOTE: Preemptions do not reset gd_idle_repeat.  Also we
		 *	 don't bother capping gd_idle_repeat, it is ok if
		 *	 it overflows (we do make it unsigned, however).
		 *
		 * Implement optimized invltlb operations when halted
		 * in idle.  By setting the bit in smp_idleinvl_mask
		 * we inform other cpus that they can set _reqs to
		 * request an invltlb.  Currently the code to do that
		 * sets the bits in _reqs anyway, but then checks _mask
		 * to determine if it can assume the invltlb will execute.
		 *
		 * A critical section is required to ensure that interrupts
		 * do not fully run until after we've had a chance to execute
		 * the request.
		 */
		if (gd->gd_idle_repeat == 0) {
			stat->repeat = (stat->repeat + stat->repeat_last) >> 1;
			if (stat->repeat > cpu_idle_repeat_max)
				stat->repeat = cpu_idle_repeat_max;
			stat->repeat_last = 0;
			stat->repeat_delta = 0;
		}
		++stat->repeat_last;

		/*
		 * General idle thread halt code
		 *
		 * IBRS NOTES - IBRS is a SPECTRE mitigation.
		 *	      When going idle, disable IBRS to reduce
		 *	      hyperthread overhead.
		 */
		++gd->gd_idle_repeat;

		switch(cpu_idle_hlt) {
		default:
		case 0:
			/*
			 * Always spin
			 */
			;
do_spin:
			splz();
			__asm __volatile("sti");
			stat->spin++;
			crit_enter_gd(gd);
			crit_exit_gd(gd);
			break;
		case 2:
			/*
			 * Use MONITOR/MWAIT (or HLT) for a few cycles,
			 * then start using the ACPI halt code if we
			 * continue to be idle.
			 */
			if (gd->gd_idle_repeat >= cpu_idle_repeat)
				goto do_acpi;
			/* FALL THROUGH */
		case 1:
			/*
			 * Always use MONITOR/MWAIT (will use HLT if
			 * MONITOR/MWAIT not available).
			 */
			if (cpu_mi_feature & CPU_MI_MONITOR) {
				splz(); /* XXX */
				reqflags = gd->gd_reqflags;
				if (reqflags & RQF_IDLECHECK_WK_MASK)
					goto do_spin;
				crit_enter_gd(gd);
				ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask,
						     gd->gd_cpuid);
				/*
				 * IBRS/STIBP
				 */
				if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL,
					      pscpu->trampoline.tr_pcb_spec_ctrl[1] &
					      (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
				}
				cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
						  cpu_mwait_cx_hint(stat), 0);
				if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL,
					      pscpu->trampoline.tr_pcb_spec_ctrl[0] &
					      (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
				}
				stat->halt++;
				ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
						       gd->gd_cpuid);
				if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
							      gd->gd_cpuid)) {
					cpu_invltlb();
					cpu_mfence();
				}
				crit_exit_gd(gd);
				break;
			}
			/* FALLTHROUGH */
		case 4:
			/*
			 * Use HLT
			 */
			__asm __volatile("cli");
			splz();
			crit_enter_gd(gd);
			if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
				ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask,
						     gd->gd_cpuid);
				if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL,
					      pscpu->trampoline.tr_pcb_spec_ctrl[1] &
					      (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
				}
				cpu_idle_default_hook();
				if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL,
					      pscpu->trampoline.tr_pcb_spec_ctrl[0] &
					      (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
				}
				ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
						       gd->gd_cpuid);
				if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
							      gd->gd_cpuid)) {
					cpu_invltlb();
					cpu_mfence();
				}
			}
			__asm __volatile("sti");
			stat->halt++;
			crit_exit_gd(gd);
			break;
		case 3:
			/*
			 * Use ACPI halt
			 */
			;
do_acpi:
			__asm __volatile("cli");
			splz();
			crit_enter_gd(gd);
			if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
				ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask,
						     gd->gd_cpuid);
				if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL,
					      pscpu->trampoline.tr_pcb_spec_ctrl[1] &
					      (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
				}
				cpu_idle_hook();
				if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL,
					      pscpu->trampoline.tr_pcb_spec_ctrl[0] &
					      (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
				}
				ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
						       gd->gd_cpuid);
				if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
							      gd->gd_cpuid)) {
					cpu_invltlb();
					cpu_mfence();
				}
			}
			__asm __volatile("sti");
			stat->halt++;
			crit_exit_gd(gd);
			break;
		}
	}
}

/*
 * Called from deep ACPI via cpu_idle_hook() (see above) to actually halt
 * the cpu in C1.  ACPI might use other halt methods for deeper states
 * and not reach here.
 *
 * For now we always use HLT as we are not sure what ACPI may have actually
 * done.  MONITOR/MWAIT might not be appropriate.
 *
 * NOTE: MONITOR/MWAIT does not appear to throttle AMD cpus, while HLT
 *	 does.  On Intel, MONITOR/MWAIT does appear to throttle the cpu.
 */
void
cpu_idle_halt(void)
{
	globaldata_t gd;

	gd = mycpu;
#if 0
	/* DISABLED FOR NOW */
	struct cpu_idle_stat *stat;
	int reqflags;


	if ((cpu_idle_hlt == 1 || cpu_idle_hlt == 2) &&
	    (cpu_mi_feature & CPU_MI_MONITOR) &&
	    cpu_vendor_id != CPU_VENDOR_AMD) {
		/*
		 * Use MONITOR/MWAIT
		 *
		 * (NOTE: On ryzen, MWAIT does not throttle clocks, so we
		 *  have to use HLT)
		 */
		stat = &cpu_idle_stats[gd->gd_cpuid];
		reqflags = gd->gd_reqflags;
		if ((reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
			__asm __volatile("sti");
			cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
					  cpu_mwait_cx_hint(stat), 0);
		} else {
			__asm __volatile("sti; pause");
		}
	} else
#endif
	{
		/*
		 * Use HLT
		 */
		if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0)
			__asm __volatile("sti; hlt");
		else
			__asm __volatile("sti; pause");
	}
}


/*
 * Called in a loop indirectly via Xcpustop
 */
void
cpu_smp_stopped(void)
{
	globaldata_t gd = mycpu;
	volatile __uint64_t *ptr;
	__uint64_t ovalue;

	ptr = CPUMASK_ADDR(started_cpus, gd->gd_cpuid);
	ovalue = *ptr;
	if ((ovalue & CPUMASK_SIMPLE(gd->gd_cpuid & 63)) == 0) {
		if (cpu_mi_feature & CPU_MI_MONITOR) {
			if (cpu_mwait_hints) {
				cpu_mmw_pause_long(__DEVOLATILE(void *, ptr),
						   ovalue,
						   cpu_mwait_hints[
						       cpu_mwait_hints_cnt - 1],
						   0);
			} else {
				cpu_mmw_pause_long(__DEVOLATILE(void *, ptr),
						   ovalue, 0, 0);
			}
		} else {
			cpu_halt();	/* depend on lapic timer */
		}
	}
}

/*
 * This routine is called if a spinlock has been held through the
 * exponential backoff period and is seriously contested.  On a real cpu
 * we let it spin.
 */
void
cpu_spinlock_contested(void)
{
	cpu_pause();
}

/*
 * Clear registers on exec
 */
void
exec_setregs(u_long entry, u_long stack, u_long ps_strings)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct pcb *pcb = td->td_pcb;
	struct trapframe *regs = lp->lwp_md.md_regs;

	user_ldt_free(pcb);

	clear_quickret();
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = entry;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; /* align the stack */
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_rbx = ps_strings;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;	/* JG set bit 10?
					 */
		if (pcb == td->td_pcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		pcb->pcb_flags &= ~PCB_DBREGS;
	}

	/*
	 * Initialize the math emulator (if any) for the current process.
	 * Actually, just clear the bit that says that the emulator has
	 * been initialized.  Initialization is delayed until the process
	 * traps to the emulator (if it is done at all) mainly because
	 * emulators don't provide an entry point for initialization.
	 */
	pcb->pcb_flags &= ~FP_SOFTFP;

	/*
	 * NOTE: do not set CR0_TS here.  npxinit() must do it after clearing
	 *	 gd_npxthread.  Otherwise a preemptive interrupt thread
	 *	 may panic in npxdna().
	 */
	crit_enter();
	load_cr0(rcr0() | CR0_MP);

	/*
	 * NOTE: The MSR values must be correct so we can return to
	 *	 userland.  gd_user_fs/gs must be correct so the switch
	 *	 code knows what the current MSR values are.
	 */
	pcb->pcb_fsbase = 0;	/* Values loaded from PCB on switch */
	pcb->pcb_gsbase = 0;
	mdcpu->gd_user_fs = 0;	/* Cache of current MSR values */
	mdcpu->gd_user_gs = 0;
	wrmsr(MSR_FSBASE, 0);	/* Set MSR values for return to userland */
	wrmsr(MSR_KGSBASE, 0);

	/* Initialize the npx (if any) for the current process. */
	npxinit();
	crit_exit();

	pcb->pcb_ds = _udatasel;
	pcb->pcb_es = _udatasel;
	pcb->pcb_fs = _udatasel;
	pcb->pcb_gs = _udatasel;
}

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	cr0 |= CR0_NE;		/* Done by npxinit() */
	cr0 |= CR0_MP | CR0_TS;	/* Done at every execve() too.
				 */
	cr0 |= CR0_WP | CR0_AM;
	load_cr0(cr0);
	load_gs(_udatasel);
}

static int
sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
				  req);
	if (!error && req->newptr)
		resettodr();
	return (error);
}

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
	&adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
	CTLFLAG_RW, &disable_rtc_set, 0, "");

#if 0 /* JG */
SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
	CTLFLAG_RD, &bootinfo, bootinfo, "");
#endif

SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
	CTLFLAG_RW, &wall_cmos_clock, 0, "");

static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");

/*
 * Initialize x86 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

int _default_ldt;
struct user_segment_descriptor gdt_cpu0[MAXGDT_COUNT];
struct gate_descriptor idt_arr[MAXCPU][NIDT];
#if 0 /* JG */
union descriptor ldt[NLDT];		/* local descriptor table */
#endif

/* table descriptors - used to load tables by cpu */
struct region_descriptor r_gdt;
struct region_descriptor r_idt_arr[MAXCPU];

/* JG proc0paddr is a virtual address */
void *proc0paddr;
/* JG alignment?
 */
char proc0paddr_buff[LWKT_THREAD_STACK];


/* software prototypes -- in more palatable form */
struct soft_segment_descriptor gdt_segs[] = {
	/* GNULL_SEL	0 Null Descriptor */
	{	0x0,			/* segment base address */
		0x0,			/* length */
		0,			/* segment type */
		0,			/* segment descriptor priority level */
		0,			/* segment descriptor present */
		0,			/* long */
		0,			/* default 32 vs 16 bit size */
		0			/* limit granularity (byte/page units)*/ },
	/* GCODE_SEL	1 Code Descriptor for kernel */
	{	0x0,			/* segment base address */
		0xfffff,		/* length - all address space */
		SDT_MEMERA,		/* segment type */
		SEL_KPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		1,			/* long */
		0,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GDATA_SEL	2 Data Descriptor for kernel */
	{	0x0,			/* segment base address */
		0xfffff,		/* length - all address space */
		SDT_MEMRWA,		/* segment type */
		SEL_KPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		1,			/* long */
		0,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GUCODE32_SEL	3 32 bit Code Descriptor for user */
	{	0x0,			/* segment base address */
		0xfffff,		/* length - all address space */
		SDT_MEMERA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long */
		1,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GUDATA_SEL	4 32/64 bit Data Descriptor for user */
	{	0x0,			/* segment base address */
		0xfffff,		/* length - all address space */
		SDT_MEMRWA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long */
		1,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GUCODE_SEL	5 64 bit Code Descriptor for user */
	{	0x0,			/* segment base address */
		0xfffff,		/* length - all address space */
		SDT_MEMERA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		1,			/* long */
		0,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
	/* GPROC0_SEL	6 Proc 0 Tss Descriptor */
	{
		0x0,			/* segment base address */
		sizeof(struct x86_64tss)-1,/* length - all address space */
		SDT_SYSTSS,		/* segment type */
		SEL_KPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long */
		0,			/* unused - default 32 vs 16 bit size */
		0			/* limit granularity (byte/page units)*/ },
	/* Actually, the TSS is a system descriptor which is double size */
	{	0x0,			/* segment base address */
		0x0,			/* length */
		0,			/* segment type */
		0,			/* segment descriptor priority level */
		0,			/* segment descriptor present */
		0,			/* long */
		0,			/* default 32 vs 16 bit size */
		0			/* limit granularity (byte/page units)*/ },
	/* GUGS32_SEL	8 32 bit GS Descriptor for user */
	{	0x0,			/* segment base address */
		0xfffff,		/* length - all address space */
		SDT_MEMRWA,		/* segment type */
		SEL_UPL,		/* segment descriptor priority level */
		1,			/* segment descriptor present */
		0,			/* long */
		1,			/* default 32 vs 16 bit size */
		1			/* limit granularity (byte/page units)*/ },
};

void
setidt_global(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	int cpu;

	for (cpu = 0; cpu < MAXCPU; ++cpu) {
		struct gate_descriptor *ip = &idt_arr[cpu][idx];

		ip->gd_looffset = (uintptr_t)func;
		ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
		ip->gd_ist = ist;
		ip->gd_xx = 0;
		ip->gd_type = typ;
		ip->gd_dpl = dpl;
		ip->gd_p = 1;
		ip->gd_hioffset = ((uintptr_t)func)>>16 ;
	}
}

void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist, int cpu)
{
	struct gate_descriptor *ip;

	KASSERT(cpu >= 0 && cpu < ncpus, ("invalid cpu %d", cpu));

	ip = &idt_arr[cpu][idx];
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
}

#define	IDTVEC(name)	__CONCAT(X,name)

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);

extern inthand_t
	IDTVEC(rsvd00), IDTVEC(rsvd01), IDTVEC(rsvd02), IDTVEC(rsvd03),
	IDTVEC(rsvd04), IDTVEC(rsvd05), IDTVEC(rsvd06), IDTVEC(rsvd07),
	IDTVEC(rsvd08), IDTVEC(rsvd09), IDTVEC(rsvd0a), IDTVEC(rsvd0b),
	IDTVEC(rsvd0c), IDTVEC(rsvd0d), IDTVEC(rsvd0e), IDTVEC(rsvd0f),
	IDTVEC(rsvd10), IDTVEC(rsvd11), IDTVEC(rsvd12), IDTVEC(rsvd13),
	IDTVEC(rsvd14), IDTVEC(rsvd15), IDTVEC(rsvd16), IDTVEC(rsvd17),
	IDTVEC(rsvd18), IDTVEC(rsvd19), IDTVEC(rsvd1a), IDTVEC(rsvd1b),
	IDTVEC(rsvd1c), IDTVEC(rsvd1d), IDTVEC(rsvd1e), IDTVEC(rsvd1f),
	IDTVEC(rsvd20), IDTVEC(rsvd21), IDTVEC(rsvd22), IDTVEC(rsvd23),
	IDTVEC(rsvd24), IDTVEC(rsvd25), IDTVEC(rsvd26), IDTVEC(rsvd27),
	IDTVEC(rsvd28), IDTVEC(rsvd29), IDTVEC(rsvd2a), IDTVEC(rsvd2b),
	IDTVEC(rsvd2c), IDTVEC(rsvd2d), IDTVEC(rsvd2e), IDTVEC(rsvd2f),
	IDTVEC(rsvd30), IDTVEC(rsvd31), IDTVEC(rsvd32), IDTVEC(rsvd33),
	IDTVEC(rsvd34), IDTVEC(rsvd35), IDTVEC(rsvd36), IDTVEC(rsvd37),
	IDTVEC(rsvd38), IDTVEC(rsvd39), IDTVEC(rsvd3a), IDTVEC(rsvd3b),
	IDTVEC(rsvd3c), IDTVEC(rsvd3d), IDTVEC(rsvd3e), IDTVEC(rsvd3f),
	IDTVEC(rsvd40), IDTVEC(rsvd41), IDTVEC(rsvd42), IDTVEC(rsvd43),
	IDTVEC(rsvd44), IDTVEC(rsvd45), IDTVEC(rsvd46), IDTVEC(rsvd47),
	IDTVEC(rsvd48), IDTVEC(rsvd49), IDTVEC(rsvd4a), IDTVEC(rsvd4b),
	IDTVEC(rsvd4c), IDTVEC(rsvd4d), IDTVEC(rsvd4e), IDTVEC(rsvd4f),
	IDTVEC(rsvd50), IDTVEC(rsvd51), IDTVEC(rsvd52), IDTVEC(rsvd53),
	IDTVEC(rsvd54), IDTVEC(rsvd55), IDTVEC(rsvd56), IDTVEC(rsvd57),
	IDTVEC(rsvd58), IDTVEC(rsvd59), IDTVEC(rsvd5a), IDTVEC(rsvd5b),
	IDTVEC(rsvd5c), IDTVEC(rsvd5d), IDTVEC(rsvd5e), IDTVEC(rsvd5f),
	IDTVEC(rsvd60), IDTVEC(rsvd61), IDTVEC(rsvd62), IDTVEC(rsvd63),
	IDTVEC(rsvd64), IDTVEC(rsvd65), IDTVEC(rsvd66), IDTVEC(rsvd67),
	IDTVEC(rsvd68), IDTVEC(rsvd69), IDTVEC(rsvd6a), IDTVEC(rsvd6b),
	IDTVEC(rsvd6c), IDTVEC(rsvd6d), IDTVEC(rsvd6e), IDTVEC(rsvd6f),
	IDTVEC(rsvd70), IDTVEC(rsvd71), IDTVEC(rsvd72), IDTVEC(rsvd73),
	IDTVEC(rsvd74), IDTVEC(rsvd75), IDTVEC(rsvd76), IDTVEC(rsvd77),
	IDTVEC(rsvd78), IDTVEC(rsvd79), IDTVEC(rsvd7a), IDTVEC(rsvd7b),
IDTVEC(rsvd7c), IDTVEC(rsvd7d), IDTVEC(rsvd7e), IDTVEC(rsvd7f), 1800 IDTVEC(rsvd80), IDTVEC(rsvd81), IDTVEC(rsvd82), IDTVEC(rsvd83), 1801 IDTVEC(rsvd84), IDTVEC(rsvd85), IDTVEC(rsvd86), IDTVEC(rsvd87), 1802 IDTVEC(rsvd88), IDTVEC(rsvd89), IDTVEC(rsvd8a), IDTVEC(rsvd8b), 1803 IDTVEC(rsvd8c), IDTVEC(rsvd8d), IDTVEC(rsvd8e), IDTVEC(rsvd8f), 1804 IDTVEC(rsvd90), IDTVEC(rsvd91), IDTVEC(rsvd92), IDTVEC(rsvd93), 1805 IDTVEC(rsvd94), IDTVEC(rsvd95), IDTVEC(rsvd96), IDTVEC(rsvd97), 1806 IDTVEC(rsvd98), IDTVEC(rsvd99), IDTVEC(rsvd9a), IDTVEC(rsvd9b), 1807 IDTVEC(rsvd9c), IDTVEC(rsvd9d), IDTVEC(rsvd9e), IDTVEC(rsvd9f), 1808 IDTVEC(rsvda0), IDTVEC(rsvda1), IDTVEC(rsvda2), IDTVEC(rsvda3), 1809 IDTVEC(rsvda4), IDTVEC(rsvda5), IDTVEC(rsvda6), IDTVEC(rsvda7), 1810 IDTVEC(rsvda8), IDTVEC(rsvda9), IDTVEC(rsvdaa), IDTVEC(rsvdab), 1811 IDTVEC(rsvdac), IDTVEC(rsvdad), IDTVEC(rsvdae), IDTVEC(rsvdaf), 1812 IDTVEC(rsvdb0), IDTVEC(rsvdb1), IDTVEC(rsvdb2), IDTVEC(rsvdb3), 1813 IDTVEC(rsvdb4), IDTVEC(rsvdb5), IDTVEC(rsvdb6), IDTVEC(rsvdb7), 1814 IDTVEC(rsvdb8), IDTVEC(rsvdb9), IDTVEC(rsvdba), IDTVEC(rsvdbb), 1815 IDTVEC(rsvdbc), IDTVEC(rsvdbd), IDTVEC(rsvdbe), IDTVEC(rsvdbf), 1816 IDTVEC(rsvdc0), IDTVEC(rsvdc1), IDTVEC(rsvdc2), IDTVEC(rsvdc3), 1817 IDTVEC(rsvdc4), IDTVEC(rsvdc5), IDTVEC(rsvdc6), IDTVEC(rsvdc7), 1818 IDTVEC(rsvdc8), IDTVEC(rsvdc9), IDTVEC(rsvdca), IDTVEC(rsvdcb), 1819 IDTVEC(rsvdcc), IDTVEC(rsvdcd), IDTVEC(rsvdce), IDTVEC(rsvdcf), 1820 IDTVEC(rsvdd0), IDTVEC(rsvdd1), IDTVEC(rsvdd2), IDTVEC(rsvdd3), 1821 IDTVEC(rsvdd4), IDTVEC(rsvdd5), IDTVEC(rsvdd6), IDTVEC(rsvdd7), 1822 IDTVEC(rsvdd8), IDTVEC(rsvdd9), IDTVEC(rsvdda), IDTVEC(rsvddb), 1823 IDTVEC(rsvddc), IDTVEC(rsvddd), IDTVEC(rsvdde), IDTVEC(rsvddf), 1824 IDTVEC(rsvde0), IDTVEC(rsvde1), IDTVEC(rsvde2), IDTVEC(rsvde3), 1825 IDTVEC(rsvde4), IDTVEC(rsvde5), IDTVEC(rsvde6), IDTVEC(rsvde7), 1826 IDTVEC(rsvde8), IDTVEC(rsvde9), IDTVEC(rsvdea), IDTVEC(rsvdeb), 1827 IDTVEC(rsvdec), IDTVEC(rsvded), IDTVEC(rsvdee), IDTVEC(rsvdef), 1828 IDTVEC(rsvdf0), IDTVEC(rsvdf1), IDTVEC(rsvdf2), IDTVEC(rsvdf3), 1829 IDTVEC(rsvdf4), IDTVEC(rsvdf5), IDTVEC(rsvdf6), IDTVEC(rsvdf7), 1830 IDTVEC(rsvdf8), IDTVEC(rsvdf9), IDTVEC(rsvdfa), IDTVEC(rsvdfb), 1831 IDTVEC(rsvdfc), IDTVEC(rsvdfd), IDTVEC(rsvdfe), IDTVEC(rsvdff); 1832 1833 inthand_t *rsvdary[NIDT] = { 1834 &IDTVEC(rsvd00), &IDTVEC(rsvd01), &IDTVEC(rsvd02), &IDTVEC(rsvd03), 1835 &IDTVEC(rsvd04), &IDTVEC(rsvd05), &IDTVEC(rsvd06), &IDTVEC(rsvd07), 1836 &IDTVEC(rsvd08), &IDTVEC(rsvd09), &IDTVEC(rsvd0a), &IDTVEC(rsvd0b), 1837 &IDTVEC(rsvd0c), &IDTVEC(rsvd0d), &IDTVEC(rsvd0e), &IDTVEC(rsvd0f), 1838 &IDTVEC(rsvd10), &IDTVEC(rsvd11), &IDTVEC(rsvd12), &IDTVEC(rsvd13), 1839 &IDTVEC(rsvd14), &IDTVEC(rsvd15), &IDTVEC(rsvd16), &IDTVEC(rsvd17), 1840 &IDTVEC(rsvd18), &IDTVEC(rsvd19), &IDTVEC(rsvd1a), &IDTVEC(rsvd1b), 1841 &IDTVEC(rsvd1c), &IDTVEC(rsvd1d), &IDTVEC(rsvd1e), &IDTVEC(rsvd1f), 1842 &IDTVEC(rsvd20), &IDTVEC(rsvd21), &IDTVEC(rsvd22), &IDTVEC(rsvd23), 1843 &IDTVEC(rsvd24), &IDTVEC(rsvd25), &IDTVEC(rsvd26), &IDTVEC(rsvd27), 1844 &IDTVEC(rsvd28), &IDTVEC(rsvd29), &IDTVEC(rsvd2a), &IDTVEC(rsvd2b), 1845 &IDTVEC(rsvd2c), &IDTVEC(rsvd2d), &IDTVEC(rsvd2e), &IDTVEC(rsvd2f), 1846 &IDTVEC(rsvd30), &IDTVEC(rsvd31), &IDTVEC(rsvd32), &IDTVEC(rsvd33), 1847 &IDTVEC(rsvd34), &IDTVEC(rsvd35), &IDTVEC(rsvd36), &IDTVEC(rsvd37), 1848 &IDTVEC(rsvd38), &IDTVEC(rsvd39), &IDTVEC(rsvd3a), &IDTVEC(rsvd3b), 1849 &IDTVEC(rsvd3c), &IDTVEC(rsvd3d), &IDTVEC(rsvd3e), &IDTVEC(rsvd3f), 1850 &IDTVEC(rsvd40), &IDTVEC(rsvd41), &IDTVEC(rsvd42), &IDTVEC(rsvd43), 
1851 &IDTVEC(rsvd44), &IDTVEC(rsvd45), &IDTVEC(rsvd46), &IDTVEC(rsvd47), 1852 &IDTVEC(rsvd48), &IDTVEC(rsvd49), &IDTVEC(rsvd4a), &IDTVEC(rsvd4b), 1853 &IDTVEC(rsvd4c), &IDTVEC(rsvd4d), &IDTVEC(rsvd4e), &IDTVEC(rsvd4f), 1854 &IDTVEC(rsvd50), &IDTVEC(rsvd51), &IDTVEC(rsvd52), &IDTVEC(rsvd53), 1855 &IDTVEC(rsvd54), &IDTVEC(rsvd55), &IDTVEC(rsvd56), &IDTVEC(rsvd57), 1856 &IDTVEC(rsvd58), &IDTVEC(rsvd59), &IDTVEC(rsvd5a), &IDTVEC(rsvd5b), 1857 &IDTVEC(rsvd5c), &IDTVEC(rsvd5d), &IDTVEC(rsvd5e), &IDTVEC(rsvd5f), 1858 &IDTVEC(rsvd60), &IDTVEC(rsvd61), &IDTVEC(rsvd62), &IDTVEC(rsvd63), 1859 &IDTVEC(rsvd64), &IDTVEC(rsvd65), &IDTVEC(rsvd66), &IDTVEC(rsvd67), 1860 &IDTVEC(rsvd68), &IDTVEC(rsvd69), &IDTVEC(rsvd6a), &IDTVEC(rsvd6b), 1861 &IDTVEC(rsvd6c), &IDTVEC(rsvd6d), &IDTVEC(rsvd6e), &IDTVEC(rsvd6f), 1862 &IDTVEC(rsvd70), &IDTVEC(rsvd71), &IDTVEC(rsvd72), &IDTVEC(rsvd73), 1863 &IDTVEC(rsvd74), &IDTVEC(rsvd75), &IDTVEC(rsvd76), &IDTVEC(rsvd77), 1864 &IDTVEC(rsvd78), &IDTVEC(rsvd79), &IDTVEC(rsvd7a), &IDTVEC(rsvd7b), 1865 &IDTVEC(rsvd7c), &IDTVEC(rsvd7d), &IDTVEC(rsvd7e), &IDTVEC(rsvd7f), 1866 &IDTVEC(rsvd80), &IDTVEC(rsvd81), &IDTVEC(rsvd82), &IDTVEC(rsvd83), 1867 &IDTVEC(rsvd84), &IDTVEC(rsvd85), &IDTVEC(rsvd86), &IDTVEC(rsvd87), 1868 &IDTVEC(rsvd88), &IDTVEC(rsvd89), &IDTVEC(rsvd8a), &IDTVEC(rsvd8b), 1869 &IDTVEC(rsvd8c), &IDTVEC(rsvd8d), &IDTVEC(rsvd8e), &IDTVEC(rsvd8f), 1870 &IDTVEC(rsvd90), &IDTVEC(rsvd91), &IDTVEC(rsvd92), &IDTVEC(rsvd93), 1871 &IDTVEC(rsvd94), &IDTVEC(rsvd95), &IDTVEC(rsvd96), &IDTVEC(rsvd97), 1872 &IDTVEC(rsvd98), &IDTVEC(rsvd99), &IDTVEC(rsvd9a), &IDTVEC(rsvd9b), 1873 &IDTVEC(rsvd9c), &IDTVEC(rsvd9d), &IDTVEC(rsvd9e), &IDTVEC(rsvd9f), 1874 &IDTVEC(rsvda0), &IDTVEC(rsvda1), &IDTVEC(rsvda2), &IDTVEC(rsvda3), 1875 &IDTVEC(rsvda4), &IDTVEC(rsvda5), &IDTVEC(rsvda6), &IDTVEC(rsvda7), 1876 &IDTVEC(rsvda8), &IDTVEC(rsvda9), &IDTVEC(rsvdaa), &IDTVEC(rsvdab), 1877 &IDTVEC(rsvdac), &IDTVEC(rsvdad), &IDTVEC(rsvdae), &IDTVEC(rsvdaf), 1878 &IDTVEC(rsvdb0), &IDTVEC(rsvdb1), &IDTVEC(rsvdb2), &IDTVEC(rsvdb3), 1879 &IDTVEC(rsvdb4), &IDTVEC(rsvdb5), &IDTVEC(rsvdb6), &IDTVEC(rsvdb7), 1880 &IDTVEC(rsvdb8), &IDTVEC(rsvdb9), &IDTVEC(rsvdba), &IDTVEC(rsvdbb), 1881 &IDTVEC(rsvdbc), &IDTVEC(rsvdbd), &IDTVEC(rsvdbe), &IDTVEC(rsvdbf), 1882 &IDTVEC(rsvdc0), &IDTVEC(rsvdc1), &IDTVEC(rsvdc2), &IDTVEC(rsvdc3), 1883 &IDTVEC(rsvdc4), &IDTVEC(rsvdc5), &IDTVEC(rsvdc6), &IDTVEC(rsvdc7), 1884 &IDTVEC(rsvdc8), &IDTVEC(rsvdc9), &IDTVEC(rsvdca), &IDTVEC(rsvdcb), 1885 &IDTVEC(rsvdcc), &IDTVEC(rsvdcd), &IDTVEC(rsvdce), &IDTVEC(rsvdcf), 1886 &IDTVEC(rsvdd0), &IDTVEC(rsvdd1), &IDTVEC(rsvdd2), &IDTVEC(rsvdd3), 1887 &IDTVEC(rsvdd4), &IDTVEC(rsvdd5), &IDTVEC(rsvdd6), &IDTVEC(rsvdd7), 1888 &IDTVEC(rsvdd8), &IDTVEC(rsvdd9), &IDTVEC(rsvdda), &IDTVEC(rsvddb), 1889 &IDTVEC(rsvddc), &IDTVEC(rsvddd), &IDTVEC(rsvdde), &IDTVEC(rsvddf), 1890 &IDTVEC(rsvde0), &IDTVEC(rsvde1), &IDTVEC(rsvde2), &IDTVEC(rsvde3), 1891 &IDTVEC(rsvde4), &IDTVEC(rsvde5), &IDTVEC(rsvde6), &IDTVEC(rsvde7), 1892 &IDTVEC(rsvde8), &IDTVEC(rsvde9), &IDTVEC(rsvdea), &IDTVEC(rsvdeb), 1893 &IDTVEC(rsvdec), &IDTVEC(rsvded), &IDTVEC(rsvdee), &IDTVEC(rsvdef), 1894 &IDTVEC(rsvdf0), &IDTVEC(rsvdf1), &IDTVEC(rsvdf2), &IDTVEC(rsvdf3), 1895 &IDTVEC(rsvdf4), &IDTVEC(rsvdf5), &IDTVEC(rsvdf6), &IDTVEC(rsvdf7), 1896 &IDTVEC(rsvdf8), &IDTVEC(rsvdf9), &IDTVEC(rsvdfa), &IDTVEC(rsvdfb), 1897 &IDTVEC(rsvdfc), &IDTVEC(rsvdfd), &IDTVEC(rsvdfe), &IDTVEC(rsvdff) 1898 }; 1899 1900 void 1901 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd) 1902 { 1903 
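/*
 * Convert a hardware (packed) user segment descriptor into its flat
 * software form: the base and limit are split across the lobase/hibase
 * and lolimit/hilimit fields, so reassemble them before copying the
 * attribute bits over.
 */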
ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; 1904 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; 1905 ssd->ssd_type = sd->sd_type; 1906 ssd->ssd_dpl = sd->sd_dpl; 1907 ssd->ssd_p = sd->sd_p; 1908 ssd->ssd_def32 = sd->sd_def32; 1909 ssd->ssd_gran = sd->sd_gran; 1910 } 1911 1912 void 1913 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd) 1914 { 1915 1916 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 1917 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff; 1918 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 1919 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 1920 sd->sd_type = ssd->ssd_type; 1921 sd->sd_dpl = ssd->ssd_dpl; 1922 sd->sd_p = ssd->ssd_p; 1923 sd->sd_long = ssd->ssd_long; 1924 sd->sd_def32 = ssd->ssd_def32; 1925 sd->sd_gran = ssd->ssd_gran; 1926 } 1927 1928 void 1929 ssdtosyssd(struct soft_segment_descriptor *ssd, 1930 struct system_segment_descriptor *sd) 1931 { 1932 1933 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 1934 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful; 1935 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 1936 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 1937 sd->sd_type = ssd->ssd_type; 1938 sd->sd_dpl = ssd->ssd_dpl; 1939 sd->sd_p = ssd->ssd_p; 1940 sd->sd_gran = ssd->ssd_gran; 1941 } 1942 1943 /* 1944 * Populate the (physmap) array with base/bound pairs describing the 1945 * available physical memory in the system, then test this memory and 1946 * build the phys_avail array describing the actually-available memory. 1947 * 1948 * If we cannot accurately determine the physical memory map, then use 1949 * value from the 0xE801 call, and failing that, the RTC. 1950 * 1951 * Total memory size may be set by the kernel environment variable 1952 * hw.physmem or the compile-time define MAXMEM. 1953 * 1954 * Memory is aligned to PHYSMAP_ALIGN which must be a multiple 1955 * of PAGE_SIZE. This also greatly reduces the memory test time 1956 * which would otherwise be excessive on machines with > 8G of ram. 1957 * 1958 * XXX first should be vm_paddr_t. 1959 */ 1960 1961 #define PHYSMAP_ALIGN (vm_paddr_t)(128 * 1024) 1962 #define PHYSMAP_ALIGN_MASK (vm_paddr_t)(PHYSMAP_ALIGN - 1) 1963 #define PHYSMAP_SIZE VM_PHYSSEG_MAX 1964 1965 vm_paddr_t physmap[PHYSMAP_SIZE]; 1966 struct bios_smap *smapbase, *smap, *smapend; 1967 struct efi_map_header *efihdrbase; 1968 u_int32_t smapsize; 1969 1970 #define PHYSMAP_HANDWAVE (vm_paddr_t)(2 * 1024 * 1024) 1971 #define PHYSMAP_HANDWAVE_MASK (PHYSMAP_HANDWAVE - 1) 1972 1973 static void 1974 add_smap_entries(int *physmap_idx) 1975 { 1976 int i; 1977 1978 smapsize = *((u_int32_t *)smapbase - 1); 1979 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); 1980 1981 for (smap = smapbase; smap < smapend; smap++) { 1982 if (boothowto & RB_VERBOSE) 1983 kprintf("SMAP type=%02x base=%016lx len=%016lx\n", 1984 smap->type, smap->base, smap->length); 1985 1986 if (smap->type != SMAP_TYPE_MEMORY) 1987 continue; 1988 1989 if (smap->length == 0) 1990 continue; 1991 1992 for (i = 0; i <= *physmap_idx; i += 2) { 1993 if (smap->base < physmap[i + 1]) { 1994 if (boothowto & RB_VERBOSE) { 1995 kprintf("Overlapping or non-monotonic " 1996 "memory region, ignoring " 1997 "second region\n"); 1998 } 1999 break; 2000 } 2001 } 2002 if (i <= *physmap_idx) 2003 continue; 2004 2005 Realmem += smap->length; 2006 2007 /* 2008 * NOTE: This little bit of code initially expands 2009 * physmap[1] as well as later entries. 
2010 */ 2011 if (smap->base == physmap[*physmap_idx + 1]) { 2012 physmap[*physmap_idx + 1] += smap->length; 2013 continue; 2014 } 2015 2016 *physmap_idx += 2; 2017 if (*physmap_idx == PHYSMAP_SIZE) { 2018 kprintf("Too many segments in the physical " 2019 "address map, giving up\n"); 2020 break; 2021 } 2022 physmap[*physmap_idx] = smap->base; 2023 physmap[*physmap_idx + 1] = smap->base + smap->length; 2024 } 2025 } 2026 2027 static void 2028 add_efi_map_entries(int *physmap_idx) 2029 { 2030 struct efi_md *map, *p; 2031 const char *type; 2032 size_t efisz; 2033 int i, ndesc; 2034 2035 static const char *types[] = { 2036 "Reserved", 2037 "LoaderCode", 2038 "LoaderData", 2039 "BootServicesCode", 2040 "BootServicesData", 2041 "RuntimeServicesCode", 2042 "RuntimeServicesData", 2043 "ConventionalMemory", 2044 "UnusableMemory", 2045 "ACPIReclaimMemory", 2046 "ACPIMemoryNVS", 2047 "MemoryMappedIO", 2048 "MemoryMappedIOPortSpace", 2049 "PalCode" 2050 }; 2051 2052 /* 2053 * Memory map data provided by UEFI via the GetMemoryMap 2054 * Boot Services API. 2055 */ 2056 efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf; 2057 map = (struct efi_md *)((uint8_t *)efihdrbase + efisz); 2058 2059 if (efihdrbase->descriptor_size == 0) 2060 return; 2061 ndesc = efihdrbase->memory_size / efihdrbase->descriptor_size; 2062 2063 if (boothowto & RB_VERBOSE) 2064 kprintf("%23s %12s %12s %8s %4s\n", 2065 "Type", "Physical", "Virtual", "#Pages", "Attr"); 2066 2067 for (i = 0, p = map; i < ndesc; i++, 2068 p = efi_next_descriptor(p, efihdrbase->descriptor_size)) { 2069 if (boothowto & RB_VERBOSE) { 2070 if (p->md_type <= EFI_MD_TYPE_PALCODE) 2071 type = types[p->md_type]; 2072 else 2073 type = "<INVALID>"; 2074 kprintf("%23s %012lx %12p %08lx ", type, p->md_phys, 2075 p->md_virt, p->md_pages); 2076 if (p->md_attr & EFI_MD_ATTR_UC) 2077 kprintf("UC "); 2078 if (p->md_attr & EFI_MD_ATTR_WC) 2079 kprintf("WC "); 2080 if (p->md_attr & EFI_MD_ATTR_WT) 2081 kprintf("WT "); 2082 if (p->md_attr & EFI_MD_ATTR_WB) 2083 kprintf("WB "); 2084 if (p->md_attr & EFI_MD_ATTR_UCE) 2085 kprintf("UCE "); 2086 if (p->md_attr & EFI_MD_ATTR_WP) 2087 kprintf("WP "); 2088 if (p->md_attr & EFI_MD_ATTR_RP) 2089 kprintf("RP "); 2090 if (p->md_attr & EFI_MD_ATTR_XP) 2091 kprintf("XP "); 2092 if (p->md_attr & EFI_MD_ATTR_RT) 2093 kprintf("RUNTIME"); 2094 kprintf("\n"); 2095 } 2096 2097 switch (p->md_type) { 2098 case EFI_MD_TYPE_CODE: 2099 case EFI_MD_TYPE_DATA: 2100 case EFI_MD_TYPE_BS_CODE: 2101 case EFI_MD_TYPE_BS_DATA: 2102 case EFI_MD_TYPE_FREE: 2103 /* 2104 * We're allowed to use any entry with these types. 2105 */ 2106 break; 2107 default: 2108 continue; 2109 } 2110 2111 Realmem += p->md_pages * PAGE_SIZE; 2112 2113 /* 2114 * NOTE: This little bit of code initially expands 2115 * physmap[1] as well as later entries. 
2116 */ 2117 if (p->md_phys == physmap[*physmap_idx + 1]) { 2118 physmap[*physmap_idx + 1] += p->md_pages * PAGE_SIZE; 2119 continue; 2120 } 2121 2122 *physmap_idx += 2; 2123 if (*physmap_idx == PHYSMAP_SIZE) { 2124 kprintf("Too many segments in the physical " 2125 "address map, giving up\n"); 2126 break; 2127 } 2128 physmap[*physmap_idx] = p->md_phys; 2129 physmap[*physmap_idx + 1] = p->md_phys + p->md_pages * PAGE_SIZE; 2130 } 2131 } 2132 2133 struct fb_info efi_fb_info; 2134 static int have_efi_framebuffer = 0; 2135 2136 static void 2137 efi_fb_init_vaddr(int direct_map) 2138 { 2139 uint64_t sz; 2140 vm_offset_t addr, v; 2141 2142 v = efi_fb_info.vaddr; 2143 sz = efi_fb_info.stride * efi_fb_info.height; 2144 2145 if (direct_map) { 2146 addr = PHYS_TO_DMAP(efi_fb_info.paddr); 2147 if (addr >= DMAP_MIN_ADDRESS && addr + sz <= DMapMaxAddress) 2148 efi_fb_info.vaddr = addr; 2149 } else { 2150 efi_fb_info.vaddr = 2151 (vm_offset_t)pmap_mapdev_attr(efi_fb_info.paddr, 2152 sz, 2153 PAT_WRITE_COMBINING); 2154 } 2155 } 2156 2157 static u_int 2158 efifb_color_depth(struct efi_fb *efifb) 2159 { 2160 uint32_t mask; 2161 u_int depth; 2162 2163 mask = efifb->fb_mask_red | efifb->fb_mask_green | 2164 efifb->fb_mask_blue | efifb->fb_mask_reserved; 2165 if (mask == 0) 2166 return (0); 2167 for (depth = 1; mask != 1; depth++) 2168 mask >>= 1; 2169 return (depth); 2170 } 2171 2172 int 2173 probe_efi_fb(int early) 2174 { 2175 struct efi_fb *efifb; 2176 caddr_t kmdp; 2177 u_int depth; 2178 2179 if (have_efi_framebuffer) { 2180 if (!early && 2181 (efi_fb_info.vaddr == 0 || 2182 efi_fb_info.vaddr == PHYS_TO_DMAP(efi_fb_info.paddr))) 2183 efi_fb_init_vaddr(0); 2184 return 0; 2185 } 2186 2187 kmdp = preload_search_by_type("elf kernel"); 2188 if (kmdp == NULL) 2189 kmdp = preload_search_by_type("elf64 kernel"); 2190 efifb = (struct efi_fb *)preload_search_info(kmdp, 2191 MODINFO_METADATA | MODINFOMD_EFI_FB); 2192 if (efifb == NULL) 2193 return 1; 2194 2195 depth = efifb_color_depth(efifb); 2196 /* 2197 * Our bootloader should already notice, when we won't be able to 2198 * use the UEFI framebuffer. 2199 */ 2200 if (depth != 24 && depth != 32) 2201 return 1; 2202 2203 have_efi_framebuffer = 1; 2204 2205 efi_fb_info.is_vga_boot_display = 1; 2206 efi_fb_info.width = efifb->fb_width; 2207 efi_fb_info.height = efifb->fb_height; 2208 efi_fb_info.depth = depth; 2209 efi_fb_info.stride = efifb->fb_stride * (depth / 8); 2210 efi_fb_info.paddr = efifb->fb_addr; 2211 if (early) { 2212 efi_fb_info.vaddr = 0; 2213 } else { 2214 efi_fb_init_vaddr(0); 2215 } 2216 efi_fb_info.fbops.fb_set_par = NULL; 2217 efi_fb_info.fbops.fb_blank = NULL; 2218 efi_fb_info.fbops.fb_debug_enter = NULL; 2219 efi_fb_info.device = NULL; 2220 2221 return 0; 2222 } 2223 2224 static void 2225 efifb_startup(void *arg) 2226 { 2227 probe_efi_fb(0); 2228 } 2229 2230 SYSINIT(efi_fb_info, SI_BOOT1_POST, SI_ORDER_FIRST, efifb_startup, NULL); 2231 2232 static void 2233 getmemsize(caddr_t kmdp, u_int64_t first) 2234 { 2235 int off, physmap_idx, pa_indx, da_indx; 2236 int i, j; 2237 vm_paddr_t pa; 2238 vm_paddr_t msgbuf_size; 2239 u_long physmem_tunable; 2240 pt_entry_t *pte; 2241 quad_t dcons_addr, dcons_size; 2242 2243 bzero(physmap, sizeof(physmap)); 2244 physmap_idx = 0; 2245 2246 /* 2247 * get memory map from INT 15:E820, kindly supplied by the loader. 2248 * 2249 * subr_module.c says: 2250 * "Consumer may safely assume that size value precedes data." 2251 * ie: an int32_t immediately precedes smap. 
2252 */ 2253 efihdrbase = (struct efi_map_header *)preload_search_info(kmdp, 2254 MODINFO_METADATA | MODINFOMD_EFI_MAP); 2255 smapbase = (struct bios_smap *)preload_search_info(kmdp, 2256 MODINFO_METADATA | MODINFOMD_SMAP); 2257 if (smapbase == NULL && efihdrbase == NULL) 2258 panic("No BIOS smap or EFI map info from loader!"); 2259 2260 if (efihdrbase == NULL) 2261 add_smap_entries(&physmap_idx); 2262 else 2263 add_efi_map_entries(&physmap_idx); 2264 2265 base_memory = physmap[1] / 1024; 2266 /* make hole for AP bootstrap code */ 2267 physmap[1] = mp_bootaddress(base_memory); 2268 2269 /* Save EBDA address, if any */ 2270 ebda_addr = (u_long)(*(u_short *)(KERNBASE + 0x40e)); 2271 ebda_addr <<= 4; 2272 2273 /* 2274 * Maxmem isn't the "maximum memory", it's one larger than the 2275 * highest page of the physical address space. It should be 2276 * called something like "Maxphyspage". We may adjust this 2277 * based on ``hw.physmem'' and the results of the memory test. 2278 */ 2279 Maxmem = atop(physmap[physmap_idx + 1]); 2280 2281 #ifdef MAXMEM 2282 Maxmem = MAXMEM / 4; 2283 #endif 2284 2285 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) 2286 Maxmem = atop(physmem_tunable); 2287 2288 /* 2289 * Don't allow MAXMEM or hw.physmem to extend the amount of memory 2290 * in the system. 2291 */ 2292 if (Maxmem > atop(physmap[physmap_idx + 1])) 2293 Maxmem = atop(physmap[physmap_idx + 1]); 2294 2295 /* 2296 * Blowing out the DMAP will blow up the system. 2297 */ 2298 if (Maxmem > atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS)) { 2299 kprintf("Limiting Maxmem due to DMAP size\n"); 2300 Maxmem = atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS); 2301 } 2302 2303 if (atop(physmap[physmap_idx + 1]) != Maxmem && 2304 (boothowto & RB_VERBOSE)) { 2305 kprintf("Physical memory use set to %ldK\n", Maxmem * 4); 2306 } 2307 2308 /* 2309 * Call pmap initialization to make new kernel address space 2310 * 2311 * Mask off page 0. 2312 */ 2313 pmap_bootstrap(&first); 2314 physmap[0] = PAGE_SIZE; 2315 2316 /* 2317 * Align the physmap to PHYSMAP_ALIGN and cut out anything 2318 * exceeding Maxmem. 2319 */ 2320 for (i = j = 0; i <= physmap_idx; i += 2) { 2321 if (physmap[i+1] > ptoa(Maxmem)) 2322 physmap[i+1] = ptoa(Maxmem); 2323 physmap[i] = (physmap[i] + PHYSMAP_ALIGN_MASK) & 2324 ~PHYSMAP_ALIGN_MASK; 2325 physmap[i+1] = physmap[i+1] & ~PHYSMAP_ALIGN_MASK; 2326 2327 physmap[j] = physmap[i]; 2328 physmap[j+1] = physmap[i+1]; 2329 2330 if (physmap[i] < physmap[i+1]) 2331 j += 2; 2332 } 2333 physmap_idx = j - 2; 2334 2335 /* 2336 * Align anything else used in the validation loop. 2337 * 2338 * Also make sure that our 2MB kernel text+data+bss mappings 2339 * do not overlap potentially allocatable space. 2340 */ 2341 first = (first + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK; 2342 2343 /* 2344 * Size up each available chunk of physical memory. 2345 */ 2346 pa_indx = 0; 2347 da_indx = 0; 2348 phys_avail[pa_indx].phys_beg = physmap[0]; 2349 phys_avail[pa_indx].phys_end = physmap[0]; 2350 dump_avail[da_indx].phys_beg = 0; 2351 dump_avail[da_indx].phys_end = physmap[0]; 2352 pte = CMAP1; 2353 2354 /* 2355 * Get dcons buffer address 2356 */ 2357 if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 || 2358 kgetenv_quad("dcons.size", &dcons_size) == 0) 2359 dcons_addr = 0; 2360 2361 /* 2362 * Validate the physical memory. The physical memory segments 2363 * have already been aligned to PHYSMAP_ALIGN which is a multiple 2364 * of PAGE_SIZE. 2365 * 2366 * We no longer perform an exhaustive memory test. 
Instead we 2367 * simply test the first and last word in each physmap[] 2368 * segment. 2369 */ 2370 for (i = 0; i <= physmap_idx; i += 2) { 2371 vm_paddr_t end; 2372 vm_paddr_t incr; 2373 2374 end = physmap[i + 1]; 2375 2376 for (pa = physmap[i]; pa < end; pa += incr) { 2377 int page_bad, full; 2378 volatile uint64_t *ptr = (uint64_t *)CADDR1; 2379 uint64_t tmp; 2380 2381 full = FALSE; 2382 2383 /* 2384 * Calculate incr. Just test the first and 2385 * last page in each physmap[] segment. 2386 */ 2387 if (pa == end - PAGE_SIZE) 2388 incr = PAGE_SIZE; 2389 else 2390 incr = end - pa - PAGE_SIZE; 2391 2392 /* 2393 * Make sure we don't skip blacked out areas. 2394 */ 2395 if (pa < 0x200000 && 0x200000 < end) { 2396 incr = 0x200000 - pa; 2397 } 2398 if (dcons_addr > 0 && 2399 pa < dcons_addr && 2400 dcons_addr < end) { 2401 incr = dcons_addr - pa; 2402 } 2403 2404 /* 2405 * Block out kernel memory as not available. 2406 */ 2407 if (pa >= 0x200000 && pa < first) { 2408 incr = first - pa; 2409 if (pa + incr > end) 2410 incr = end - pa; 2411 goto do_dump_avail; 2412 } 2413 2414 /* 2415 * Block out the dcons buffer if it exists. 2416 */ 2417 if (dcons_addr > 0 && 2418 pa >= trunc_page(dcons_addr) && 2419 pa < dcons_addr + dcons_size) { 2420 incr = dcons_addr + dcons_size - pa; 2421 incr = (incr + PAGE_MASK) & 2422 ~(vm_paddr_t)PAGE_MASK; 2423 if (pa + incr > end) 2424 incr = end - pa; 2425 goto do_dump_avail; 2426 } 2427 2428 page_bad = FALSE; 2429 2430 /* 2431 * Map the page non-cacheable for the memory 2432 * test. 2433 */ 2434 *pte = pa | 2435 kernel_pmap->pmap_bits[PG_V_IDX] | 2436 kernel_pmap->pmap_bits[PG_RW_IDX] | 2437 kernel_pmap->pmap_bits[PG_N_IDX]; 2438 cpu_invlpg(__DEVOLATILE(void *, ptr)); 2439 cpu_mfence(); 2440 2441 /* 2442 * Save original value for restoration later. 2443 */ 2444 tmp = *ptr; 2445 2446 /* 2447 * Test for alternating 1's and 0's 2448 */ 2449 *ptr = 0xaaaaaaaaaaaaaaaaLLU; 2450 cpu_mfence(); 2451 if (*ptr != 0xaaaaaaaaaaaaaaaaLLU) 2452 page_bad = TRUE; 2453 /* 2454 * Test for alternating 0's and 1's 2455 */ 2456 *ptr = 0x5555555555555555LLU; 2457 cpu_mfence(); 2458 if (*ptr != 0x5555555555555555LLU) 2459 page_bad = TRUE; 2460 /* 2461 * Test for all 1's 2462 */ 2463 *ptr = 0xffffffffffffffffLLU; 2464 cpu_mfence(); 2465 if (*ptr != 0xffffffffffffffffLLU) 2466 page_bad = TRUE; 2467 /* 2468 * Test for all 0's 2469 */ 2470 *ptr = 0x0; 2471 cpu_mfence(); 2472 if (*ptr != 0x0) 2473 page_bad = TRUE; 2474 2475 /* 2476 * Restore original value. 2477 */ 2478 *ptr = tmp; 2479 2480 /* 2481 * Adjust array of valid/good pages. 2482 */ 2483 if (page_bad == TRUE) { 2484 incr = PAGE_SIZE; 2485 continue; 2486 } 2487 2488 /* 2489 * Collapse page address into phys_avail[]. Do a 2490 * continuation of the current phys_avail[] index 2491 * when possible. 2492 */ 2493 if (phys_avail[pa_indx].phys_end == pa) { 2494 /* 2495 * Continuation 2496 */ 2497 phys_avail[pa_indx].phys_end += incr; 2498 } else if (phys_avail[pa_indx].phys_beg == 2499 phys_avail[pa_indx].phys_end) { 2500 /* 2501 * Current phys_avail is completely empty, 2502 * reuse the index. 2503 */ 2504 phys_avail[pa_indx].phys_beg = pa; 2505 phys_avail[pa_indx].phys_end = pa + incr; 2506 } else { 2507 /* 2508 * Allocate next phys_avail index. 
2509 */ 2510 ++pa_indx; 2511 if (pa_indx == PHYS_AVAIL_ARRAY_END) { 2512 kprintf( 2513 "Too many holes in the physical address space, giving up\n"); 2514 --pa_indx; 2515 full = TRUE; 2516 goto do_dump_avail; 2517 } 2518 phys_avail[pa_indx].phys_beg = pa; 2519 phys_avail[pa_indx].phys_end = pa + incr; 2520 } 2521 physmem += incr / PAGE_SIZE; 2522 2523 /* 2524 * pa available for dumping 2525 */ 2526 do_dump_avail: 2527 if (dump_avail[da_indx].phys_end == pa) { 2528 dump_avail[da_indx].phys_end += incr; 2529 } else { 2530 ++da_indx; 2531 if (da_indx == DUMP_AVAIL_ARRAY_END) { 2532 --da_indx; 2533 goto do_next; 2534 } 2535 dump_avail[da_indx].phys_beg = pa; 2536 dump_avail[da_indx].phys_end = pa + incr; 2537 } 2538 do_next: 2539 if (full) 2540 break; 2541 } 2542 } 2543 *pte = 0; 2544 cpu_invltlb(); 2545 cpu_mfence(); 2546 2547 /* 2548 * The last chunk must contain at least one page plus the message 2549 * buffer to avoid complicating other code (message buffer address 2550 * calculation, etc.). 2551 */ 2552 msgbuf_size = (MSGBUF_SIZE + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK; 2553 2554 while (phys_avail[pa_indx].phys_beg + PHYSMAP_ALIGN + msgbuf_size >= 2555 phys_avail[pa_indx].phys_end) { 2556 physmem -= atop(phys_avail[pa_indx].phys_end - 2557 phys_avail[pa_indx].phys_beg); 2558 phys_avail[pa_indx].phys_beg = 0; 2559 phys_avail[pa_indx].phys_end = 0; 2560 --pa_indx; 2561 } 2562 2563 Maxmem = atop(phys_avail[pa_indx].phys_end); 2564 2565 /* Trim off space for the message buffer. */ 2566 phys_avail[pa_indx].phys_end -= msgbuf_size; 2567 2568 avail_end = phys_avail[pa_indx].phys_end; 2569 2570 /* Map the message buffer. */ 2571 for (off = 0; off < msgbuf_size; off += PAGE_SIZE) { 2572 pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off); 2573 } 2574 2575 /* 2576 * Try to get EFI framebuffer working as early as possible. 2577 * 2578 * WARN: Some BIOSes do not list the EFI framebuffer memory, causing 2579 * the pmap probe code to create a DMAP that does not cover its 2580 * physical address space, efi_fb_init_vaddr(1) might not return 2581 * an initialized framebuffer base pointer. In this situation the 2582 * later efi_fb_init_vaddr(0) call will deal with it. 2583 */ 2584 if (have_efi_framebuffer) 2585 efi_fb_init_vaddr(1); 2586 } 2587 2588 struct machintr_abi MachIntrABI; 2589 2590 /* 2591 * IDT VECTORS: 2592 * 0 Divide by zero 2593 * 1 Debug 2594 * 2 NMI 2595 * 3 BreakPoint 2596 * 4 OverFlow 2597 * 5 Bound-Range 2598 * 6 Invalid OpCode 2599 * 7 Device Not Available (x87) 2600 * 8 Double-Fault 2601 * 9 Coprocessor Segment overrun (unsupported, reserved) 2602 * 10 Invalid-TSS 2603 * 11 Segment not present 2604 * 12 Stack 2605 * 13 General Protection 2606 * 14 Page Fault 2607 * 15 Reserved 2608 * 16 x87 FP Exception pending 2609 * 17 Alignment Check 2610 * 18 Machine Check 2611 * 19 SIMD floating point 2612 * 20-31 reserved 2613 * 32-255 INTn/external sources 2614 */ 2615 u_int64_t 2616 hammer_time(u_int64_t modulep, u_int64_t physfree) 2617 { 2618 caddr_t kmdp; 2619 int gsel_tss, x, cpu; 2620 #if 0 /* JG */ 2621 int metadata_missing, off; 2622 #endif 2623 struct mdglobaldata *gd; 2624 struct privatespace *ps; 2625 u_int64_t msr; 2626 2627 /* 2628 * Prevent lowering of the ipl if we call tsleep() early. 
2629 */ 2630 gd = &CPU_prvspace[0]->mdglobaldata; 2631 ps = (struct privatespace *)gd; 2632 bzero(gd, sizeof(*gd)); 2633 bzero(&ps->common_tss, sizeof(ps->common_tss)); 2634 2635 /* 2636 * Note: on both UP and SMP curthread must be set non-NULL 2637 * early in the boot sequence because the system assumes 2638 * that 'curthread' is never NULL. 2639 */ 2640 2641 gd->mi.gd_curthread = &thread0; 2642 thread0.td_gd = &gd->mi; 2643 2644 atdevbase = ISA_HOLE_START + PTOV_OFFSET; 2645 2646 #if 0 /* JG */ 2647 metadata_missing = 0; 2648 if (bootinfo.bi_modulep) { 2649 preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; 2650 preload_bootstrap_relocate(KERNBASE); 2651 } else { 2652 metadata_missing = 1; 2653 } 2654 if (bootinfo.bi_envp) 2655 kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; 2656 #endif 2657 2658 preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET); 2659 preload_bootstrap_relocate(PTOV_OFFSET); 2660 kmdp = preload_search_by_type("elf kernel"); 2661 if (kmdp == NULL) 2662 kmdp = preload_search_by_type("elf64 kernel"); 2663 boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); 2664 kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET; 2665 #ifdef DDB 2666 ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); 2667 ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); 2668 #endif 2669 efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t); 2670 2671 if (boothowto & RB_VERBOSE) 2672 bootverbose++; 2673 2674 /* 2675 * Default MachIntrABI to ICU 2676 */ 2677 MachIntrABI = MachIntrABI_ICU; 2678 2679 /* 2680 * start with one cpu. Note: with one cpu, ncpus_fit_mask remain 0. 2681 */ 2682 ncpus = 1; 2683 ncpus_fit = 1; 2684 /* Init basic tunables, hz etc */ 2685 init_param1(); 2686 2687 /* 2688 * make gdt memory segments 2689 */ 2690 gdt_segs[GPROC0_SEL].ssd_base = 2691 (uintptr_t) &CPU_prvspace[0]->common_tss; 2692 2693 gd->mi.gd_prvspace = CPU_prvspace[0]; 2694 2695 for (x = 0; x < NGDT; x++) { 2696 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1)) 2697 ssdtosd(&gdt_segs[x], &gdt_cpu0[x]); 2698 } 2699 ssdtosyssd(&gdt_segs[GPROC0_SEL], 2700 (struct system_segment_descriptor *)&gdt_cpu0[GPROC0_SEL]); 2701 2702 /* 2703 * WARNING! Due to an Intel quirk, VMX exits set the gdt[] table 2704 * limit to 0xFFFF. To avoid having to do a heavy-weight 2705 * reload, we just make ours maximally sized. 
2706 */ 2707 r_gdt.rd_limit = MAXGDT_LIMIT - 1; 2708 r_gdt.rd_base = (long)gdt_cpu0; 2709 lgdt(&r_gdt); 2710 2711 wrmsr(MSR_FSBASE, 0); /* User value */ 2712 wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi); 2713 wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */ 2714 2715 mi_gdinit(&gd->mi, 0); 2716 cpu_gdinit(gd, 0); 2717 proc0paddr = proc0paddr_buff; 2718 mi_proc0init(&gd->mi, proc0paddr); 2719 safepri = TDPRI_MAX; 2720 2721 /* spinlocks and the BGL */ 2722 init_locks(); 2723 2724 /* exceptions */ 2725 for (x = 0; x < NIDT; x++) 2726 setidt_global(x, rsvdary[x], SDT_SYSIGT, SEL_KPL, 0); 2727 setidt_global(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0); 2728 setidt_global(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 2); 2729 setidt_global(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1); 2730 setidt_global(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0); 2731 setidt_global(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0); 2732 setidt_global(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0); 2733 setidt_global(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0); 2734 setidt_global(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0); 2735 setidt_global(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1); 2736 setidt_global(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0); 2737 setidt_global(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0); 2738 setidt_global(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0); 2739 setidt_global(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0); 2740 setidt_global(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0); 2741 setidt_global(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0); 2742 setidt_global(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0); 2743 setidt_global(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0); 2744 setidt_global(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0); 2745 setidt_global(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0); 2746 2747 for (cpu = 0; cpu < MAXCPU; ++cpu) { 2748 r_idt_arr[cpu].rd_limit = sizeof(idt_arr[cpu]) - 1; 2749 r_idt_arr[cpu].rd_base = (long) &idt_arr[cpu][0]; 2750 } 2751 2752 lidt(&r_idt_arr[0]); 2753 2754 /* 2755 * Initialize the console before we print anything out. 2756 */ 2757 cninit(); 2758 2759 #if 0 /* JG */ 2760 if (metadata_missing) 2761 kprintf("WARNING: loader(8) metadata is missing!\n"); 2762 #endif 2763 2764 #if NISA >0 2765 elcr_probe(); 2766 isa_defaultirq(); 2767 #endif 2768 rand_initialize(); 2769 2770 /* 2771 * Initialize IRQ mapping 2772 * 2773 * NOTE: 2774 * SHOULD be after elcr_probe() 2775 */ 2776 MachIntrABI_ICU.initmap(); 2777 MachIntrABI_IOAPIC.initmap(); 2778 2779 #ifdef DDB 2780 kdb_init(); 2781 if (boothowto & RB_KDB) 2782 Debugger("Boot flags requested debugger"); 2783 #endif 2784 2785 identify_cpu(); /* Final stage of CPU initialization */ 2786 initializecpu(0); /* Initialize CPU registers */ 2787 2788 /* 2789 * On modern Intel cpus, haswell or later, cpu_idle_hlt=1 is better 2790 * because the cpu does significant power management in MWAIT 2791 * (also suggested is to set sysctl machdep.mwait.CX.idle=AUTODEEP). 2792 * 2793 * On many AMD cpus cpu_idle_hlt=3 is better, because the cpu does 2794 * significant power management only when using ACPI halt mode. 2795 * (However, on Ryzen, mode 4 (HLT) also does power management). 2796 * 2797 * On older AMD or Intel cpus, cpu_idle_hlt=2 is better because ACPI 2798 * is needed to reduce power consumption, but wakeup times are often 2799 * too long. 
2800 */ 2801 if (cpu_vendor_id == CPU_VENDOR_INTEL && 2802 CPUID_TO_MODEL(cpu_id) >= 0x3C) { /* Haswell or later */ 2803 cpu_idle_hlt = 1; 2804 } 2805 if (cpu_vendor_id == CPU_VENDOR_AMD) { 2806 if (CPUID_TO_FAMILY(cpu_id) >= 0x17) { 2807 /* Ryzen or later */ 2808 cpu_idle_hlt = 3; 2809 } else if (CPUID_TO_FAMILY(cpu_id) >= 0x14) { 2810 /* Bobcat or later */ 2811 cpu_idle_hlt = 3; 2812 } 2813 } 2814 2815 TUNABLE_INT_FETCH("hw.apic_io_enable", &ioapic_enable); /* for compat */ 2816 TUNABLE_INT_FETCH("hw.ioapic_enable", &ioapic_enable); 2817 TUNABLE_INT_FETCH("hw.lapic_enable", &lapic_enable); 2818 TUNABLE_INT_FETCH("machdep.cpu_idle_hlt", &cpu_idle_hlt); 2819 2820 /* 2821 * By default always enable the ioapic. Certain virtual machines 2822 * may not work with the I/O apic enabled and can be specified in 2823 * the case statement below. On the other hand, if the ioapic is 2824 * disabled for virtual machines which DO work with the I/O apic, 2825 * the virtual machine can implode if we disable the I/O apic. 2826 * 2827 * For now enable the ioapic for all guests. 2828 * 2829 * NOTE: This must be done after identify_cpu(), which sets 2830 * 'cpu_feature2'. 2831 */ 2832 if (ioapic_enable < 0) { 2833 ioapic_enable = 1; 2834 switch(vmm_guest) { 2835 case VMM_GUEST_NONE: /* should be enabled on real HW */ 2836 case VMM_GUEST_KVM: /* must be enabled or VM implodes */ 2837 ioapic_enable = 1; 2838 break; 2839 default: /* enable by default for other VMs */ 2840 ioapic_enable = 1; 2841 break; 2842 } 2843 } 2844 2845 /* 2846 * TSS entry point for interrupts, traps, and exceptions 2847 * (sans NMI). This will always go to near the top of the pcpu 2848 * trampoline area. Hardware-pushed data will be copied into 2849 * the trap-frame on entry, and (if necessary) returned to the 2850 * trampoline on exit. 2851 * 2852 * We store some pcb data for the trampoline code above the 2853 * stack the cpu hw pushes into, and arrange things so the 2854 * address of tr_pcb_rsp is the same as the desired top of 2855 * stack. 
2856 */
2857 ps->common_tss.tss_rsp0 = (register_t)&ps->trampoline.tr_pcb_rsp;
2858 ps->trampoline.tr_pcb_rsp = ps->common_tss.tss_rsp0;
2859 ps->trampoline.tr_pcb_gs_kernel = (register_t)gd;
2860 ps->trampoline.tr_pcb_cr3 = KPML4phys; /* adj to user cr3 live */
2861 ps->dbltramp.tr_pcb_gs_kernel = (register_t)gd;
2862 ps->dbltramp.tr_pcb_cr3 = KPML4phys;
2863 ps->dbgtramp.tr_pcb_gs_kernel = (register_t)gd;
2864 ps->dbgtramp.tr_pcb_cr3 = KPML4phys;
2865
2866 /* double fault stack */
2867 ps->common_tss.tss_ist1 = (register_t)&ps->dbltramp.tr_pcb_rsp;
2868 /* #DB debugger needs its own stack */
2869 ps->common_tss.tss_ist2 = (register_t)&ps->dbgtramp.tr_pcb_rsp;
2870
2871 /* Set the IO permission bitmap (empty due to tss seg limit) */
2872 ps->common_tss.tss_iobase = sizeof(struct x86_64tss);
2873
2874 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
2875 gd->gd_gdt = &gdt_cpu0[0];
2876 gd->gd_tss_gdt = &gd->gd_gdt[GPROC0_SEL];
2877 gd->gd_common_tssd = *gd->gd_tss_gdt;
2878 ltr(gsel_tss);
2879
2880 /* Set up the fast syscall stuff */
2881 msr = rdmsr(MSR_EFER) | EFER_SCE;
2882 wrmsr(MSR_EFER, msr);
2883 wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
2884 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
2885 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
2886 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
2887 wrmsr(MSR_STAR, msr);
2888 wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL|PSL_AC);
2889
2890 getmemsize(kmdp, physfree);
2891 init_param2(physmem);
2892
2893 /* now running on new page tables, configured, and u/iom is accessible */
2894
2895 /* Map the message buffer. */
2896 #if 0 /* JG */
2897 for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
2898 pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
2899 #endif
2900
2901 msgbufinit(msgbufp, MSGBUF_SIZE);
2902
2903
2904 /* transfer to user mode */
2905
2906 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
2907 _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
2908 _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
2909
2910 load_ds(_udatasel);
2911 load_es(_udatasel);
2912 load_fs(_udatasel);
2913
2914 /* setup proc 0's pcb */
2915 thread0.td_pcb->pcb_flags = 0;
2916 thread0.td_pcb->pcb_cr3 = KPML4phys;
2917 thread0.td_pcb->pcb_cr3_iso = 0;
2918 thread0.td_pcb->pcb_ext = NULL;
2919 lwp0.lwp_md.md_regs = &proc0_tf; /* XXX needed? */
2920
2921 /* Location of kernel stack for locore */
2922 return ((u_int64_t)thread0.td_pcb);
2923 }
2924
2925 /*
2926 * Initialize machine-dependent portions of the global data structure.
2927 * Note that the global data area and cpu0's idlestack in the private
2928 * data space were allocated in locore.
2929 *
2930 * Note: the idlethread's cpl is 0
2931 *
2932 * WARNING! Called from early boot, 'mycpu' may not work yet.
2933 */
2934 void
2935 cpu_gdinit(struct mdglobaldata *gd, int cpu)
2936 {
2937 if (cpu)
2938 gd->mi.gd_curthread = &gd->mi.gd_idlethread;
2939
2940 lwkt_init_thread(&gd->mi.gd_idlethread,
2941 gd->mi.gd_prvspace->idlestack,
2942 sizeof(gd->mi.gd_prvspace->idlestack),
2943 0, &gd->mi);
2944 lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
2945 gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
2946 gd->mi.gd_idlethread.td_sp -= sizeof(void *);
2947 *(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
2948 }
2949
2950 /*
2951 * We only have to check for DMAP bounds, the globaldata space is
2952 * actually part of the kernel_map so we don't have to waste time
2953 * checking CPU_prvspace[*].
2954 */
2955 int
2956 is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
2957 {
2958 #if 0
2959 if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
2960 eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
2961 return (TRUE);
2962 }
2963 #endif
2964 if (saddr >= DMAP_MIN_ADDRESS && eaddr <= DMAP_MAX_ADDRESS)
2965 return (TRUE);
2966 return (FALSE);
2967 }
2968
2969 struct globaldata *
2970 globaldata_find(int cpu)
2971 {
2972 KKASSERT(cpu >= 0 && cpu < ncpus);
2973 return(&CPU_prvspace[cpu]->mdglobaldata.mi);
2974 }
2975
2976 /*
2977 * This path should be safe from the SYSRET issue because only stopped threads
2978 * can have their %rip adjusted this way (and all heavy weight thread switches
2979 * clear QUICKREF and thus do not use SYSRET). However, the code path is
2980 * convoluted, so add a safety check by forcing %rip to be canonical.
2981 */
2982 int
2983 ptrace_set_pc(struct lwp *lp, unsigned long addr)
2984 {
2985 if (addr & 0x0000800000000000LLU)
2986 lp->lwp_md.md_regs->tf_rip = addr | 0xFFFF000000000000LLU;
2987 else
2988 lp->lwp_md.md_regs->tf_rip = addr & 0x0000FFFFFFFFFFFFLLU;
2989 return (0);
2990 }
2991
2992 int
2993 ptrace_single_step(struct lwp *lp)
2994 {
2995 lp->lwp_md.md_regs->tf_rflags |= PSL_T;
2996 return (0);
2997 }
2998
2999 int
3000 fill_regs(struct lwp *lp, struct reg *regs)
3001 {
3002 struct trapframe *tp;
3003
3004 if ((tp = lp->lwp_md.md_regs) == NULL)
3005 return EINVAL;
3006 bcopy(&tp->tf_rdi, &regs->r_rdi, sizeof(*regs));
3007 return (0);
3008 }
3009
3010 int
3011 set_regs(struct lwp *lp, struct reg *regs)
3012 {
3013 struct trapframe *tp;
3014
3015 tp = lp->lwp_md.md_regs;
3016 if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
3017 !CS_SECURE(regs->r_cs))
3018 return (EINVAL);
3019 bcopy(&regs->r_rdi, &tp->tf_rdi, sizeof(*regs));
3020 clear_quickret();
3021 return (0);
3022 }
3023
3024 static void
3025 fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
3026 {
3027 struct env87 *penv_87 = &sv_87->sv_env;
3028 struct envxmm *penv_xmm = &sv_xmm->sv_env;
3029 int i;
3030
3031 /* FPU control/status */
3032 penv_87->en_cw = penv_xmm->en_cw;
3033 penv_87->en_sw = penv_xmm->en_sw;
3034 penv_87->en_tw = penv_xmm->en_tw;
3035 penv_87->en_fip = penv_xmm->en_fip;
3036 penv_87->en_fcs = penv_xmm->en_fcs;
3037 penv_87->en_opcode = penv_xmm->en_opcode;
3038 penv_87->en_foo = penv_xmm->en_foo;
3039 penv_87->en_fos = penv_xmm->en_fos;
3040
3041 /* FPU registers */
3042 for (i = 0; i < 8; ++i)
3043 sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
3044 }
3045
3046 static void
3047 set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
3048 {
3049 struct env87 *penv_87 = &sv_87->sv_env;
3050 struct envxmm *penv_xmm = &sv_xmm->sv_env;
3051 int i;
3052
3053 /* FPU control/status */
3054 penv_xmm->en_cw = penv_87->en_cw;
3055 penv_xmm->en_sw = penv_87->en_sw;
3056 penv_xmm->en_tw = penv_87->en_tw;
3057 penv_xmm->en_fip = penv_87->en_fip;
3058 penv_xmm->en_fcs = penv_87->en_fcs;
3059 penv_xmm->en_opcode = penv_87->en_opcode;
3060 penv_xmm->en_foo = penv_87->en_foo;
3061 penv_xmm->en_fos = penv_87->en_fos;
3062
3063 /* FPU registers */
3064 for (i = 0; i < 8; ++i)
3065 sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
3066 }
3067
3068 int
3069 fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
3070 {
3071 if (lp->lwp_thread == NULL || lp->lwp_thread->td_pcb == NULL)
3072 return EINVAL;
3073 if (cpu_fxsr) {
3074 fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
3075 (struct save87 *)fpregs);
3076 return (0);
3077 }
3078 bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof
*fpregs);
3079 return (0);
3080 }
3081
3082 int
3083 set_fpregs(struct lwp *lp, struct fpreg *fpregs)
3084 {
3085 if (cpu_fxsr) {
3086 set_fpregs_xmm((struct save87 *)fpregs,
3087 &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
3088 return (0);
3089 }
3090 bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
3091 return (0);
3092 }
3093
3094 int
3095 fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
3096 {
3097 struct pcb *pcb;
3098
3099 if (lp == NULL) {
3100 dbregs->dr[0] = rdr0();
3101 dbregs->dr[1] = rdr1();
3102 dbregs->dr[2] = rdr2();
3103 dbregs->dr[3] = rdr3();
3104 dbregs->dr[4] = rdr4();
3105 dbregs->dr[5] = rdr5();
3106 dbregs->dr[6] = rdr6();
3107 dbregs->dr[7] = rdr7();
3108 return (0);
3109 }
3110 if (lp->lwp_thread == NULL || (pcb = lp->lwp_thread->td_pcb) == NULL)
3111 return EINVAL;
3112 dbregs->dr[0] = pcb->pcb_dr0;
3113 dbregs->dr[1] = pcb->pcb_dr1;
3114 dbregs->dr[2] = pcb->pcb_dr2;
3115 dbregs->dr[3] = pcb->pcb_dr3;
3116 dbregs->dr[4] = 0;
3117 dbregs->dr[5] = 0;
3118 dbregs->dr[6] = pcb->pcb_dr6;
3119 dbregs->dr[7] = pcb->pcb_dr7;
3120 return (0);
3121 }
3122
3123 int
3124 set_dbregs(struct lwp *lp, struct dbreg *dbregs)
3125 {
3126 if (lp == NULL) {
3127 load_dr0(dbregs->dr[0]);
3128 load_dr1(dbregs->dr[1]);
3129 load_dr2(dbregs->dr[2]);
3130 load_dr3(dbregs->dr[3]);
3131 load_dr4(dbregs->dr[4]);
3132 load_dr5(dbregs->dr[5]);
3133 load_dr6(dbregs->dr[6]);
3134 load_dr7(dbregs->dr[7]);
3135 } else {
3136 struct pcb *pcb;
3137 struct ucred *ucred;
3138 int i;
3139 uint64_t mask1, mask2;
3140
3141 /*
3142 * Don't let an illegal value for dr7 get set. Specifically,
3143 * check for undefined settings. Setting these bit patterns
3144 * results in undefined behaviour and can lead to an unexpected
3145 * TRCTRAP.
3146 */
3147 /* JG this loop looks unreadable */
3148 /* Check 4 2-bit fields for invalid patterns.
3149 * These fields are R/Wi, for i = 0..3
3150 */
3151 /* Is 10 in LENi allowed when running in compatibility mode? */
3152 /* Pattern 10 in R/Wi might be used to indicate
3153 * breakpoint on I/O. Further analysis should be
3154 * carried out to decide if it is safe and useful to
3155 * provide access to that capability
3156 */
3157 for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4;
3158 i++, mask1 <<= 4, mask2 <<= 4)
3159 if ((dbregs->dr[7] & mask1) == mask2)
3160 return (EINVAL);
3161
3162 pcb = lp->lwp_thread->td_pcb;
3163 ucred = lp->lwp_proc->p_ucred;
3164
3165 /*
3166 * Don't let a process set a breakpoint that is not within the
3167 * process's address space. If a process could do this, it
3168 * could halt the system by setting a breakpoint in the kernel
3169 * (if ddb was enabled). Thus, we need to check to make sure
3170 * that no breakpoints are being enabled for addresses outside
3171 * the process's address space, unless, perhaps, we were called
3172 * by uid 0.
3173 *
3174 * XXX - what about when the watched area of the user's
3175 * address space is written into from within the kernel
3176 * ... wouldn't that still cause a breakpoint to be generated
3177 * from within kernel mode?
3178 */ 3179 3180 if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) { 3181 if (dbregs->dr[7] & 0x3) { 3182 /* dr0 is enabled */ 3183 if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS) 3184 return (EINVAL); 3185 } 3186 3187 if (dbregs->dr[7] & (0x3<<2)) { 3188 /* dr1 is enabled */ 3189 if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS) 3190 return (EINVAL); 3191 } 3192 3193 if (dbregs->dr[7] & (0x3<<4)) { 3194 /* dr2 is enabled */ 3195 if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS) 3196 return (EINVAL); 3197 } 3198 3199 if (dbregs->dr[7] & (0x3<<6)) { 3200 /* dr3 is enabled */ 3201 if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS) 3202 return (EINVAL); 3203 } 3204 } 3205 3206 pcb->pcb_dr0 = dbregs->dr[0]; 3207 pcb->pcb_dr1 = dbregs->dr[1]; 3208 pcb->pcb_dr2 = dbregs->dr[2]; 3209 pcb->pcb_dr3 = dbregs->dr[3]; 3210 pcb->pcb_dr6 = dbregs->dr[6]; 3211 pcb->pcb_dr7 = dbregs->dr[7]; 3212 3213 pcb->pcb_flags |= PCB_DBREGS; 3214 } 3215 3216 return (0); 3217 } 3218 3219 /* 3220 * Return > 0 if a hardware breakpoint has been hit, and the 3221 * breakpoint was in user space. Return 0, otherwise. 3222 */ 3223 int 3224 user_dbreg_trap(void) 3225 { 3226 u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */ 3227 u_int64_t bp; /* breakpoint bits extracted from dr6 */ 3228 int nbp; /* number of breakpoints that triggered */ 3229 caddr_t addr[4]; /* breakpoint addresses */ 3230 int i; 3231 3232 dr7 = rdr7(); 3233 if ((dr7 & 0xff) == 0) { 3234 /* 3235 * all GE and LE bits in the dr7 register are zero, 3236 * thus the trap couldn't have been caused by the 3237 * hardware debug registers 3238 */ 3239 return 0; 3240 } 3241 3242 nbp = 0; 3243 dr6 = rdr6(); 3244 bp = dr6 & 0xf; 3245 3246 if (bp == 0) { 3247 /* 3248 * None of the breakpoint bits are set meaning this 3249 * trap was not caused by any of the debug registers 3250 */ 3251 return 0; 3252 } 3253 3254 /* 3255 * at least one of the breakpoints were hit, check to see 3256 * which ones and if any of them are user space addresses 3257 */ 3258 3259 if (bp & 0x01) { 3260 addr[nbp++] = (caddr_t)rdr0(); 3261 } 3262 if (bp & 0x02) { 3263 addr[nbp++] = (caddr_t)rdr1(); 3264 } 3265 if (bp & 0x04) { 3266 addr[nbp++] = (caddr_t)rdr2(); 3267 } 3268 if (bp & 0x08) { 3269 addr[nbp++] = (caddr_t)rdr3(); 3270 } 3271 3272 for (i = 0; i < nbp; i++) { 3273 if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) { 3274 /* 3275 * addr[i] is in user space 3276 */ 3277 return nbp; 3278 } 3279 } 3280 3281 /* 3282 * None of the breakpoints are in user space. 3283 */ 3284 return 0; 3285 } 3286 3287 3288 #ifndef DDB 3289 void 3290 Debugger(const char *msg) 3291 { 3292 kprintf("Debugger(\"%s\") called.\n", msg); 3293 } 3294 #endif /* no DDB */ 3295 3296 #ifdef DDB 3297 3298 /* 3299 * Provide inb() and outb() as functions. They are normally only 3300 * available as macros calling inlined functions, thus cannot be 3301 * called inside DDB. 3302 * 3303 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined. 3304 */ 3305 3306 #undef inb 3307 #undef outb 3308 3309 /* silence compiler warnings */ 3310 u_char inb(u_int); 3311 void outb(u_int, u_char); 3312 3313 u_char 3314 inb(u_int port) 3315 { 3316 u_char data; 3317 /* 3318 * We use %%dx and not %1 here because i/o is done at %dx and not at 3319 * %edx, while gcc generates inferior code (movw instead of movl) 3320 * if we tell it to load (u_short) port. 
3321 */
3322 __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
3323 return (data);
3324 }
3325
3326 void
3327 outb(u_int port, u_char data)
3328 {
3329 u_char al;
3330 /*
3331 * Use an unnecessary assignment to help gcc's register allocator.
3332 * This makes a large difference for gcc-1.40 and a tiny difference
3333 * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for
3334 * best results. gcc-2.6.0 can't handle this.
3335 */
3336 al = data;
3337 __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
3338 }
3339
3340 #endif /* DDB */
3341
3342
3343
3344 /*
3345 * initialize all the SMP locks
3346 */
3347
3348 /* critical region when masking or unmasking interrupts */
3349 struct spinlock_deprecated imen_spinlock;
3350
3351 /* locks com (tty) data/hardware accesses: a FASTINTR() */
3352 struct spinlock_deprecated com_spinlock;
3353
3354 /* lock regions around the clock hardware */
3355 struct spinlock_deprecated clock_spinlock;
3356
3357 static void
3358 init_locks(void)
3359 {
3360 /*
3361 * Get the initial mplock with a count of 1 for the BSP.
3362 * This uses a LOGICAL cpu ID, ie BSP == 0.
3363 */
3364 cpu_get_initial_mplock();
3365 /* DEPRECATED */
3366 spin_init_deprecated(&imen_spinlock);
3367 spin_init_deprecated(&com_spinlock);
3368 spin_init_deprecated(&clock_spinlock);
3369
3370 /* our token pool needs to work early */
3371 lwkt_token_pool_init();
3372 }
3373
3374 boolean_t
3375 cpu_mwait_hint_valid(uint32_t hint)
3376 {
3377 int cx_idx, sub;
3378
3379 cx_idx = MWAIT_EAX_TO_CX(hint);
3380 if (cx_idx >= CPU_MWAIT_CX_MAX)
3381 return FALSE;
3382
3383 sub = MWAIT_EAX_TO_CX_SUB(hint);
3384 if (sub >= cpu_mwait_cx_info[cx_idx].subcnt)
3385 return FALSE;
3386
3387 return TRUE;
3388 }
3389
3390 void
3391 cpu_mwait_cx_no_bmsts(void)
3392 {
3393 atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_STS);
3394 }
3395
3396 void
3397 cpu_mwait_cx_no_bmarb(void)
3398 {
3399 atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_ARB);
3400 }
3401
3402 static int
3403 cpu_mwait_cx_hint2name(int hint, char *name, int namelen, boolean_t allow_auto)
3404 {
3405 int old_cx_idx, sub = 0;
3406
3407 if (hint >= 0) {
3408 old_cx_idx = MWAIT_EAX_TO_CX(hint);
3409 sub = MWAIT_EAX_TO_CX_SUB(hint);
3410 } else if (hint == CPU_MWAIT_HINT_AUTO) {
3411 old_cx_idx = allow_auto ? CPU_MWAIT_C2 : CPU_MWAIT_CX_MAX;
3412 } else if (hint == CPU_MWAIT_HINT_AUTODEEP) {
3413 old_cx_idx = allow_auto ?
CPU_MWAIT_C3 : CPU_MWAIT_CX_MAX; 3414 } else { 3415 old_cx_idx = CPU_MWAIT_CX_MAX; 3416 } 3417 3418 if (!CPU_MWAIT_HAS_CX) 3419 strlcpy(name, "NONE", namelen); 3420 else if (allow_auto && hint == CPU_MWAIT_HINT_AUTO) 3421 strlcpy(name, "AUTO", namelen); 3422 else if (allow_auto && hint == CPU_MWAIT_HINT_AUTODEEP) 3423 strlcpy(name, "AUTODEEP", namelen); 3424 else if (old_cx_idx >= CPU_MWAIT_CX_MAX || 3425 sub >= cpu_mwait_cx_info[old_cx_idx].subcnt) 3426 strlcpy(name, "INVALID", namelen); 3427 else 3428 ksnprintf(name, namelen, "C%d/%d", old_cx_idx, sub); 3429 3430 return old_cx_idx; 3431 } 3432 3433 static int 3434 cpu_mwait_cx_name2hint(char *name, int *hint0, boolean_t allow_auto) 3435 { 3436 int cx_idx, sub, hint; 3437 char *ptr, *start; 3438 3439 if (allow_auto && strcmp(name, "AUTO") == 0) { 3440 hint = CPU_MWAIT_HINT_AUTO; 3441 cx_idx = CPU_MWAIT_C2; 3442 goto done; 3443 } 3444 if (allow_auto && strcmp(name, "AUTODEEP") == 0) { 3445 hint = CPU_MWAIT_HINT_AUTODEEP; 3446 cx_idx = CPU_MWAIT_C3; 3447 goto done; 3448 } 3449 3450 if (strlen(name) < 4 || toupper(name[0]) != 'C') 3451 return -1; 3452 start = &name[1]; 3453 ptr = NULL; 3454 3455 cx_idx = strtol(start, &ptr, 10); 3456 if (ptr == start || *ptr != '/') 3457 return -1; 3458 if (cx_idx < 0 || cx_idx >= CPU_MWAIT_CX_MAX) 3459 return -1; 3460 3461 start = ptr + 1; 3462 ptr = NULL; 3463 3464 sub = strtol(start, &ptr, 10); 3465 if (*ptr != '\0') 3466 return -1; 3467 if (sub < 0 || sub >= cpu_mwait_cx_info[cx_idx].subcnt) 3468 return -1; 3469 3470 hint = MWAIT_EAX_HINT(cx_idx, sub); 3471 done: 3472 *hint0 = hint; 3473 return cx_idx; 3474 } 3475 3476 static int 3477 cpu_mwait_cx_transit(int old_cx_idx, int cx_idx) 3478 { 3479 if (cx_idx >= CPU_MWAIT_C3 && cpu_mwait_c3_preamble) 3480 return EOPNOTSUPP; 3481 if (old_cx_idx < CPU_MWAIT_C3 && cx_idx >= CPU_MWAIT_C3) { 3482 int error; 3483 3484 error = cputimer_intr_powersave_addreq(); 3485 if (error) 3486 return error; 3487 } else if (old_cx_idx >= CPU_MWAIT_C3 && cx_idx < CPU_MWAIT_C3) { 3488 cputimer_intr_powersave_remreq(); 3489 } 3490 return 0; 3491 } 3492 3493 static int 3494 cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS, int *hint0, 3495 boolean_t allow_auto) 3496 { 3497 int error, cx_idx, old_cx_idx, hint; 3498 char name[CPU_MWAIT_CX_NAMELEN]; 3499 3500 hint = *hint0; 3501 old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), 3502 allow_auto); 3503 3504 error = sysctl_handle_string(oidp, name, sizeof(name), req); 3505 if (error != 0 || req->newptr == NULL) 3506 return error; 3507 3508 if (!CPU_MWAIT_HAS_CX) 3509 return EOPNOTSUPP; 3510 3511 cx_idx = cpu_mwait_cx_name2hint(name, &hint, allow_auto); 3512 if (cx_idx < 0) 3513 return EINVAL; 3514 3515 error = cpu_mwait_cx_transit(old_cx_idx, cx_idx); 3516 if (error) 3517 return error; 3518 3519 *hint0 = hint; 3520 return 0; 3521 } 3522 3523 static int 3524 cpu_mwait_cx_setname(struct cpu_idle_stat *stat, const char *cx_name) 3525 { 3526 int error, cx_idx, old_cx_idx, hint; 3527 char name[CPU_MWAIT_CX_NAMELEN]; 3528 3529 KASSERT(CPU_MWAIT_HAS_CX, ("cpu does not support mwait CX extension")); 3530 3531 hint = stat->hint; 3532 old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE); 3533 3534 strlcpy(name, cx_name, sizeof(name)); 3535 cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE); 3536 if (cx_idx < 0) 3537 return EINVAL; 3538 3539 error = cpu_mwait_cx_transit(old_cx_idx, cx_idx); 3540 if (error) 3541 return error; 3542 3543 stat->hint = hint; 3544 return 0; 3545 } 3546 3547 static int 3548 
cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS)
3549 {
3550 int hint = cpu_mwait_halt_global;
3551 int error, cx_idx, cpu;
3552 char name[CPU_MWAIT_CX_NAMELEN], cx_name[CPU_MWAIT_CX_NAMELEN];
3553
3554 cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);
3555
3556 error = sysctl_handle_string(oidp, name, sizeof(name), req);
3557 if (error != 0 || req->newptr == NULL)
3558 return error;
3559
3560 if (!CPU_MWAIT_HAS_CX)
3561 return EOPNOTSUPP;
3562
3563 /* Save name for later per-cpu CX configuration */
3564 strlcpy(cx_name, name, sizeof(cx_name));
3565
3566 cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
3567 if (cx_idx < 0)
3568 return EINVAL;
3569
3570 /* Change per-cpu CX configuration */
3571 for (cpu = 0; cpu < ncpus; ++cpu) {
3572 error = cpu_mwait_cx_setname(&cpu_idle_stats[cpu], cx_name);
3573 if (error)
3574 return error;
3575 }
3576
3577 cpu_mwait_halt_global = hint;
3578 return 0;
3579 }
3580
3581 static int
3582 cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
3583 {
3584 struct cpu_idle_stat *stat = arg1;
3585 int error;
3586
3587 error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
3588 &stat->hint, TRUE);
3589 return error;
3590 }
3591
3592 static int
3593 cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS)
3594 {
3595 int error;
3596
3597 error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
3598 &cpu_mwait_spin, FALSE);
3599 return error;
3600 }
3601
3602 /*
3603 * This manual debugging code is called unconditionally from Xtimer
3604 * (the per-cpu timer interrupt), whether or not the current thread is
3605 * in a critical section, and can be useful in tracking down lockups.
3606 *
3607 * NOTE: MANUAL DEBUG CODE
3608 */
3609 #if 0
3610 static int saveticks[SMP_MAXCPU];
3611 static int savecounts[SMP_MAXCPU];
3612 #endif
3613 static tsc_uclock_t last_tsc[SMP_MAXCPU];
3614
3615 void
3616 pcpu_timer_always(struct intrframe *frame)
3617 {
3618 globaldata_t gd;
3619 thread_t td;
3620 char *top;
3621 char *bot;
3622 char *rbp;
3623 char *rip;
3624 int n;
3625 tsc_uclock_t tsc;
3626
3627 if (flame_poll_debug == 0)
3628 return;
3629 gd = mycpu;
3630 tsc = rdtsc() - last_tsc[gd->gd_cpuid];
3631 if (tsc_frequency == 0 || tsc < tsc_frequency)
3632 return;
3633 last_tsc[gd->gd_cpuid] = rdtsc();
3634
3635 td = gd->gd_curthread;
3636 if (td == NULL)
3637 return;
3638 bot = (char *)td->td_kstack + PAGE_SIZE; /* skip guard */
3639 top = (char *)td->td_kstack + td->td_kstack_size;
3640 if (bot >= top)
3641 return;
3642
3643 rip = (char *)(intptr_t)frame->if_rip;
3644 kprintf("POLL%02d %016lx", gd->gd_cpuid, (intptr_t)rip);
3645 rbp = (char *)(intptr_t)frame->if_rbp;
3646
3647 for (n = 1; n < 8; ++n) {
3648 if (rbp < bot || rbp > top - 8 || ((intptr_t)rbp & 7))
3649 break;
3650 kprintf("<-%016lx", (intptr_t)*(char **)(rbp + 8));
3651 if (*(char **)rbp <= rbp)
3652 break;
3653 rbp = *(char **)rbp;
3654 }
3655 kprintf("\n");
3656 cpu_sfence();
3657 }
3658
3659 SET_DECLARE(smap_open, char);
3660 SET_DECLARE(smap_close, char);
3661
3662 static void
3663 cpu_implement_smap(void)
3664 {
3665 char **scan;
3666
3667 for (scan = SET_BEGIN(smap_open); /* nop -> stac */
3668 scan < SET_LIMIT(smap_open); ++scan) {
3669 (*scan)[0] = 0x0F;
3670 (*scan)[1] = 0x01;
3671 (*scan)[2] = 0xCB;
3672 }
3673 for (scan = SET_BEGIN(smap_close); /* nop -> clac */
3674 scan < SET_LIMIT(smap_close); ++scan) {
3675 (*scan)[0] = 0x0F;
3676 (*scan)[1] = 0x01;
3677 (*scan)[2] = 0xCA;
3678 }
3679 }
3680
3681 /*
3682 * From a hard interrupt
3683 */
3684 int
3685 cpu_interrupt_running(struct thread *td)
3686 {
3687
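/*
 * Returns non-zero if the passed thread is an interrupt thread or if
 * hard interrupts are still pending on this cpu, otherwise returns
 * zero.  The kprintf()s below are debug output gated by clock_debug1.
 */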
struct mdglobaldata *gd = mdcpu; 3688 3689 if (clock_debug1 > 0) { 3690 --clock_debug1; 3691 kprintf("%d %016lx %016lx %016lx\n", 3692 ((td->td_flags & TDF_INTTHREAD) != 0), 3693 gd->gd_ipending[0], 3694 gd->gd_ipending[1], 3695 gd->gd_ipending[2]); 3696 if (td->td_flags & TDF_CLKTHREAD) { 3697 kprintf("CLKTD %s PREEMPT %s\n", 3698 td->td_comm, 3699 (td->td_preempted ? 3700 td->td_preempted->td_comm : "")); 3701 } else { 3702 kprintf("NORTD %s\n", td->td_comm); 3703 } 3704 } 3705 if ((td->td_flags & TDF_INTTHREAD) || 3706 gd->gd_ipending[0] || 3707 gd->gd_ipending[1] || 3708 gd->gd_ipending[2]) { 3709 return 1; 3710 } else { 3711 return 0; 3712 } 3713 } 3714
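/*
 * Illustrative sketch (kept out of the build): how the MWAIT C-state
 * helpers earlier in this file fit together.  The sysctl string form is
 * "C<idx>/<sub>", e.g. "C1/0"; cpu_mwait_cx_name2hint() parses such a
 * name into an EAX hint and cpu_mwait_hint_valid() re-checks a hint
 * against the per-CX sub-state counts probed at boot.  The function
 * below is hypothetical example code, not part of the kernel proper.
 */
#if 0
static void
example_mwait_hint_usage(void)
{
	char name[CPU_MWAIT_CX_NAMELEN];
	int hint = 0;
	int cx_idx;

	/* parse "C1/0" (C1, sub-state 0) into an MWAIT EAX hint */
	strlcpy(name, "C1/0", sizeof(name));
	cx_idx = cpu_mwait_cx_name2hint(name, &hint, FALSE);
	if (cx_idx < 0 || !cpu_mwait_hint_valid(hint))
		return;		/* not supported on this cpu */

	/*
	 * The validated hint could now be installed, e.g. per-cpu via
	 * cpu_mwait_cx_setname() or globally via cpu_mwait_halt_global.
	 */
}
#endif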