/*-
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 2008-2017 The DragonFly Project.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
 */

//#include "use_npx.h"
#include "use_isa.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_msgbuf.h"
#include "opt_swap.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmsg.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/priv.h>
#include <sys/buf.h>
#include <sys/reboot.h>
#include <sys/mbuf.h>
#include <sys/msgbuf.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/bus.h>
#include <sys/usched.h>
#include <sys/reg.h>
#include <sys/sbuf.h>
#include <sys/ctype.h>
#include <sys/serialize.h>
#include <sys/systimer.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>

#include <sys/exec.h>
#include <sys/cons.h>

#include <sys/efi.h>

#include <ddb/ddb.h>

#include <machine/cpu.h>
#include <machine/clock.h>
#include <machine/specialreg.h>
#if 0 /* JG */
#include <machine/bootinfo.h>
#endif
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/pcb_ext.h>
#include <machine/globaldata.h>		/* CPU_prvspace */
#include <machine/smp.h>
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <machine/framebuffer.h>

#ifdef OLD_BUS_ARCH
#include <bus/isa/isa_device.h>
#endif
#include <machine_base/isa/isa_intr.h>
#include <bus/isa/rtc.h>
#include <sys/random.h>
#include <sys/ptrace.h>
#include <machine/sigframe.h>

#include <sys/machintr.h>
#include <machine_base/icu/icu_abi.h>
#include <machine_base/icu/elcr_var.h>
#include <machine_base/apic/lapic.h>
#include <machine_base/apic/ioapic.h>
#include <machine_base/apic/ioapic_abi.h>
#include <machine/mptable.h>

#define PHYSMAP_ENTRIES		10
#define MAXBUFSTRUCTSIZE	((size_t)512 * 1024 * 1024)

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

extern void printcpuinfo(void);	/* XXX header file */
extern void identify_cpu(void);
extern void panicifcpuunsupported(void);

static void cpu_startup(void *);
static void pic_finish(void *);
static void cpu_finish(void *);

static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
static void init_locks(void);

extern void pcpu_timer_always(struct intrframe *);

SYSINIT(cpu, SI_BOOT2_START_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
SYSINIT(pic_finish, SI_BOOT2_FINISH_PIC, SI_ORDER_FIRST, pic_finish, NULL);
SYSINIT(cpu_finish, SI_BOOT2_FINISH_CPU, SI_ORDER_FIRST, cpu_finish, NULL);

#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif

struct privatespace CPU_prvspace_bsp __aligned(4096);
struct privatespace *CPU_prvspace[MAXCPU] = { &CPU_prvspace_bsp };

vm_paddr_t efi_systbl_phys;
int	_udatasel, _ucodesel, _ucode32sel;
u_long	atdevbase;
int64_t tsc_offsets[MAXCPU];
cpumask_t smp_idleinvl_mask;
cpumask_t smp_idleinvl_reqs;

/* MWAIT hint (EAX) or CPU_MWAIT_HINT_ */
__read_mostly static int cpu_mwait_halt_global;
__read_mostly static int clock_debug1;

#if defined(SWTCH_OPTIM_STATS)
extern int swtch_optim_stats;
SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
    CTLFLAG_RD, &swtch_optim_stats, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
    CTLFLAG_RD, &tlb_flush_count, 0, "");
#endif
SYSCTL_INT(_debug, OID_AUTO, clock_debug1,
    CTLFLAG_RW, &clock_debug1, 0, "");
SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_halt,
    CTLFLAG_RD, &cpu_mwait_halt_global, 0, "");
SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_spin,
    CTLFLAG_RD, &cpu_mwait_spin, 0, "monitor/mwait target state");

#define CPU_MWAIT_HAS_CX	\
	((cpu_feature2 & CPUID2_MON) && \
	 (cpu_mwait_feature & CPUID_MWAIT_EXT))

#define CPU_MWAIT_CX_NAMELEN	16

#define CPU_MWAIT_C1		1
#define CPU_MWAIT_C2		2
#define CPU_MWAIT_C3		3
#define CPU_MWAIT_CX_MAX	8

#define CPU_MWAIT_HINT_AUTO	-1	/* C1 and C2 */
#define CPU_MWAIT_HINT_AUTODEEP	-2	/* C3+ */

SYSCTL_NODE(_machdep, OID_AUTO, mwait, CTLFLAG_RW, 0, "MWAIT features");
SYSCTL_NODE(_machdep_mwait, OID_AUTO, CX, CTLFLAG_RW, 0, "MWAIT Cx settings");

struct cpu_mwait_cx {
	int			subcnt;
	char			name[4];
	struct sysctl_ctx_list	sysctl_ctx;
	struct sysctl_oid	*sysctl_tree;
};
static struct cpu_mwait_cx cpu_mwait_cx_info[CPU_MWAIT_CX_MAX];
static char cpu_mwait_cx_supported[256];

static int cpu_mwait_c1_hints_cnt;
static int cpu_mwait_hints_cnt;
static int *cpu_mwait_hints;

static int cpu_mwait_deep_hints_cnt;
static int *cpu_mwait_deep_hints;

#define CPU_IDLE_REPEAT_DEFAULT	750

static u_int cpu_idle_repeat = CPU_IDLE_REPEAT_DEFAULT;
static u_long cpu_idle_repeat_max = CPU_IDLE_REPEAT_DEFAULT;
static u_int cpu_mwait_repeat_shift = 1;

#define CPU_MWAIT_C3_PREAMBLE_BM_ARB	0x1
#define CPU_MWAIT_C3_PREAMBLE_BM_STS	0x2

static int cpu_mwait_c3_preamble =
    CPU_MWAIT_C3_PREAMBLE_BM_ARB |
    CPU_MWAIT_C3_PREAMBLE_BM_STS;

SYSCTL_STRING(_machdep_mwait_CX, OID_AUTO, supported, CTLFLAG_RD,
    cpu_mwait_cx_supported, 0, "MWAIT supported C states");
SYSCTL_INT(_machdep_mwait_CX, OID_AUTO, c3_preamble, CTLFLAG_RD,
    &cpu_mwait_c3_preamble, 0, "C3+ preamble mask");

static int	cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS,
		    int *, boolean_t);
static int	cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int	cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int	cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS);

SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, idle, CTLTYPE_STRING|CTLFLAG_RW,
    NULL, 0, cpu_mwait_cx_idle_sysctl, "A", "");
SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, spin, CTLTYPE_STRING|CTLFLAG_RW,
    NULL, 0, cpu_mwait_cx_spin_sysctl, "A", "");
SYSCTL_UINT(_machdep_mwait_CX, OID_AUTO, repeat_shift, CTLFLAG_RW,
    &cpu_mwait_repeat_shift, 0, "");

long physmem = 0;

u_long ebda_addr = 0;

int imcr_present = 0;

int naps = 0;		/* # of application processors */

u_int base_memory;

static int
sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
{
	u_long pmem = ctob(physmem);
	int error;

	error = sysctl_handle_long(oidp, &pmem, 0, req);

	return (error);
}

SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG|CTLFLAG_RD,
    0, 0, sysctl_hw_physmem, "LU",
    "Total system memory in bytes (number of pages * page size)");
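/*
 * Usage sketch (editor's note, not part of the original file): the handler
 * above backs the standard read-only hw.physmem sysctl, e.g. from userland:
 *
 *	$ sysctl hw.physmem
 *	hw.physmem: 17098907648		(illustrative value)
 *
 * The value reported is ctob(physmem), i.e. the page count converted to
 * bytes.
 */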
static int
sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
{
	u_long usermem = ctob(physmem - vmstats.v_wire_count);
	int error;

	error = sysctl_handle_long(oidp, &usermem, 0, req);

	return (error);
}

SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_ULONG|CTLFLAG_RD,
    0, 0, sysctl_hw_usermem, "LU", "");

static int
sysctl_hw_availpages(SYSCTL_HANDLER_ARGS)
{
	int error;
	u_long availpages;

	availpages = x86_64_btop(avail_end - avail_start);
	error = sysctl_handle_long(oidp, &availpages, 0, req);

	return (error);
}

SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_ULONG|CTLFLAG_RD,
    0, 0, sysctl_hw_availpages, "LU", "");

vm_paddr_t Maxmem;
vm_paddr_t Realmem;

/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
vm_phystable_t phys_avail[VM_PHYSSEG_MAX + 1];
vm_phystable_t dump_avail[VM_PHYSSEG_MAX + 1];

/* must be 1 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END	(NELEM(phys_avail) - 1)
#define DUMP_AVAIL_ARRAY_END	(NELEM(dump_avail) - 1)

static vm_offset_t buffer_sva, buffer_eva;
vm_offset_t clean_sva, clean_eva;
static vm_offset_t pager_sva, pager_eva;
static struct trapframe proc0_tf;

static void cpu_implement_smap(void);

static void
cpu_startup(void *dummy)
{
	caddr_t v;
	vm_size_t size = 0;
	vm_offset_t firstaddr;

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	kprintf("%s", version);
	startrtclock();
	printcpuinfo();
	panicifcpuunsupported();
	if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
		cpu_implement_smap();

	kprintf("real memory = %ju (%ju MB)\n",
	    (intmax_t)Realmem,
	    (intmax_t)Realmem / 1024 / 1024);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		kprintf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx].phys_end != 0; ++indx) {
			vm_paddr_t size1;

			size1 = phys_avail[indx].phys_end -
				phys_avail[indx].phys_beg;

			kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
			    (intmax_t)phys_avail[indx].phys_beg,
			    (intmax_t)phys_avail[indx].phys_end - 1,
			    (intmax_t)size1,
			    (intmax_t)(size1 / PAGE_SIZE));
		}
	}

	/*
	 * Allocate space for system data structures.
	 * The first available kernel virtual address is in "v".
	 * As pages of kernel virtual memory are allocated, "v" is
	 * incremented.  As pages of memory are allocated and cleared,
	 * "firstaddr" is incremented.
	 */

	/*
	 * Make two passes.  The first pass calculates how much memory is
	 * needed and allocates it.  The second pass assigns virtual
	 * addresses to the various data structures.
	 */
	firstaddr = 0;
again:
	v = (caddr_t)firstaddr;

#define	valloc(name, type, num) \
	(name) = (type *)v; v = (caddr_t)((name)+(num))
#define	valloclim(name, type, num, lim) \
	(name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
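/*
 * Illustrative note (editor's sketch, not in the original): valloc() only
 * advances the cursor "v"; it does not touch memory.  On the first pass
 * (firstaddr == 0) the macros run with v starting at 0, so after all the
 * valloc() calls "v" equals the total number of bytes required.  On the
 * second pass v starts at the kmem_alloc()ed base, so the same sequence
 * of calls hands out real addresses.  For example,
 *
 *	valloc(buf, struct buf, nbuf);
 *
 * expands to
 *
 *	(buf) = (struct buf *)v; v = (caddr_t)((buf) + (nbuf));
 */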
	/*
	 * Calculate nbuf such that maxbufspace uses approximately 1/20
	 * of physical memory by default, with a minimum of 50 buffers.
	 *
	 * The calculation is made after discounting 128MB.
	 *
	 * NOTE: maxbufspace is (nbuf * NBUFCALCSIZE) (NBUFCALCSIZE ~= 16KB).
	 *	 nbuf = (kbytes / factor) would cover all of memory.
	 */
	if (nbuf == 0) {
		long factor = NBUFCALCSIZE / 1024;		/* KB/nbuf */
		long kbytes = physmem * (PAGE_SIZE / 1024);	/* physmem */

		nbuf = 50;
		if (kbytes > 128 * 1024)
			nbuf += (kbytes - 128 * 1024) / (factor * 20);
		if (maxbcache && nbuf > maxbcache / NBUFCALCSIZE)
			nbuf = maxbcache / NBUFCALCSIZE;
		if ((size_t)nbuf * sizeof(struct buf) > MAXBUFSTRUCTSIZE) {
			kprintf("Warning: nbuf capped at %ld due to the "
				"reasonability limit\n", nbuf);
			nbuf = MAXBUFSTRUCTSIZE / sizeof(struct buf);
		}
	}
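	/*
	 * Worked example (editor's sketch): on a machine with 8GB of RAM
	 * and a 16KB NBUFCALCSIZE, factor = 16 and kbytes = 8388608, so
	 *
	 *	nbuf = 50 + (8388608 - 131072) / (16 * 20) ~= 25854
	 *
	 * giving maxbufspace of roughly 25854 * 16KB ~= 404MB, about 1/20
	 * of physical memory as the comment above intends.
	 */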
	/*
	 * Do not allow the buffer_map to be more than 1/2 the size of the
	 * kernel_map.
	 */
	if (nbuf > (virtual_end - virtual_start +
		    virtual2_end - virtual2_start) / (MAXBSIZE * 2)) {
		nbuf = (virtual_end - virtual_start +
			virtual2_end - virtual2_start) / (MAXBSIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to kvm\n", nbuf);
	}

	/*
	 * Do not allow the buffer_map to use more than 50% of available
	 * physical-equivalent memory.  Since the VM pages which back
	 * individual buffers are typically wired, having too many bufs
	 * can prevent the system from paging properly.
	 */
	if (nbuf > physmem * PAGE_SIZE / (NBUFCALCSIZE * 2)) {
		nbuf = physmem * PAGE_SIZE / (NBUFCALCSIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to physmem\n", nbuf);
	}

	/*
	 * Do not allow the sizeof(struct buf) * nbuf to exceed 1/4 of
	 * the valloc space which is just the virtual_end - virtual_start
	 * section.  This is typically ~2GB regardless of the amount of
	 * memory, so we use 500MB as a metric.
	 *
	 * This is because we use valloc() to allocate the buf header array.
	 *
	 * NOTE: buffer space in bytes is limited by vfs.*bufspace sysctls.
	 */
	if (nbuf > (virtual_end - virtual_start) / (sizeof(struct buf) * 4)) {
		nbuf = (virtual_end - virtual_start) /
		       (sizeof(struct buf) * 4);
		kprintf("Warning: nbufs capped at %ld due to "
			"valloc considerations\n",
			nbuf);
	}

	nswbuf_mem = lmax(lmin(nbuf / 32, 512), 8);
#ifdef NSWBUF_MIN
	if (nswbuf_mem < NSWBUF_MIN)
		nswbuf_mem = NSWBUF_MIN;
#endif
	nswbuf_kva = lmax(lmin(nbuf / 4, 512), 16);
#ifdef NSWBUF_MIN
	if (nswbuf_kva < NSWBUF_MIN)
		nswbuf_kva = NSWBUF_MIN;
#endif

	valloc(swbuf_mem, struct buf, nswbuf_mem);
	valloc(swbuf_kva, struct buf, nswbuf_kva);
	valloc(buf, struct buf, nbuf);

	/*
	 * End of first pass, size has been calculated so allocate memory
	 */
	if (firstaddr == 0) {
		size = (vm_size_t)(v - firstaddr);
		firstaddr = kmem_alloc(&kernel_map, round_page(size),
				       VM_SUBSYS_BUF);
		if (firstaddr == 0)
			panic("startup: no room for tables");
		goto again;
	}

	/*
	 * End of second pass, addresses have been assigned
	 *
	 * nbuf is an int, make sure we don't overflow the field.
	 *
	 * On 64-bit systems we always reserve maximal allocations for
	 * buffer cache buffers and there are no fragmentation issues,
	 * so the KVA segment does not have to be excessively oversized.
	 */
	if ((vm_size_t)(v - firstaddr) != size)
		panic("startup: table size inconsistency");

	kmem_suballoc(&kernel_map, &clean_map, &clean_sva, &clean_eva,
		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE) +
		      ((nswbuf_mem + nswbuf_kva) * MAXPHYS) + pager_map_size);
	kmem_suballoc(&clean_map, &buffer_map, &buffer_sva, &buffer_eva,
		      ((vm_offset_t)(nbuf + 16) * MAXBSIZE));
	buffer_map.system_map = 1;
	kmem_suballoc(&clean_map, &pager_map, &pager_sva, &pager_eva,
		      ((vm_offset_t)(nswbuf_mem + nswbuf_kva) * MAXPHYS) +
		      pager_map_size);
	pager_map.system_map = 1;
	kprintf("avail memory = %ju (%ju MB)\n",
	    (uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages),
	    (uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages) /
	    1024 / 1024);
}

struct cpu_idle_stat {
	int	hint;
	int	reserved;
	u_long	halt;
	u_long	spin;
	u_long	repeat;
	u_long	repeat_last;
	u_long	repeat_delta;
	u_long	mwait_cx[CPU_MWAIT_CX_MAX];
} __cachealign;

#define CPU_IDLE_STAT_HALT	-1
#define CPU_IDLE_STAT_SPIN	-2

static struct cpu_idle_stat	cpu_idle_stats[MAXCPU];

static int
sysctl_cpu_idle_cnt(SYSCTL_HANDLER_ARGS)
{
	int idx = arg2, cpu, error;
	u_long val = 0;

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].halt;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].spin;
	} else {
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].mwait_cx[idx];
	}

	error = sysctl_handle_quad(oidp, &val, 0, req);
	if (error || req->newptr == NULL)
		return error;

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].halt = 0;
		cpu_idle_stats[0].halt = val;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].spin = 0;
		cpu_idle_stats[0].spin = val;
	} else {
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].mwait_cx[idx] = 0;
		cpu_idle_stats[0].mwait_cx[idx] = val;
	}
	return 0;
}

static void
cpu_mwait_attach(void)
{
	struct sbuf sb;
	int hint_idx, i;

	if (!CPU_MWAIT_HAS_CX)
		return;

	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    (CPUID_TO_FAMILY(cpu_id) > 0xf ||
	     (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
	      CPUID_TO_MODEL(cpu_id) >= 0xf))) {
		int bm_sts = 1;

		/*
		 * Pentium dual-core, Core 2 and beyond do not need any
		 * additional activities to enter deep C-state, i.e. C3(+).
		 */
		cpu_mwait_cx_no_bmarb();

		TUNABLE_INT_FETCH("machdep.cpu.mwait.bm_sts", &bm_sts);
		if (!bm_sts)
			cpu_mwait_cx_no_bmsts();
	}

	sbuf_new(&sb, cpu_mwait_cx_supported,
	    sizeof(cpu_mwait_cx_supported), SBUF_FIXEDLEN);

	for (i = 0; i < CPU_MWAIT_CX_MAX; ++i) {
		struct cpu_mwait_cx *cx = &cpu_mwait_cx_info[i];
		int sub;

		ksnprintf(cx->name, sizeof(cx->name), "C%d", i);

		sysctl_ctx_init(&cx->sysctl_ctx);
		cx->sysctl_tree = SYSCTL_ADD_NODE(&cx->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait), OID_AUTO,
		    cx->name, CTLFLAG_RW, NULL, "Cx control/info");
		if (cx->sysctl_tree == NULL)
			continue;

		cx->subcnt = CPUID_MWAIT_CX_SUBCNT(cpu_mwait_extemu, i);
		SYSCTL_ADD_INT(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "subcnt", CTLFLAG_RD, &cx->subcnt, 0,
		    "sub-state count");
		SYSCTL_ADD_PROC(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "entered", (CTLTYPE_QUAD | CTLFLAG_RW), 0,
		    i, sysctl_cpu_idle_cnt, "Q", "# of times entered");

		for (sub = 0; sub < cx->subcnt; ++sub)
			sbuf_printf(&sb, "C%d/%d ", i, sub);
	}
	sbuf_trim(&sb);
	sbuf_finish(&sb);
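	/*
	 * Editor's note on the hint encoding (an assumption; see the
	 * MWAIT_EAX_* macros in the machine headers for the authoritative
	 * layout): an MWAIT hint is passed in EAX with the target C-state
	 * in the high nibble and the sub-state in the low nibble, which is
	 * what MWAIT_EAX_HINT(cx, sub) builds below and what
	 * MWAIT_EAX_TO_CX()/MWAIT_EAX_TO_CX_SUB() decode when the hints
	 * are printed.
	 */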
	/*
	 * Non-deep C-states
	 */
	cpu_mwait_c1_hints_cnt = cpu_mwait_cx_info[CPU_MWAIT_C1].subcnt;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i)
		cpu_mwait_hints_cnt += cpu_mwait_cx_info[i].subcnt;
	cpu_mwait_hints = kmalloc(sizeof(int) * cpu_mwait_hints_cnt,
	    M_DEVBUF, M_WAITOK);

	hint_idx = 0;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i) {
		int j, subcnt;

		subcnt = cpu_mwait_cx_info[i].subcnt;
		for (j = 0; j < subcnt; ++j) {
			KASSERT(hint_idx < cpu_mwait_hints_cnt,
			    ("invalid mwait hint index %d", hint_idx));
			cpu_mwait_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
			++hint_idx;
		}
	}
	KASSERT(hint_idx == cpu_mwait_hints_cnt,
	    ("mwait hint count %d != index %d",
	     cpu_mwait_hints_cnt, hint_idx));

	if (bootverbose) {
		kprintf("MWAIT hints (%d C1 hints):\n",
		    cpu_mwait_c1_hints_cnt);
		for (i = 0; i < cpu_mwait_hints_cnt; ++i) {
			int hint = cpu_mwait_hints[i];

			kprintf("  C%d/%d hint 0x%04x\n",
			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
			    hint);
		}
	}

	/*
	 * Deep C-states
	 */
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i)
		cpu_mwait_deep_hints_cnt += cpu_mwait_cx_info[i].subcnt;
	cpu_mwait_deep_hints = kmalloc(sizeof(int) * cpu_mwait_deep_hints_cnt,
	    M_DEVBUF, M_WAITOK);

	hint_idx = 0;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i) {
		int j, subcnt;

		subcnt = cpu_mwait_cx_info[i].subcnt;
		for (j = 0; j < subcnt; ++j) {
			KASSERT(hint_idx < cpu_mwait_deep_hints_cnt,
			    ("invalid mwait deep hint index %d", hint_idx));
			cpu_mwait_deep_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
			++hint_idx;
		}
	}
	KASSERT(hint_idx == cpu_mwait_deep_hints_cnt,
	    ("mwait deep hint count %d != index %d",
	     cpu_mwait_deep_hints_cnt, hint_idx));

	if (bootverbose) {
		kprintf("MWAIT deep hints:\n");
		for (i = 0; i < cpu_mwait_deep_hints_cnt; ++i) {
			int hint = cpu_mwait_deep_hints[i];

			kprintf("  C%d/%d hint 0x%04x\n",
			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
			    hint);
		}
	}
	cpu_idle_repeat_max = 256 * cpu_mwait_deep_hints_cnt;

	for (i = 0; i < ncpus; ++i) {
		char name[16];

		ksnprintf(name, sizeof(name), "idle%d", i);
		SYSCTL_ADD_PROC(NULL,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait_CX), OID_AUTO,
		    name, (CTLTYPE_STRING | CTLFLAG_RW), &cpu_idle_stats[i],
		    0, cpu_mwait_cx_pcpu_idle_sysctl, "A", "");
	}
}

static void
cpu_finish(void *dummy __unused)
{
	cpu_setregs();
	cpu_mwait_attach();
}

static void
pic_finish(void *dummy __unused)
{
	/* Log ELCR information */
	elcr_dump();

	/* Log MPTABLE information */
	mptable_pci_int_dump();

	/* Finalize PCI */
	MachIntrABI.finalize();
}
/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
void
sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = lp->lwp_proc;
	struct trapframe *regs;
	struct sigacts *psp = p->p_sigacts;
	struct sigframe sf, *sfp;
	int oonstack;
	char *sp;

	regs = lp->lwp_md.md_regs;
	oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;

	/* Save user context */
	bzero(&sf, sizeof(struct sigframe));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = lp->lwp_sigstk;
	sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
	KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0);
	/* gcc errors out on optimized bcopy */
	_bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe));

	/* Make the size of the saved context visible to userland */
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);

	/* Allocate and validate space for the signal handler context. */
	if ((lp->lwp_flags & LWP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size -
		     sizeof(struct sigframe);
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	} else {
		/* We take red zone into account */
		sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
	}

	/*
	 * XXX AVX needs 64-byte alignment but sigframe has other fields and
	 * the embedded ucontext is not at the front, so aligning this won't
	 * help us.  Fortunately we bcopy in/out of the sigframe, so the
	 * kernel is ok.
	 *
	 * The problem though is if userland winds up trying to use the
	 * context directly.
	 */
	sfp = (struct sigframe *)((intptr_t)sp & ~(intptr_t)0xF);

	/* Translate the signal if appropriate */
	if (p->p_sysent->sv_sigtbl) {
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
	}

	/*
	 * Build the argument list for the signal handler.
	 *
	 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
	 */
	regs->tf_rdi = sig;				/* argument 1 */
	regs->tf_rdx = (register_t)&sfp->sf_uc;		/* argument 3 */

	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/*
		 * Signal handler installed with SA_SIGINFO.
		 *
		 * action(signo, siginfo, ucontext)
		 */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* fill siginfo structure */
		sf.sf_si.si_signo = sig;
		sf.sf_si.si_pid = psp->ps_frominfo[sig].pid;
		sf.sf_si.si_uid = psp->ps_frominfo[sig].uid;
		sf.sf_si.si_code = code;
		sf.sf_si.si_addr = (void *)regs->tf_addr;
	} else {
		/*
		 * Old FreeBSD-style arguments.
		 *
		 * handler (signo, code, [uc], addr)
		 */
		regs->tf_rsi = (register_t)code;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_handler = catcher;
	}

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
#if 0 /* JG */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 =
		    &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}
#endif

	/*
	 * Save the FPU state and reinit the FP unit
	 */
	npxpush(&sf.sf_uc.uc_mcontext);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
		/*
		 * Something is wrong with the stack pointer.
		 * ...Kill the process.
		 */
		sigexit(lp, SIGILL);
	}

	regs->tf_rsp = (register_t)sfp;
	regs->tf_rip = trunc_page64(PS_STRINGS - *(p->p_sysent->sv_szsigcode));
	regs->tf_rip -= SZSIGCODE_EXTRA_BYTES;

	/*
	 * x86 abi specifies that the direction flag must be cleared
	 * on function entry
	 */
	regs->tf_rflags &= ~(PSL_T | PSL_D);

	/*
	 * 64 bit mode has a code and stack selector but
	 * no data or extra selector.  %fs and %gs are not
	 * stored in-context.
	 */
	regs->tf_cs = _ucodesel;
	regs->tf_ss = _udatasel;
	clear_quickret();
}
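/*
 * Userland view (editor's illustrative sketch, not part of this file):
 * a handler installed with SA_SIGINFO receives exactly the three
 * arguments sendsig() stages in %rdi/%rsi/%rdx above:
 *
 *	void handler(int sig, siginfo_t *si, void *ucp);
 *
 *	struct sigaction sa = { 0 };
 *	sa.sa_sigaction = handler;
 *	sa.sa_flags = SA_SIGINFO;
 *	sigaction(SIGSEGV, &sa, NULL);
 *
 * Without SA_SIGINFO the old FreeBSD-style (signo, code, ucontext, addr)
 * argument list is used instead, as set up in the else branch above.
 */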
/*
 * Sanitize the trapframe for a virtual kernel passing control to a custom
 * VM context.  Remove any items that would otherwise create a privilege
 * issue.
 *
 * XXX at the moment we allow userland to set the resume flag.  Is this a
 * bad idea?
 */
int
cpu_sanitize_frame(struct trapframe *frame)
{
	frame->tf_cs = _ucodesel;
	frame->tf_ss = _udatasel;
	/* XXX VM (8086) mode not supported? */
	frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP);
	frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I;

	return(0);
}

/*
 * Sanitize the tls so loading the descriptor does not blow up
 * on us.  For x86_64 we don't have to do anything.
 */
int
cpu_sanitize_tls(struct savetls *tls)
{
	return(0);
}

/*
 * sigreturn(ucontext_t *sigcntxp)
 *
 * System call to clean up state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
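/*
 * Editor's note: EFL_SECURE() accepts a new rflags value only if every bit
 * outside PSL_USERCHANGE is identical to the old value, i.e. the user may
 * toggle user-changeable flags but nothing privileged or reserved.
 * CS_SECURE() requires the requested %cs selector to carry user privilege
 * (ring 3), so a handler cannot arrange a return into a kernel code
 * segment.  For example, a saved context whose rflags tries to raise the
 * IOPL bits fails EFL_SECURE() and sigreturn() returns EINVAL below.
 */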
int
sys_sigreturn(struct sysmsg *sysmsg, const struct sigreturn_args *uap)
{
	struct lwp *lp = curthread->td_lwp;
	struct trapframe *regs;
	ucontext_t uc;
	ucontext_t *ucp;
	register_t rflags;
	int cs;
	int error;

	/*
	 * We have to copy the information into kernel space so userland
	 * can't modify it while we are sniffing it.
	 */
	regs = lp->lwp_md.md_regs;
	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error)
		return (error);
	ucp = &uc;
	rflags = ucp->uc_mcontext.mc_rflags;

	/* VM (8086) mode not supported */
	rflags &= ~PSL_VM_UNSUPP;

#if 0 /* JG */
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (lp->lwp_thread->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* go back to user mode if both flags are set */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(lp, SIGBUS, 0);

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = tf->tf_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
		tf->tf_gs = _udatasel;
	} else
#endif
	{
		/*
		 * Don't allow users to change privileged or reserved flags.
		 *
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
			kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags);
			return(EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		cs = ucp->uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			kprintf("sigreturn: cs = 0x%x\n", cs);
			trapsignal(lp, SIGBUS, T_PROTFLT);
			return(EINVAL);
		}
		/* gcc errors out on optimized bcopy */
		_bcopy(&ucp->uc_mcontext.mc_rdi, regs,
		    sizeof(struct trapframe));
	}

	/*
	 * Restore the FPU state from the frame
	 */
	crit_enter();
	npxpop(&ucp->uc_mcontext);

	if (ucp->uc_mcontext.mc_onstack & 1)
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	else
		lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;

	lp->lwp_sigmask = ucp->uc_sigmask;
	SIG_CANTMASK(lp->lwp_sigmask);
	clear_quickret();
	crit_exit();
	return(EJUSTRETURN);
}

/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */
void
cpu_boot(int howto)
{
}

/*
 * Shutdown the CPU as much as possible
 */
void
cpu_halt(void)
{
	for (;;)
		__asm__ __volatile("hlt");
}

/*
 * cpu_idle() represents the idle LWKT.  You cannot return from this function
 * (unless you want to blow things up!).  Instead we look for runnable threads
 * and loop or halt as appropriate.  Giant is not held on entry to the thread.
 *
 * The main loop is entered with a critical section held, we must release
 * the critical section before doing anything else.  lwkt_switch() will
 * check for pending interrupts due to entering and exiting its own
 * critical section.
 *
 * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up.
 *	 However, there are cases where the idlethread will be entered with
 *	 the possibility that no IPI will occur and in such cases
 *	 lwkt_switch() sets TDF_IDLE_NOHLT.
 *
 * NOTE: cpu_idle_repeat determines how many entries into the idle thread
 *	 must occur before it starts using ACPI halt.
 *
 * NOTE: Value overridden in hammer_time().
 */
static int	cpu_idle_hlt = 2;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_repeat, CTLFLAG_RW,
    &cpu_idle_repeat, 0, "Idle entries before acpi hlt");

SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_hltcnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_HALT, sysctl_cpu_idle_cnt, "Q", "Idle loop entry halts");
SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_spincnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_SPIN, sysctl_cpu_idle_cnt, "Q", "Idle loop entry spins");
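/*
 * Usage sketch (editor's note): the idle strategy can be changed at
 * runtime through the sysctls defined above, e.g.:
 *
 *	# sysctl machdep.cpu_idle_hlt=3		(always use the ACPI halt)
 *	# sysctl machdep.cpu_idle_repeat=1000
 *
 * The meaning of each mode value is documented in the large comment
 * inside cpu_idle() below.
 */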
static void
cpu_idle_default_hook(void)
{
	/*
	 * We must guarantee that hlt is exactly the instruction
	 * following the sti.
	 */
	__asm __volatile("sti; hlt");
}

/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = cpu_idle_default_hook;

static __inline int
cpu_mwait_cx_hint(struct cpu_idle_stat *stat)
{
	int hint, cx_idx;
	u_int idx;

	hint = stat->hint;
	if (hint >= 0)
		goto done;

	idx = (stat->repeat + stat->repeat_last + stat->repeat_delta) >>
	    cpu_mwait_repeat_shift;
	if (idx >= cpu_mwait_c1_hints_cnt) {
		/* Step up faster, once we walked through all C1 states */
		stat->repeat_delta += 1 << (cpu_mwait_repeat_shift + 1);
	}
	if (hint == CPU_MWAIT_HINT_AUTODEEP) {
		if (idx >= cpu_mwait_deep_hints_cnt)
			idx = cpu_mwait_deep_hints_cnt - 1;
		hint = cpu_mwait_deep_hints[idx];
	} else {
		if (idx >= cpu_mwait_hints_cnt)
			idx = cpu_mwait_hints_cnt - 1;
		hint = cpu_mwait_hints[idx];
	}
done:
	cx_idx = MWAIT_EAX_TO_CX(hint);
	if (cx_idx >= 0 && cx_idx < CPU_MWAIT_CX_MAX)
		stat->mwait_cx[cx_idx]++;
	return hint;
}

void
cpu_idle(void)
{
	globaldata_t gd = mycpu;
	struct cpu_idle_stat *stat = &cpu_idle_stats[gd->gd_cpuid];
	struct thread *td __debugvar = gd->gd_curthread;
	int reqflags;

	stat->repeat = stat->repeat_last = cpu_idle_repeat_max;

	crit_exit();
	KKASSERT(td->td_critcount == 0);

	for (;;) {
		/*
		 * See if there are any LWKTs ready to go.
		 */
		lwkt_switch();

		/*
		 * When halting inside a cli we must check for reqflags
		 * races, particularly [re]schedule requests.  Running
		 * splz() does the job.
		 *
		 * cpu_idle_hlt:
		 *	0	Never halt, just spin
		 *
		 *	1	Always use MONITOR/MWAIT if avail, HLT
		 *		otherwise.
		 *
		 *		Better default for modern (Haswell+) Intel
		 *		cpus.
		 *
		 *	2	Use HLT/MONITOR/MWAIT up to a point and then
		 *		use the ACPI halt (default).  This is a hybrid
		 *		approach.  See machdep.cpu_idle_repeat.
		 *
		 *		Better default for modern AMD cpus and older
		 *		Intel cpus.
		 *
		 *	3	Always use the ACPI halt.  This typically
		 *		eats the least amount of power but the cpu
		 *		will be slow waking up.  Slows down e.g.
		 *		compiles and other pipe/event oriented stuff.
		 *
		 *		Usually the best default for AMD cpus.
		 *
		 *	4	Always use HLT.
		 *
		 *	5	Always spin.
		 *
		 * NOTE: Interrupts are enabled and we are not in a critical
		 *	 section.
		 *
		 * NOTE: Preemptions do not reset gd_idle_repeat.  Also we
		 *	 don't bother capping gd_idle_repeat, it is ok if
		 *	 it overflows (we do make it unsigned, however).
		 *
		 * Implement optimized invltlb operations when halted
		 * in idle.  By setting the bit in smp_idleinvl_mask
		 * we inform other cpus that they can set _reqs to
		 * request an invltlb.  Currently the code that does so
		 * sets the bits in _reqs anyway, but then checks _mask
		 * to determine if it can assume the invltlb will execute.
		 *
		 * A critical section is required to ensure that interrupts
		 * do not fully run until after we've had a chance to execute
		 * the request.
		 */
		if (gd->gd_idle_repeat == 0) {
			stat->repeat = (stat->repeat + stat->repeat_last) >> 1;
			if (stat->repeat > cpu_idle_repeat_max)
				stat->repeat = cpu_idle_repeat_max;
			stat->repeat_last = 0;
			stat->repeat_delta = 0;
		}
		++stat->repeat_last;

		/*
		 * General idle thread halt code
		 *
		 * IBRS NOTES - IBRS is a SPECTRE mitigation.  When going
		 *		idle, disable IBRS to reduce hyperthread
		 *		overhead.
		 */
		++gd->gd_idle_repeat;

		switch(cpu_idle_hlt) {
		default:
		case 0:
			/*
			 * Always spin
			 */
			;
do_spin:
			splz();
			__asm __volatile("sti");
			stat->spin++;
			crit_enter_gd(gd);
			crit_exit_gd(gd);
			break;
		case 2:
			/*
			 * Use MONITOR/MWAIT (or HLT) for a few cycles,
			 * then start using the ACPI halt code if we
			 * continue to be idle.
			 */
			if (gd->gd_idle_repeat >= cpu_idle_repeat)
				goto do_acpi;
			/* FALL THROUGH */
		case 1:
			/*
			 * Always use MONITOR/MWAIT (will use HLT if
			 * MONITOR/MWAIT not available).
			 */
			if (cpu_mi_feature & CPU_MI_MONITOR) {
				splz(); /* XXX */
				reqflags = gd->gd_reqflags;
				if (reqflags & RQF_IDLECHECK_WK_MASK)
					goto do_spin;
				crit_enter_gd(gd);
				ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask,
				    gd->gd_cpuid);
				/*
				 * IBRS/STIBP
				 */
				if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL,
					    pscpu->trampoline.tr_pcb_spec_ctrl[1] &
					    (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP));
				}
				cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
				    cpu_mwait_cx_hint(stat), 0);
				if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL,
					    pscpu->trampoline.tr_pcb_spec_ctrl[0] &
					    (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP));
				}
				stat->halt++;
				ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
				    gd->gd_cpuid);
				if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
				    gd->gd_cpuid)) {
					cpu_invltlb();
					cpu_mfence();
				}
				crit_exit_gd(gd);
				break;
			}
			/* FALLTHROUGH */
		case 4:
			/*
			 * Use HLT
			 */
			__asm __volatile("cli");
			splz();
			crit_enter_gd(gd);
			if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
				ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask,
				    gd->gd_cpuid);
				if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL,
					    pscpu->trampoline.tr_pcb_spec_ctrl[1] &
					    (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP));
				}
				cpu_idle_default_hook();
				if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL,
					    pscpu->trampoline.tr_pcb_spec_ctrl[0] &
					    (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP));
				}
				ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
				    gd->gd_cpuid);
				if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
				    gd->gd_cpuid)) {
					cpu_invltlb();
					cpu_mfence();
				}
			}
			__asm __volatile("sti");
			stat->halt++;
			crit_exit_gd(gd);
			break;
		case 3:
			/*
			 * Use ACPI halt
			 */
			;
do_acpi:
			__asm __volatile("cli");
			splz();
			crit_enter_gd(gd);
			if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
				ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask,
				    gd->gd_cpuid);
				if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL,
					    pscpu->trampoline.tr_pcb_spec_ctrl[1] &
					    (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP));
				}
				cpu_idle_hook();
				if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
				    SPEC_CTRL_DUMMY_ENABLE) {
					wrmsr(MSR_SPEC_CTRL,
					    pscpu->trampoline.tr_pcb_spec_ctrl[0] &
					    (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP));
				}
				ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
				    gd->gd_cpuid);
				if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
				    gd->gd_cpuid)) {
					cpu_invltlb();
					cpu_mfence();
				}
			}
			__asm __volatile("sti");
			stat->halt++;
			crit_exit_gd(gd);
			break;
		}
	}
}
/*
 * Called from deep ACPI via cpu_idle_hook() (see above) to actually halt
 * the cpu in C1.  ACPI might use other halt methods for deeper states
 * and not reach here.
 *
 * For now we always use HLT as we are not sure what ACPI may have actually
 * done.  MONITOR/MWAIT might not be appropriate.
 *
 * NOTE: MONITOR/MWAIT does not appear to throttle AMD cpus, while HLT
 *	 does.  On Intel, MONITOR/MWAIT does appear to throttle the cpu.
 */
void
cpu_idle_halt(void)
{
	globaldata_t gd;

	gd = mycpu;
#if 0
	/* DISABLED FOR NOW */
	struct cpu_idle_stat *stat;
	int reqflags;

	if ((cpu_idle_hlt == 1 || cpu_idle_hlt == 2) &&
	    (cpu_mi_feature & CPU_MI_MONITOR) &&
	    cpu_vendor_id != CPU_VENDOR_AMD) {
		/*
		 * Use MONITOR/MWAIT
		 *
		 * (NOTE: On ryzen, MWAIT does not throttle clocks, so we
		 *  have to use HLT)
		 */
		stat = &cpu_idle_stats[gd->gd_cpuid];
		reqflags = gd->gd_reqflags;
		if ((reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
			__asm __volatile("sti");
			cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
			    cpu_mwait_cx_hint(stat), 0);
		} else {
			__asm __volatile("sti; pause");
		}
	} else
#endif
	{
		/*
		 * Use HLT
		 */
		if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0)
			__asm __volatile("sti; hlt");
		else
			__asm __volatile("sti; pause");
	}
}

/*
 * Called in a loop indirectly via Xcpustop
 */
void
cpu_smp_stopped(void)
{
	globaldata_t gd = mycpu;
	volatile __uint64_t *ptr;
	__uint64_t ovalue;

	ptr = CPUMASK_ADDR(started_cpus, gd->gd_cpuid);
	ovalue = *ptr;
	if ((ovalue & CPUMASK_SIMPLE(gd->gd_cpuid & 63)) == 0) {
		if (cpu_mi_feature & CPU_MI_MONITOR) {
			if (cpu_mwait_hints) {
				cpu_mmw_pause_long(__DEVOLATILE(void *, ptr),
				    ovalue,
				    cpu_mwait_hints[cpu_mwait_hints_cnt - 1],
				    0);
			} else {
				cpu_mmw_pause_long(__DEVOLATILE(void *, ptr),
				    ovalue, 0, 0);
			}
		} else {
			cpu_halt();	/* depend on lapic timer */
		}
	}
}

/*
 * This routine is called if a spinlock has been held through the
 * exponential backoff period and is seriously contested.  On a real cpu
 * we let it spin.
 */
void
cpu_spinlock_contested(void)
{
	cpu_pause();
}
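/*
 * Editor's note on the stack setup in exec_setregs() below: the x86_64
 * ELF ABI wants (%rsp + 8) to be a multiple of 16 at function entry (the
 * 8 accounts for the return address a call would have pushed).  The
 * expression used below,
 *
 *	((stack - 8) & ~0xFul) + 8
 *
 * rounds the initial user stack pointer down to that alignment.
 */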
/*
 * Clear registers on exec
 */
void
exec_setregs(u_long entry, u_long stack, u_long ps_strings)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct pcb *pcb = td->td_pcb;
	struct trapframe *regs = lp->lwp_md.md_regs;

	user_ldt_free(pcb);

	clear_quickret();
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = entry;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;	/* align the stack */
	regs->tf_rdi = stack;				/* argv */
	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_rbx = ps_strings;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;	/* JG set bit 10? */
		if (pcb == td->td_pcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		pcb->pcb_flags &= ~PCB_DBREGS;
	}

	/*
	 * Initialize the math emulator (if any) for the current process.
	 * Actually, just clear the bit that says that the emulator has
	 * been initialized.  Initialization is delayed until the process
	 * traps to the emulator (if it is done at all) mainly because
	 * emulators don't provide an entry point for initialization.
	 */
	pcb->pcb_flags &= ~FP_SOFTFP;

	/*
	 * NOTE: do not set CR0_TS here.  npxinit() must do it after clearing
	 *	 gd_npxthread.  Otherwise a preemptive interrupt thread
	 *	 may panic in npxdna().
	 */
	crit_enter();
	load_cr0(rcr0() | CR0_MP);

	/*
	 * NOTE: The MSR values must be correct so we can return to
	 *	 userland.  gd_user_fs/gs must be correct so the switch
	 *	 code knows what the current MSR values are.
	 */
	pcb->pcb_fsbase = 0;	/* Values loaded from PCB on switch */
	pcb->pcb_gsbase = 0;
	mdcpu->gd_user_fs = 0;	/* Cache of current MSR values */
	mdcpu->gd_user_gs = 0;
	wrmsr(MSR_FSBASE, 0);	/* Set MSR values for return to userland */
	wrmsr(MSR_KGSBASE, 0);

	/* Initialize the npx (if any) for the current process. */
	npxinit();
	crit_exit();

	pcb->pcb_ds = _udatasel;
	pcb->pcb_es = _udatasel;
	pcb->pcb_fs = _udatasel;
	pcb->pcb_gs = _udatasel;
}

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	cr0 |= CR0_NE;			/* Done by npxinit() */
	cr0 |= CR0_MP | CR0_TS;		/* Done at every execve() too. */
	cr0 |= CR0_WP | CR0_AM;
	load_cr0(cr0);
	load_gs(_udatasel);
}

static int
sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
	    req);
	if (!error && req->newptr)
		resettodr();
	return (error);
}

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
    &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
    CTLFLAG_RW, &disable_rtc_set, 0, "");

#if 0 /* JG */
SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
    CTLFLAG_RD, &bootinfo, bootinfo, "");
#endif

SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
    CTLFLAG_RW, &wall_cmos_clock, 0, "");

static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");

/*
 * Initialize x86 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

int _default_ldt;
struct user_segment_descriptor gdt[NGDT * MAXCPU];	/* global descriptor table */
struct gate_descriptor idt_arr[MAXCPU][NIDT];
#if 0 /* JG */
union descriptor ldt[NLDT];		/* local descriptor table */
#endif

/* table descriptors - used to load tables by cpu */
struct region_descriptor r_gdt;
struct region_descriptor r_idt_arr[MAXCPU];

/* JG proc0paddr is a virtual address */
void *proc0paddr;
/* JG alignment? */
char proc0paddr_buff[LWKT_THREAD_STACK];
/* software prototypes -- in more palatable form */
struct soft_segment_descriptor gdt_segs[] = {
	/* GNULL_SEL	0 Null Descriptor */
	{	0x0,		/* segment base address  */
		0x0,		/* length */
		0,		/* segment type */
		0,		/* segment descriptor priority level */
		0,		/* segment descriptor present */
		0,		/* long */
		0,		/* default 32 vs 16 bit size */
		0		/* limit granularity (byte/page units)*/ },
	/* GCODE_SEL	1 Code Descriptor for kernel */
	{	0x0,		/* segment base address  */
		0xfffff,	/* length - all address space */
		SDT_MEMERA,	/* segment type */
		SEL_KPL,	/* segment descriptor priority level */
		1,		/* segment descriptor present */
		1,		/* long */
		0,		/* default 32 vs 16 bit size */
		1		/* limit granularity (byte/page units)*/ },
	/* GDATA_SEL	2 Data Descriptor for kernel */
	{	0x0,		/* segment base address  */
		0xfffff,	/* length - all address space */
		SDT_MEMRWA,	/* segment type */
		SEL_KPL,	/* segment descriptor priority level */
		1,		/* segment descriptor present */
		1,		/* long */
		0,		/* default 32 vs 16 bit size */
		1		/* limit granularity (byte/page units)*/ },
	/* GUCODE32_SEL	3 32 bit Code Descriptor for user */
	{	0x0,		/* segment base address  */
		0xfffff,	/* length - all address space */
		SDT_MEMERA,	/* segment type */
		SEL_UPL,	/* segment descriptor priority level */
		1,		/* segment descriptor present */
		0,		/* long */
		1,		/* default 32 vs 16 bit size */
		1		/* limit granularity (byte/page units)*/ },
	/* GUDATA_SEL	4 32/64 bit Data Descriptor for user */
	{	0x0,		/* segment base address  */
		0xfffff,	/* length - all address space */
		SDT_MEMRWA,	/* segment type */
		SEL_UPL,	/* segment descriptor priority level */
		1,		/* segment descriptor present */
		0,		/* long */
		1,		/* default 32 vs 16 bit size */
		1		/* limit granularity (byte/page units)*/ },
	/* GUCODE_SEL	5 64 bit Code Descriptor for user */
	{	0x0,		/* segment base address  */
		0xfffff,	/* length - all address space */
		SDT_MEMERA,	/* segment type */
		SEL_UPL,	/* segment descriptor priority level */
		1,		/* segment descriptor present */
		1,		/* long */
		0,		/* default 32 vs 16 bit size */
		1		/* limit granularity (byte/page units)*/ },
	/* GPROC0_SEL	6 Proc 0 Tss Descriptor */
	{	0x0,		/* segment base address */
		sizeof(struct x86_64tss)-1,/* length - all address space */
		SDT_SYSTSS,	/* segment type */
		SEL_KPL,	/* segment descriptor priority level */
		1,		/* segment descriptor present */
		0,		/* long */
		0,		/* unused - default 32 vs 16 bit size */
		0		/* limit granularity (byte/page units)*/ },
	/* Actually, the TSS is a system descriptor which is double size */
	{	0x0,		/* segment base address  */
		0x0,		/* length */
		0,		/* segment type */
		0,		/* segment descriptor priority level */
		0,		/* segment descriptor present */
		0,		/* long */
		0,		/* default 32 vs 16 bit size */
		0		/* limit granularity (byte/page units)*/ },
	/* GUGS32_SEL	8 32 bit GS Descriptor for user */
	{	0x0,		/* segment base address  */
		0xfffff,	/* length - all address space */
		SDT_MEMRWA,	/* segment type */
		SEL_UPL,	/* segment descriptor priority level */
		1,		/* segment descriptor present */
		0,		/* long */
		1,		/* default 32 vs 16 bit size */
		1		/* limit granularity (byte/page units)*/ },
};
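/*
 * Editor's note for setidt_global()/setidt() below: a long-mode interrupt
 * gate splits the handler address across the descriptor, which is why the
 * functions store the low 16 bits in gd_looffset and everything above bit
 * 15 in gd_hioffset (the ">> 16" shift).  gd_ist selects an optional
 * Interrupt Stack Table entry in the TSS, and gd_dpl controls which
 * privilege level may reach the vector via a software "int".
 */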
}; 1714 1715 void 1716 setidt_global(int idx, inthand_t *func, int typ, int dpl, int ist) 1717 { 1718 int cpu; 1719 1720 for (cpu = 0; cpu < MAXCPU; ++cpu) { 1721 struct gate_descriptor *ip = &idt_arr[cpu][idx]; 1722 1723 ip->gd_looffset = (uintptr_t)func; 1724 ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL); 1725 ip->gd_ist = ist; 1726 ip->gd_xx = 0; 1727 ip->gd_type = typ; 1728 ip->gd_dpl = dpl; 1729 ip->gd_p = 1; 1730 ip->gd_hioffset = ((uintptr_t)func)>>16 ; 1731 } 1732 } 1733 1734 void 1735 setidt(int idx, inthand_t *func, int typ, int dpl, int ist, int cpu) 1736 { 1737 struct gate_descriptor *ip; 1738 1739 KASSERT(cpu >= 0 && cpu < ncpus, ("invalid cpu %d", cpu)); 1740 1741 ip = &idt_arr[cpu][idx]; 1742 ip->gd_looffset = (uintptr_t)func; 1743 ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL); 1744 ip->gd_ist = ist; 1745 ip->gd_xx = 0; 1746 ip->gd_type = typ; 1747 ip->gd_dpl = dpl; 1748 ip->gd_p = 1; 1749 ip->gd_hioffset = ((uintptr_t)func)>>16 ; 1750 } 1751 1752 #define IDTVEC(name) __CONCAT(X,name) 1753 1754 extern inthand_t 1755 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), 1756 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), 1757 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), 1758 IDTVEC(page), IDTVEC(mchk), IDTVEC(fpu), IDTVEC(align), 1759 IDTVEC(xmm), IDTVEC(dblfault), 1760 IDTVEC(fast_syscall), IDTVEC(fast_syscall32); 1761 1762 extern inthand_t 1763 IDTVEC(rsvd00), IDTVEC(rsvd01), IDTVEC(rsvd02), IDTVEC(rsvd03), 1764 IDTVEC(rsvd04), IDTVEC(rsvd05), IDTVEC(rsvd06), IDTVEC(rsvd07), 1765 IDTVEC(rsvd08), IDTVEC(rsvd09), IDTVEC(rsvd0a), IDTVEC(rsvd0b), 1766 IDTVEC(rsvd0c), IDTVEC(rsvd0d), IDTVEC(rsvd0e), IDTVEC(rsvd0f), 1767 IDTVEC(rsvd10), IDTVEC(rsvd11), IDTVEC(rsvd12), IDTVEC(rsvd13), 1768 IDTVEC(rsvd14), IDTVEC(rsvd15), IDTVEC(rsvd16), IDTVEC(rsvd17), 1769 IDTVEC(rsvd18), IDTVEC(rsvd19), IDTVEC(rsvd1a), IDTVEC(rsvd1b), 1770 IDTVEC(rsvd1c), IDTVEC(rsvd1d), IDTVEC(rsvd1e), IDTVEC(rsvd1f), 1771 IDTVEC(rsvd20), IDTVEC(rsvd21), IDTVEC(rsvd22), IDTVEC(rsvd23), 1772 IDTVEC(rsvd24), IDTVEC(rsvd25), IDTVEC(rsvd26), IDTVEC(rsvd27), 1773 IDTVEC(rsvd28), IDTVEC(rsvd29), IDTVEC(rsvd2a), IDTVEC(rsvd2b), 1774 IDTVEC(rsvd2c), IDTVEC(rsvd2d), IDTVEC(rsvd2e), IDTVEC(rsvd2f), 1775 IDTVEC(rsvd30), IDTVEC(rsvd31), IDTVEC(rsvd32), IDTVEC(rsvd33), 1776 IDTVEC(rsvd34), IDTVEC(rsvd35), IDTVEC(rsvd36), IDTVEC(rsvd37), 1777 IDTVEC(rsvd38), IDTVEC(rsvd39), IDTVEC(rsvd3a), IDTVEC(rsvd3b), 1778 IDTVEC(rsvd3c), IDTVEC(rsvd3d), IDTVEC(rsvd3e), IDTVEC(rsvd3f), 1779 IDTVEC(rsvd40), IDTVEC(rsvd41), IDTVEC(rsvd42), IDTVEC(rsvd43), 1780 IDTVEC(rsvd44), IDTVEC(rsvd45), IDTVEC(rsvd46), IDTVEC(rsvd47), 1781 IDTVEC(rsvd48), IDTVEC(rsvd49), IDTVEC(rsvd4a), IDTVEC(rsvd4b), 1782 IDTVEC(rsvd4c), IDTVEC(rsvd4d), IDTVEC(rsvd4e), IDTVEC(rsvd4f), 1783 IDTVEC(rsvd50), IDTVEC(rsvd51), IDTVEC(rsvd52), IDTVEC(rsvd53), 1784 IDTVEC(rsvd54), IDTVEC(rsvd55), IDTVEC(rsvd56), IDTVEC(rsvd57), 1785 IDTVEC(rsvd58), IDTVEC(rsvd59), IDTVEC(rsvd5a), IDTVEC(rsvd5b), 1786 IDTVEC(rsvd5c), IDTVEC(rsvd5d), IDTVEC(rsvd5e), IDTVEC(rsvd5f), 1787 IDTVEC(rsvd60), IDTVEC(rsvd61), IDTVEC(rsvd62), IDTVEC(rsvd63), 1788 IDTVEC(rsvd64), IDTVEC(rsvd65), IDTVEC(rsvd66), IDTVEC(rsvd67), 1789 IDTVEC(rsvd68), IDTVEC(rsvd69), IDTVEC(rsvd6a), IDTVEC(rsvd6b), 1790 IDTVEC(rsvd6c), IDTVEC(rsvd6d), IDTVEC(rsvd6e), IDTVEC(rsvd6f), 1791 IDTVEC(rsvd70), IDTVEC(rsvd71), IDTVEC(rsvd72), IDTVEC(rsvd73), 1792 IDTVEC(rsvd74), IDTVEC(rsvd75), IDTVEC(rsvd76), IDTVEC(rsvd77), 1793 IDTVEC(rsvd78), IDTVEC(rsvd79), IDTVEC(rsvd7a), IDTVEC(rsvd7b), 1794 
	IDTVEC(rsvd7c), IDTVEC(rsvd7d), IDTVEC(rsvd7e), IDTVEC(rsvd7f),
	IDTVEC(rsvd80), IDTVEC(rsvd81), IDTVEC(rsvd82), IDTVEC(rsvd83),
	IDTVEC(rsvd84), IDTVEC(rsvd85), IDTVEC(rsvd86), IDTVEC(rsvd87),
	IDTVEC(rsvd88), IDTVEC(rsvd89), IDTVEC(rsvd8a), IDTVEC(rsvd8b),
	IDTVEC(rsvd8c), IDTVEC(rsvd8d), IDTVEC(rsvd8e), IDTVEC(rsvd8f),
	IDTVEC(rsvd90), IDTVEC(rsvd91), IDTVEC(rsvd92), IDTVEC(rsvd93),
	IDTVEC(rsvd94), IDTVEC(rsvd95), IDTVEC(rsvd96), IDTVEC(rsvd97),
	IDTVEC(rsvd98), IDTVEC(rsvd99), IDTVEC(rsvd9a), IDTVEC(rsvd9b),
	IDTVEC(rsvd9c), IDTVEC(rsvd9d), IDTVEC(rsvd9e), IDTVEC(rsvd9f),
	IDTVEC(rsvda0), IDTVEC(rsvda1), IDTVEC(rsvda2), IDTVEC(rsvda3),
	IDTVEC(rsvda4), IDTVEC(rsvda5), IDTVEC(rsvda6), IDTVEC(rsvda7),
	IDTVEC(rsvda8), IDTVEC(rsvda9), IDTVEC(rsvdaa), IDTVEC(rsvdab),
	IDTVEC(rsvdac), IDTVEC(rsvdad), IDTVEC(rsvdae), IDTVEC(rsvdaf),
	IDTVEC(rsvdb0), IDTVEC(rsvdb1), IDTVEC(rsvdb2), IDTVEC(rsvdb3),
	IDTVEC(rsvdb4), IDTVEC(rsvdb5), IDTVEC(rsvdb6), IDTVEC(rsvdb7),
	IDTVEC(rsvdb8), IDTVEC(rsvdb9), IDTVEC(rsvdba), IDTVEC(rsvdbb),
	IDTVEC(rsvdbc), IDTVEC(rsvdbd), IDTVEC(rsvdbe), IDTVEC(rsvdbf),
	IDTVEC(rsvdc0), IDTVEC(rsvdc1), IDTVEC(rsvdc2), IDTVEC(rsvdc3),
	IDTVEC(rsvdc4), IDTVEC(rsvdc5), IDTVEC(rsvdc6), IDTVEC(rsvdc7),
	IDTVEC(rsvdc8), IDTVEC(rsvdc9), IDTVEC(rsvdca), IDTVEC(rsvdcb),
	IDTVEC(rsvdcc), IDTVEC(rsvdcd), IDTVEC(rsvdce), IDTVEC(rsvdcf),
	IDTVEC(rsvdd0), IDTVEC(rsvdd1), IDTVEC(rsvdd2), IDTVEC(rsvdd3),
	IDTVEC(rsvdd4), IDTVEC(rsvdd5), IDTVEC(rsvdd6), IDTVEC(rsvdd7),
	IDTVEC(rsvdd8), IDTVEC(rsvdd9), IDTVEC(rsvdda), IDTVEC(rsvddb),
	IDTVEC(rsvddc), IDTVEC(rsvddd), IDTVEC(rsvdde), IDTVEC(rsvddf),
	IDTVEC(rsvde0), IDTVEC(rsvde1), IDTVEC(rsvde2), IDTVEC(rsvde3),
	IDTVEC(rsvde4), IDTVEC(rsvde5), IDTVEC(rsvde6), IDTVEC(rsvde7),
	IDTVEC(rsvde8), IDTVEC(rsvde9), IDTVEC(rsvdea), IDTVEC(rsvdeb),
	IDTVEC(rsvdec), IDTVEC(rsvded), IDTVEC(rsvdee), IDTVEC(rsvdef),
	IDTVEC(rsvdf0), IDTVEC(rsvdf1), IDTVEC(rsvdf2), IDTVEC(rsvdf3),
	IDTVEC(rsvdf4), IDTVEC(rsvdf5), IDTVEC(rsvdf6), IDTVEC(rsvdf7),
	IDTVEC(rsvdf8), IDTVEC(rsvdf9), IDTVEC(rsvdfa), IDTVEC(rsvdfb),
	IDTVEC(rsvdfc), IDTVEC(rsvdfd), IDTVEC(rsvdfe), IDTVEC(rsvdff);

inthand_t *rsvdary[NIDT] = {
	&IDTVEC(rsvd00), &IDTVEC(rsvd01), &IDTVEC(rsvd02), &IDTVEC(rsvd03),
	&IDTVEC(rsvd04), &IDTVEC(rsvd05), &IDTVEC(rsvd06), &IDTVEC(rsvd07),
	&IDTVEC(rsvd08), &IDTVEC(rsvd09), &IDTVEC(rsvd0a), &IDTVEC(rsvd0b),
	&IDTVEC(rsvd0c), &IDTVEC(rsvd0d), &IDTVEC(rsvd0e), &IDTVEC(rsvd0f),
	&IDTVEC(rsvd10), &IDTVEC(rsvd11), &IDTVEC(rsvd12), &IDTVEC(rsvd13),
	&IDTVEC(rsvd14), &IDTVEC(rsvd15), &IDTVEC(rsvd16), &IDTVEC(rsvd17),
	&IDTVEC(rsvd18), &IDTVEC(rsvd19), &IDTVEC(rsvd1a), &IDTVEC(rsvd1b),
	&IDTVEC(rsvd1c), &IDTVEC(rsvd1d), &IDTVEC(rsvd1e), &IDTVEC(rsvd1f),
	&IDTVEC(rsvd20), &IDTVEC(rsvd21), &IDTVEC(rsvd22), &IDTVEC(rsvd23),
	&IDTVEC(rsvd24), &IDTVEC(rsvd25), &IDTVEC(rsvd26), &IDTVEC(rsvd27),
	&IDTVEC(rsvd28), &IDTVEC(rsvd29), &IDTVEC(rsvd2a), &IDTVEC(rsvd2b),
	&IDTVEC(rsvd2c), &IDTVEC(rsvd2d), &IDTVEC(rsvd2e), &IDTVEC(rsvd2f),
	&IDTVEC(rsvd30), &IDTVEC(rsvd31), &IDTVEC(rsvd32), &IDTVEC(rsvd33),
	&IDTVEC(rsvd34), &IDTVEC(rsvd35), &IDTVEC(rsvd36), &IDTVEC(rsvd37),
	&IDTVEC(rsvd38), &IDTVEC(rsvd39), &IDTVEC(rsvd3a), &IDTVEC(rsvd3b),
	&IDTVEC(rsvd3c), &IDTVEC(rsvd3d), &IDTVEC(rsvd3e), &IDTVEC(rsvd3f),
	&IDTVEC(rsvd40), &IDTVEC(rsvd41), &IDTVEC(rsvd42), &IDTVEC(rsvd43),
	&IDTVEC(rsvd44), &IDTVEC(rsvd45), &IDTVEC(rsvd46), &IDTVEC(rsvd47),
	&IDTVEC(rsvd48), &IDTVEC(rsvd49), &IDTVEC(rsvd4a), &IDTVEC(rsvd4b),
	&IDTVEC(rsvd4c), &IDTVEC(rsvd4d), &IDTVEC(rsvd4e), &IDTVEC(rsvd4f),
	&IDTVEC(rsvd50), &IDTVEC(rsvd51), &IDTVEC(rsvd52), &IDTVEC(rsvd53),
	&IDTVEC(rsvd54), &IDTVEC(rsvd55), &IDTVEC(rsvd56), &IDTVEC(rsvd57),
	&IDTVEC(rsvd58), &IDTVEC(rsvd59), &IDTVEC(rsvd5a), &IDTVEC(rsvd5b),
	&IDTVEC(rsvd5c), &IDTVEC(rsvd5d), &IDTVEC(rsvd5e), &IDTVEC(rsvd5f),
	&IDTVEC(rsvd60), &IDTVEC(rsvd61), &IDTVEC(rsvd62), &IDTVEC(rsvd63),
	&IDTVEC(rsvd64), &IDTVEC(rsvd65), &IDTVEC(rsvd66), &IDTVEC(rsvd67),
	&IDTVEC(rsvd68), &IDTVEC(rsvd69), &IDTVEC(rsvd6a), &IDTVEC(rsvd6b),
	&IDTVEC(rsvd6c), &IDTVEC(rsvd6d), &IDTVEC(rsvd6e), &IDTVEC(rsvd6f),
	&IDTVEC(rsvd70), &IDTVEC(rsvd71), &IDTVEC(rsvd72), &IDTVEC(rsvd73),
	&IDTVEC(rsvd74), &IDTVEC(rsvd75), &IDTVEC(rsvd76), &IDTVEC(rsvd77),
	&IDTVEC(rsvd78), &IDTVEC(rsvd79), &IDTVEC(rsvd7a), &IDTVEC(rsvd7b),
	&IDTVEC(rsvd7c), &IDTVEC(rsvd7d), &IDTVEC(rsvd7e), &IDTVEC(rsvd7f),
	&IDTVEC(rsvd80), &IDTVEC(rsvd81), &IDTVEC(rsvd82), &IDTVEC(rsvd83),
	&IDTVEC(rsvd84), &IDTVEC(rsvd85), &IDTVEC(rsvd86), &IDTVEC(rsvd87),
	&IDTVEC(rsvd88), &IDTVEC(rsvd89), &IDTVEC(rsvd8a), &IDTVEC(rsvd8b),
	&IDTVEC(rsvd8c), &IDTVEC(rsvd8d), &IDTVEC(rsvd8e), &IDTVEC(rsvd8f),
	&IDTVEC(rsvd90), &IDTVEC(rsvd91), &IDTVEC(rsvd92), &IDTVEC(rsvd93),
	&IDTVEC(rsvd94), &IDTVEC(rsvd95), &IDTVEC(rsvd96), &IDTVEC(rsvd97),
	&IDTVEC(rsvd98), &IDTVEC(rsvd99), &IDTVEC(rsvd9a), &IDTVEC(rsvd9b),
	&IDTVEC(rsvd9c), &IDTVEC(rsvd9d), &IDTVEC(rsvd9e), &IDTVEC(rsvd9f),
	&IDTVEC(rsvda0), &IDTVEC(rsvda1), &IDTVEC(rsvda2), &IDTVEC(rsvda3),
	&IDTVEC(rsvda4), &IDTVEC(rsvda5), &IDTVEC(rsvda6), &IDTVEC(rsvda7),
	&IDTVEC(rsvda8), &IDTVEC(rsvda9), &IDTVEC(rsvdaa), &IDTVEC(rsvdab),
	&IDTVEC(rsvdac), &IDTVEC(rsvdad), &IDTVEC(rsvdae), &IDTVEC(rsvdaf),
	&IDTVEC(rsvdb0), &IDTVEC(rsvdb1), &IDTVEC(rsvdb2), &IDTVEC(rsvdb3),
	&IDTVEC(rsvdb4), &IDTVEC(rsvdb5), &IDTVEC(rsvdb6), &IDTVEC(rsvdb7),
	&IDTVEC(rsvdb8), &IDTVEC(rsvdb9), &IDTVEC(rsvdba), &IDTVEC(rsvdbb),
	&IDTVEC(rsvdbc), &IDTVEC(rsvdbd), &IDTVEC(rsvdbe), &IDTVEC(rsvdbf),
	&IDTVEC(rsvdc0), &IDTVEC(rsvdc1), &IDTVEC(rsvdc2), &IDTVEC(rsvdc3),
	&IDTVEC(rsvdc4), &IDTVEC(rsvdc5), &IDTVEC(rsvdc6), &IDTVEC(rsvdc7),
	&IDTVEC(rsvdc8), &IDTVEC(rsvdc9), &IDTVEC(rsvdca), &IDTVEC(rsvdcb),
	&IDTVEC(rsvdcc), &IDTVEC(rsvdcd), &IDTVEC(rsvdce), &IDTVEC(rsvdcf),
	&IDTVEC(rsvdd0), &IDTVEC(rsvdd1), &IDTVEC(rsvdd2), &IDTVEC(rsvdd3),
	&IDTVEC(rsvdd4), &IDTVEC(rsvdd5), &IDTVEC(rsvdd6), &IDTVEC(rsvdd7),
	&IDTVEC(rsvdd8), &IDTVEC(rsvdd9), &IDTVEC(rsvdda), &IDTVEC(rsvddb),
	&IDTVEC(rsvddc), &IDTVEC(rsvddd), &IDTVEC(rsvdde), &IDTVEC(rsvddf),
	&IDTVEC(rsvde0), &IDTVEC(rsvde1), &IDTVEC(rsvde2), &IDTVEC(rsvde3),
	&IDTVEC(rsvde4), &IDTVEC(rsvde5), &IDTVEC(rsvde6), &IDTVEC(rsvde7),
	&IDTVEC(rsvde8), &IDTVEC(rsvde9), &IDTVEC(rsvdea), &IDTVEC(rsvdeb),
	&IDTVEC(rsvdec), &IDTVEC(rsvded), &IDTVEC(rsvdee), &IDTVEC(rsvdef),
	&IDTVEC(rsvdf0), &IDTVEC(rsvdf1), &IDTVEC(rsvdf2), &IDTVEC(rsvdf3),
	&IDTVEC(rsvdf4), &IDTVEC(rsvdf5), &IDTVEC(rsvdf6), &IDTVEC(rsvdf7),
	&IDTVEC(rsvdf8), &IDTVEC(rsvdf9), &IDTVEC(rsvdfa), &IDTVEC(rsvdfb),
	&IDTVEC(rsvdfc), &IDTVEC(rsvdfd), &IDTVEC(rsvdfe), &IDTVEC(rsvdff)
};

void
sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
{
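	/*
	 * Reassemble the split base/limit bit fields of a hardware
	 * descriptor into the flat "software" form used by gdt_segs[].
	 */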
	ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type = sd->sd_type;
	ssd->ssd_dpl = sd->sd_dpl;
	ssd->ssd_p = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran = sd->sd_gran;
}

void
ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_long = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran = ssd->ssd_gran;
}

void
ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_gran = ssd->ssd_gran;
}

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * If we cannot accurately determine the physical memory map, then use
 * the value from the 0xE801 call, and failing that, the RTC.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * Memory is aligned to PHYSMAP_ALIGN, which must be a multiple of
 * PAGE_SIZE.  This also greatly reduces the memory test time, which
 * would otherwise be excessive on machines with > 8GB of RAM.
 *
 * XXX first should be vm_paddr_t.
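 *
 * For example (a layout sketch with made-up values, not real ones), a
 * machine with a hole below 1MB might end up with two base/end pairs:
 *
 *	physmap[0] = 0x0000000000001000  physmap[1] = 0x000000000009f000
 *	physmap[2] = 0x0000000000100000  physmap[3] = 0x0000000100000000
 *
 * The validation loop in getmemsize() then derives phys_avail[] and
 * dump_avail[] from these pairs.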
 */

#define PHYSMAP_ALIGN		(vm_paddr_t)(128 * 1024)
#define PHYSMAP_ALIGN_MASK	(vm_paddr_t)(PHYSMAP_ALIGN - 1)
#define PHYSMAP_SIZE		VM_PHYSSEG_MAX

vm_paddr_t physmap[PHYSMAP_SIZE];
struct bios_smap *smapbase, *smap, *smapend;
struct efi_map_header *efihdrbase;
u_int32_t smapsize;

#define PHYSMAP_HANDWAVE	(vm_paddr_t)(2 * 1024 * 1024)
#define PHYSMAP_HANDWAVE_MASK	(PHYSMAP_HANDWAVE - 1)

static void
add_smap_entries(int *physmap_idx)
{
	int i;

	/* an int32_t containing the size immediately precedes smapbase */
	smapsize = *((u_int32_t *)smapbase - 1);
	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			kprintf("SMAP type=%02x base=%016lx len=%016lx\n",
				smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (smap->length == 0)
			continue;

		for (i = 0; i <= *physmap_idx; i += 2) {
			if (smap->base < physmap[i + 1]) {
				if (boothowto & RB_VERBOSE) {
					kprintf("Overlapping or non-monotonic "
						"memory region, ignoring "
						"second region\n");
				}
				break;
			}
		}
		if (i <= *physmap_idx)
			continue;

		Realmem += smap->length;

		if (smap->base == physmap[*physmap_idx + 1]) {
			physmap[*physmap_idx + 1] += smap->length;
			continue;
		}

		*physmap_idx += 2;
		if (*physmap_idx == PHYSMAP_SIZE) {
			kprintf("Too many segments in the physical "
				"address map, giving up\n");
			break;
		}
		physmap[*physmap_idx] = smap->base;
		physmap[*physmap_idx + 1] = smap->base + smap->length;
	}
}

static void
add_efi_map_entries(int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int i, ndesc;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
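	 *
	 * Each descriptor in the map is descriptor_size bytes, which may
	 * be larger than sizeof(struct efi_md); the walk below therefore
	 * advances by descriptor_size (via efi_next_descriptor()) rather
	 * than by the structure size.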
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdrbase + efisz);

	if (efihdrbase->descriptor_size == 0)
		return;
	ndesc = efihdrbase->memory_size / efihdrbase->descriptor_size;

	if (boothowto & RB_VERBOSE)
		kprintf("%23s %12s %12s %8s %4s\n",
			"Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	     p = efi_next_descriptor(p, efihdrbase->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type <= EFI_MD_TYPE_PALCODE)
				type = types[p->md_type];
			else
				type = "<INVALID>";
			kprintf("%23s %012lx %12p %08lx ", type, p->md_phys,
				p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				kprintf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				kprintf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				kprintf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				kprintf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				kprintf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				kprintf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				kprintf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				kprintf("XP ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				kprintf("RUNTIME");
			kprintf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		Realmem += p->md_pages * PAGE_SIZE;

		if (p->md_phys == physmap[*physmap_idx + 1]) {
			physmap[*physmap_idx + 1] += p->md_pages * PAGE_SIZE;
			continue;
		}

		*physmap_idx += 2;
		if (*physmap_idx == PHYSMAP_SIZE) {
			kprintf("Too many segments in the physical "
				"address map, giving up\n");
			break;
		}
		physmap[*physmap_idx] = p->md_phys;
		physmap[*physmap_idx + 1] =
			p->md_phys + p->md_pages * PAGE_SIZE;
	}
}

struct fb_info efi_fb_info;
static int have_efi_framebuffer = 0;

static void
efi_fb_init_vaddr(int direct_map)
{
	uint64_t sz;
	vm_offset_t addr, v;

	v = efi_fb_info.vaddr;
	sz = efi_fb_info.stride * efi_fb_info.height;

	if (direct_map) {
		addr = PHYS_TO_DMAP(efi_fb_info.paddr);
		if (addr >= DMAP_MIN_ADDRESS && addr + sz <= DMapMaxAddress)
			efi_fb_info.vaddr = addr;
	} else {
		efi_fb_info.vaddr =
			(vm_offset_t)pmap_mapdev_attr(efi_fb_info.paddr,
						      sz,
						      PAT_WRITE_COMBINING);
	}
}

static u_int
efifb_color_depth(struct efi_fb *efifb)
{
	uint32_t mask;
	u_int depth;

	mask = efifb->fb_mask_red | efifb->fb_mask_green |
	       efifb->fb_mask_blue | efifb->fb_mask_reserved;
	if (mask == 0)
		return (0);
	for (depth = 1; mask != 1; depth++)
		mask >>= 1;
	return (depth);
}

int
probe_efi_fb(int early)
{
	struct efi_fb *efifb;
	caddr_t kmdp;
	u_int depth;

	if (have_efi_framebuffer) {
		if (!early &&
		    (efi_fb_info.vaddr == 0 ||
		     efi_fb_info.vaddr == PHYS_TO_DMAP(efi_fb_info.paddr)))
			efi_fb_init_vaddr(0);
		return 0;
	}

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efifb = (struct efi_fb *)preload_search_info(kmdp,
						     MODINFO_METADATA |
						     MODINFOMD_EFI_FB);
	if (efifb == NULL)
		return 1;

	depth = efifb_color_depth(efifb);
	/*
	 * Our boot loader should already have noticed when we won't be
	 * able to use the UEFI framebuffer.
	 */
	if (depth != 24 && depth != 32)
		return 1;

	have_efi_framebuffer = 1;

	efi_fb_info.is_vga_boot_display = 1;
	efi_fb_info.width = efifb->fb_width;
	efi_fb_info.height = efifb->fb_height;
	efi_fb_info.depth = depth;
	efi_fb_info.stride = efifb->fb_stride * (depth / 8);
	efi_fb_info.paddr = efifb->fb_addr;
	if (early) {
		efi_fb_info.vaddr = 0;
	} else {
		efi_fb_init_vaddr(0);
	}
	efi_fb_info.fbops.fb_set_par = NULL;
	efi_fb_info.fbops.fb_blank = NULL;
	efi_fb_info.fbops.fb_debug_enter = NULL;
	efi_fb_info.device = NULL;

	return 0;
}

static void
efifb_startup(void *arg)
{
	probe_efi_fb(0);
}

SYSINIT(efi_fb_info, SI_BOOT1_POST, SI_ORDER_FIRST, efifb_startup, NULL);

static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int off, physmap_idx, pa_indx, da_indx;
	int i, j;
	vm_paddr_t pa;
	vm_paddr_t msgbuf_size;
	u_long physmem_tunable;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	/*
	 * get memory map from INT 15:E820, kindly supplied by the loader.
	 *
	 * subr_module.c says:
	 *	"Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */
	efihdrbase = (struct efi_map_header *)preload_search_info(kmdp,
		     MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
		   MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL && efihdrbase == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdrbase == NULL)
		add_smap_entries(&physmap_idx);
	else
		add_efi_map_entries(&physmap_idx);

	base_memory = physmap[1] / 1024;
	/* make hole for AP bootstrap code */
	physmap[1] = mp_bootaddress(base_memory);

	/* Save EBDA address, if any */
	ebda_addr = (u_long)(*(u_short *)(KERNBASE + 0x40e));
	ebda_addr <<= 4;

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	/*
	 * Blowing out the DMAP will blow up the system.
	 */
	if (Maxmem > atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS)) {
		kprintf("Limiting Maxmem due to DMAP size\n");
		Maxmem = atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS);
	}

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE)) {
		kprintf("Physical memory use set to %ldK\n", Maxmem * 4);
	}

	/*
	 * Call pmap initialization to make new kernel address space
	 *
	 * Mask off page 0.
	 */
	pmap_bootstrap(&first);
	physmap[0] = PAGE_SIZE;

	/*
	 * Align the physmap to PHYSMAP_ALIGN and cut out anything
	 * exceeding Maxmem.
	 */
	for (i = j = 0; i <= physmap_idx; i += 2) {
		if (physmap[i+1] > ptoa(Maxmem))
			physmap[i+1] = ptoa(Maxmem);
		physmap[i] = (physmap[i] + PHYSMAP_ALIGN_MASK) &
			     ~PHYSMAP_ALIGN_MASK;
		physmap[i+1] = physmap[i+1] & ~PHYSMAP_ALIGN_MASK;

		physmap[j] = physmap[i];
		physmap[j+1] = physmap[i+1];

		if (physmap[i] < physmap[i+1])
			j += 2;
	}
	physmap_idx = j - 2;

	/*
	 * Align anything else used in the validation loop.
	 *
	 * Also make sure that our 2MB kernel text+data+bss mappings
	 * do not overlap potentially allocatable space.
	 */
	first = (first + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;

	/*
	 * Size up each available chunk of physical memory.
	 */
	pa_indx = 0;
	da_indx = 0;
	phys_avail[pa_indx].phys_beg = physmap[0];
	phys_avail[pa_indx].phys_end = physmap[0];
	dump_avail[da_indx].phys_beg = 0;
	dump_avail[da_indx].phys_end = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    kgetenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * Validate the physical memory.  The physical memory segments
	 * have already been aligned to PHYSMAP_ALIGN, which is a multiple
	 * of PAGE_SIZE.
	 *
	 * We no longer perform an exhaustive memory test.  Instead we
	 * simply test the first and last word in each physmap[]
	 * segment.
	 */
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;
		vm_paddr_t incr;

		end = physmap[i + 1];

		for (pa = physmap[i]; pa < end; pa += incr) {
			int page_bad, full;
			volatile uint64_t *ptr = (uint64_t *)CADDR1;
			uint64_t tmp;

			full = FALSE;

			/*
			 * Calculate incr.  Just test the first and
			 * last page in each physmap[] segment.
			 */
			if (pa == end - PAGE_SIZE)
				incr = PAGE_SIZE;
			else
				incr = end - pa - PAGE_SIZE;

			/*
			 * Make sure we don't skip blacked out areas.
			 */
			if (pa < 0x200000 && 0x200000 < end) {
				incr = 0x200000 - pa;
			}
			if (dcons_addr > 0 &&
			    pa < dcons_addr &&
			    dcons_addr < end) {
				incr = dcons_addr - pa;
			}

			/*
			 * Block out kernel memory as not available.
			 */
			if (pa >= 0x200000 && pa < first) {
				incr = first - pa;
				if (pa + incr > end)
					incr = end - pa;
				goto do_dump_avail;
			}

			/*
			 * Block out the dcons buffer if it exists.
			 */
			if (dcons_addr > 0 &&
			    pa >= trunc_page(dcons_addr) &&
			    pa < dcons_addr + dcons_size) {
				incr = dcons_addr + dcons_size - pa;
				incr = (incr + PAGE_MASK) &
				       ~(vm_paddr_t)PAGE_MASK;
				if (pa + incr > end)
					incr = end - pa;
				goto do_dump_avail;
			}

			page_bad = FALSE;

			/*
			 * Map the page non-cacheable for the memory
			 * test.
			 */
			*pte = pa |
			       kernel_pmap.pmap_bits[PG_V_IDX] |
			       kernel_pmap.pmap_bits[PG_RW_IDX] |
			       kernel_pmap.pmap_bits[PG_N_IDX];
			cpu_invlpg(__DEVOLATILE(void *, ptr));
			cpu_mfence();

			/*
			 * Save original value for restoration later.
			 */
			tmp = *ptr;

			/*
			 * Test for alternating 1's and 0's
			 */
			*ptr = 0xaaaaaaaaaaaaaaaaLLU;
			cpu_mfence();
			if (*ptr != 0xaaaaaaaaaaaaaaaaLLU)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*ptr = 0x5555555555555555LLU;
			cpu_mfence();
			if (*ptr != 0x5555555555555555LLU)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*ptr = 0xffffffffffffffffLLU;
			cpu_mfence();
			if (*ptr != 0xffffffffffffffffLLU)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*ptr = 0x0;
			cpu_mfence();
			if (*ptr != 0x0)
				page_bad = TRUE;

			/*
			 * Restore original value.
			 */
			*ptr = tmp;

			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE) {
				incr = PAGE_SIZE;
				continue;
			}

			/*
			 * Collapse page address into phys_avail[].  Do a
			 * continuation of the current phys_avail[] index
			 * when possible.
			 */
			if (phys_avail[pa_indx].phys_end == pa) {
				/*
				 * Continuation
				 */
				phys_avail[pa_indx].phys_end += incr;
			} else if (phys_avail[pa_indx].phys_beg ==
				   phys_avail[pa_indx].phys_end) {
				/*
				 * Current phys_avail is completely empty,
				 * reuse the index.
				 */
				phys_avail[pa_indx].phys_beg = pa;
				phys_avail[pa_indx].phys_end = pa + incr;
			} else {
				/*
				 * Allocate next phys_avail index.
				 */
				++pa_indx;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					kprintf("Too many holes in the "
						"physical address space, "
						"giving up\n");
					--pa_indx;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx].phys_beg = pa;
				phys_avail[pa_indx].phys_end = pa + incr;
			}
			physmem += incr / PAGE_SIZE;

			/*
			 * pa available for dumping
			 */
do_dump_avail:
			if (dump_avail[da_indx].phys_end == pa) {
				dump_avail[da_indx].phys_end += incr;
			} else {
				++da_indx;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					--da_indx;
					goto do_next;
				}
				dump_avail[da_indx].phys_beg = pa;
				dump_avail[da_indx].phys_end = pa + incr;
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	cpu_invltlb();
	cpu_mfence();

	/*
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	msgbuf_size = (MSGBUF_SIZE + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;

	while (phys_avail[pa_indx].phys_beg + PHYSMAP_ALIGN + msgbuf_size >=
	       phys_avail[pa_indx].phys_end) {
		physmem -= atop(phys_avail[pa_indx].phys_end -
				phys_avail[pa_indx].phys_beg);
		phys_avail[pa_indx].phys_beg = 0;
		phys_avail[pa_indx].phys_end = 0;
		--pa_indx;
	}

	Maxmem = atop(phys_avail[pa_indx].phys_end);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx].phys_end -= msgbuf_size;

	avail_end = phys_avail[pa_indx].phys_end;

	/* Map the message buffer. */
	for (off = 0; off < msgbuf_size; off += PAGE_SIZE) {
		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
	}

	/*
	 * Try to get the EFI framebuffer working as early as possible.
	 *
	 * WARN: Some BIOSes do not list the EFI framebuffer memory, causing
	 * the pmap probe code to create a DMAP that does not cover its
	 * physical address space.  In that case efi_fb_init_vaddr(1) might
	 * not be able to set up the framebuffer base pointer; the later
	 * efi_fb_init_vaddr(0) call will deal with it.
	 */
	if (have_efi_framebuffer)
		efi_fb_init_vaddr(1);
}

struct machintr_abi MachIntrABI;

/*
 * IDT VECTORS:
 *	0	Divide by zero
 *	1	Debug
 *	2	NMI
 *	3	BreakPoint
 *	4	OverFlow
 *	5	Bound-Range
 *	6	Invalid OpCode
 *	7	Device Not Available (x87)
 *	8	Double-Fault
 *	9	Coprocessor Segment overrun (unsupported, reserved)
 *	10	Invalid-TSS
 *	11	Segment not present
 *	12	Stack
 *	13	General Protection
 *	14	Page Fault
 *	15	Reserved
 *	16	x87 FP Exception pending
 *	17	Alignment Check
 *	18	Machine Check
 *	19	SIMD floating point
 *	20-31	reserved
 *	32-255	INTn/external sources
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x, cpu;
#if 0 /* JG */
	int metadata_missing, off;
#endif
	struct mdglobaldata *gd;
	struct privatespace *ps;
	u_int64_t msr;

	/*
	 * Prevent lowering of the ipl if we call tsleep() early.
	 */
	gd = &CPU_prvspace[0]->mdglobaldata;
	ps = (struct privatespace *)gd;
	bzero(gd, sizeof(*gd));
	bzero(&ps->common_tss, sizeof(ps->common_tss));

	/*
	 * Note: on both UP and SMP curthread must be set non-NULL
	 * early in the boot sequence because the system assumes
	 * that 'curthread' is never NULL.
	 */
	gd->mi.gd_curthread = &thread0;
	thread0.td_gd = &gd->mi;

	atdevbase = ISA_HOLE_START + PTOV_OFFSET;

#if 0 /* JG */
	metadata_missing = 0;
	if (bootinfo.bi_modulep) {
		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
		preload_bootstrap_relocate(KERNBASE);
	} else {
		metadata_missing = 1;
	}
	if (bootinfo.bi_envp)
		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET);
	preload_bootstrap_relocate(PTOV_OFFSET);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET;
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	if (boothowto & RB_VERBOSE)
		bootverbose++;

	/*
	 * Default MachIntrABI to ICU
	 */
	MachIntrABI = MachIntrABI_ICU;

	/*
	 * start with one cpu.  Note: with one cpu, ncpus_fit_mask remains 0.
	 */
	ncpus = 1;
	ncpus_fit = 1;
	/* Init basic tunables, hz etc */
	init_param1();

	/*
	 * make gdt memory segments
	 */
	gdt_segs[GPROC0_SEL].ssd_base =
		(uintptr_t) &CPU_prvspace[0]->common_tss;

	gd->mi.gd_prvspace = CPU_prvspace[0];

	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
		   (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long) gdt;
	lgdt(&r_gdt);

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	mi_gdinit(&gd->mi, 0);
	cpu_gdinit(gd, 0);
	proc0paddr = proc0paddr_buff;
	mi_proc0init(&gd->mi, proc0paddr);
	safepri = TDPRI_MAX;

	/* spinlocks and the BGL */
	init_locks();

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt_global(x, rsvdary[x], SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 2);
	setidt_global(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1);
	setidt_global(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
	setidt_global(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt_global(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);

	for (cpu = 0; cpu < MAXCPU; ++cpu) {
		r_idt_arr[cpu].rd_limit = sizeof(idt_arr[cpu]) - 1;
		r_idt_arr[cpu].rd_base = (long) &idt_arr[cpu][0];
	}

	lidt(&r_idt_arr[0]);

	/*
	 * Initialize the console before we print anything out.
	 */
	cninit();

#if 0 /* JG */
	if (metadata_missing)
		kprintf("WARNING: loader(8) metadata is missing!\n");
#endif

#if NISA > 0
	elcr_probe();
	isa_defaultirq();
#endif
	rand_initialize();

	/*
	 * Initialize IRQ mapping
	 *
	 * NOTE:
	 * SHOULD be after elcr_probe()
	 */
	MachIntrABI_ICU.initmap();
	MachIntrABI_IOAPIC.initmap();

#ifdef DDB
	kdb_init();
	if (boothowto & RB_KDB)
		Debugger("Boot flags requested debugger");
#endif

	identify_cpu();		/* Final stage of CPU initialization */
	initializecpu(0);	/* Initialize CPU registers */

	/*
	 * On modern Intel cpus, haswell or later, cpu_idle_hlt=1 is better
	 * because the cpu does significant power management in MWAIT
	 * (also suggested is to set sysctl machdep.mwait.CX.idle=AUTODEEP).
	 *
	 * On many AMD cpus cpu_idle_hlt=3 is better, because the cpu does
	 * significant power management only when using ACPI halt mode.
	 * (However, on Ryzen, mode 4 (HLT) also does power management).
	 *
	 * On older AMD or Intel cpus, cpu_idle_hlt=2 is better because ACPI
	 * is needed to reduce power consumption, but wakeup times are often
	 * too long.
	 */
	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    CPUID_TO_MODEL(cpu_id) >= 0x3C) {	/* Haswell or later */
		cpu_idle_hlt = 1;
	}
	if (cpu_vendor_id == CPU_VENDOR_AMD) {
		if (CPUID_TO_FAMILY(cpu_id) >= 0x17) {
			/* Ryzen or later */
			cpu_idle_hlt = 3;
		} else if (CPUID_TO_FAMILY(cpu_id) >= 0x14) {
			/* Bobcat or later */
			cpu_idle_hlt = 3;
		}
	}

	TUNABLE_INT_FETCH("hw.apic_io_enable", &ioapic_enable); /* for compat */
	TUNABLE_INT_FETCH("hw.ioapic_enable", &ioapic_enable);
	TUNABLE_INT_FETCH("hw.lapic_enable", &lapic_enable);
	TUNABLE_INT_FETCH("machdep.cpu_idle_hlt", &cpu_idle_hlt);

	/*
	 * Some virtual machines do not work with the I/O APIC enabled.
	 * If the user does not explicitly enable or disable the I/O
	 * APIC (ioapic_enable < 0), then we disable the I/O APIC on
	 * all virtual machines.
	 *
	 * NOTE:
	 * This must be done after identify_cpu(), which sets
	 * 'cpu_feature2'.
	 */
	if (ioapic_enable < 0) {
		if (cpu_feature2 & CPUID2_VMM)
			ioapic_enable = 0;
		else
			ioapic_enable = 1;
	}

	/*
	 * TSS entry point for interrupts, traps, and exceptions
	 * (sans NMI).  This will always go to near the top of the pcpu
	 * trampoline area.  Hardware-pushed data will be copied into
	 * the trap-frame on entry, and (if necessary) returned to the
	 * trampoline on exit.
	 *
	 * We store some pcb data for the trampoline code above the
	 * stack the cpu hw pushes into, and arrange things so the
	 * address of tr_pcb_rsp is the same as the desired top of
	 * stack.
	 */
	ps->common_tss.tss_rsp0 = (register_t)&ps->trampoline.tr_pcb_rsp;
	ps->trampoline.tr_pcb_rsp = ps->common_tss.tss_rsp0;
	ps->trampoline.tr_pcb_gs_kernel = (register_t)gd;
	ps->trampoline.tr_pcb_cr3 = KPML4phys;	/* adj to user cr3 live */
	ps->dbltramp.tr_pcb_gs_kernel = (register_t)gd;
	ps->dbltramp.tr_pcb_cr3 = KPML4phys;
	ps->dbgtramp.tr_pcb_gs_kernel = (register_t)gd;
	ps->dbgtramp.tr_pcb_cr3 = KPML4phys;

	/* double fault stack */
	ps->common_tss.tss_ist1 = (register_t)&ps->dbltramp.tr_pcb_rsp;
	/* #DB debugger needs its own stack */
	ps->common_tss.tss_ist2 = (register_t)&ps->dbgtramp.tr_pcb_rsp;

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	ps->common_tss.tss_iobase = sizeof(struct x86_64tss);

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	gd->gd_tss_gdt = &gdt[GPROC0_SEL];
	gd->gd_common_tssd = *gd->gd_tss_gdt;
	ltr(gsel_tss);

	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL|PSL_AC);

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

	/* Map the message buffer. */
#if 0 /* JG */
	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
#endif

	msgbufinit(msgbufp, MSGBUF_SIZE);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_udatasel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_pcb->pcb_cr3 = KPML4phys;
	thread0.td_pcb->pcb_cr3_iso = 0;
	thread0.td_pcb->pcb_ext = NULL;
	lwp0.lwp_md.md_regs = &proc0_tf;	/* XXX needed? */

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}

/*
 * Initialize machine-dependent portions of the global data structure.
 * Note that the global data area and cpu0's idlestack in the private
 * data space were allocated in locore.
 *
 * Note: the idlethread's cpl is 0
 *
 * WARNING!  Called from early boot, 'mycpu' may not work yet.
 */
void
cpu_gdinit(struct mdglobaldata *gd, int cpu)
{
	if (cpu)
		gd->mi.gd_curthread = &gd->mi.gd_idlethread;

	lwkt_init_thread(&gd->mi.gd_idlethread,
			 gd->mi.gd_prvspace->idlestack,
			 sizeof(gd->mi.gd_prvspace->idlestack),
			 0, &gd->mi);
	lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
	gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
	gd->mi.gd_idlethread.td_sp -= sizeof(void *);
	*(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
}

/*
 * We only have to check for DMAP bounds, the globaldata space is
 * actually part of the kernel_map so we don't have to waste time
 * checking CPU_prvspace[*].
 */
int
is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
{
#if 0
	if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
	    eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
		return (TRUE);
	}
#endif
	if (saddr >= DMAP_MIN_ADDRESS && eaddr <= DMAP_MAX_ADDRESS)
		return (TRUE);
	return (FALSE);
}

struct globaldata *
globaldata_find(int cpu)
{
	KKASSERT(cpu >= 0 && cpu < ncpus);
	return (&CPU_prvspace[cpu]->mdglobaldata.mi);
}

/*
 * This path should be safe from the SYSRET issue because only stopped
 * threads can have their %rip adjusted this way (and all heavy weight
 * thread switches clear QUICKRET and thus do not use SYSRET).  However,
 * the code path is convoluted so add a safety by forcing %rip to be
 * canonical.
 */
int
ptrace_set_pc(struct lwp *lp, unsigned long addr)
{
	if (addr & 0x0000800000000000LLU)
		lp->lwp_md.md_regs->tf_rip = addr | 0xFFFF000000000000LLU;
	else
		lp->lwp_md.md_regs->tf_rip = addr & 0x0000FFFFFFFFFFFFLLU;
	return (0);
}

int
ptrace_single_step(struct lwp *lp)
{
	lp->lwp_md.md_regs->tf_rflags |= PSL_T;
	return (0);
}

int
fill_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	if ((tp = lp->lwp_md.md_regs) == NULL)
		return EINVAL;
	bcopy(&tp->tf_rdi, &regs->r_rdi, sizeof(*regs));
	return (0);
}

int
set_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	tp = lp->lwp_md.md_regs;
	if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	bcopy(&regs->r_rdi, &tp->tf_rdi, sizeof(*regs));
	clear_quickret();
	return (0);
}

static void
fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_87->en_cw = penv_xmm->en_cw;
	penv_87->en_sw = penv_xmm->en_sw;
	penv_87->en_tw = penv_xmm->en_tw;
	penv_87->en_fip = penv_xmm->en_fip;
	penv_87->en_fcs = penv_xmm->en_fcs;
	penv_87->en_opcode = penv_xmm->en_opcode;
	penv_87->en_foo = penv_xmm->en_foo;
	penv_87->en_fos = penv_xmm->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
}

static void
set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_xmm->en_cw = penv_87->en_cw;
	penv_xmm->en_sw = penv_87->en_sw;
	penv_xmm->en_tw = penv_87->en_tw;
	penv_xmm->en_fip = penv_87->en_fip;
	penv_xmm->en_fcs = penv_87->en_fcs;
	penv_xmm->en_opcode = penv_87->en_opcode;
	penv_xmm->en_foo = penv_87->en_foo;
	penv_xmm->en_fos = penv_87->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
}

int
fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
	if (lp->lwp_thread == NULL || lp->lwp_thread->td_pcb == NULL)
		return EINVAL;
	if (cpu_fxsr) {
		fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
				(struct save87 *)fpregs);
		return (0);
	}
	bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs,
	      sizeof(*fpregs));
	return (0);
}

int
set_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
	if (cpu_fxsr) {
		set_fpregs_xmm((struct save87 *)fpregs,
			       &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
		return (0);
	}
	bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87,
	      sizeof(*fpregs));
	return (0);
}

int
fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (lp == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[4] = rdr4();
		dbregs->dr[5] = rdr5();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
		return (0);
	}
	if (lp->lwp_thread == NULL || (pcb = lp->lwp_thread->td_pcb) == NULL)
		return EINVAL;
	dbregs->dr[0] = pcb->pcb_dr0;
	dbregs->dr[1] = pcb->pcb_dr1;
	dbregs->dr[2] = pcb->pcb_dr2;
	dbregs->dr[3] = pcb->pcb_dr3;
	dbregs->dr[4] = 0;
	dbregs->dr[5] = 0;
	dbregs->dr[6] = pcb->pcb_dr6;
	dbregs->dr[7] = pcb->pcb_dr7;
	return (0);
}

int
set_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	if (lp == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr4(dbregs->dr[4]);
		load_dr5(dbregs->dr[5]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		struct pcb *pcb;
		struct ucred *ucred;
		int i;
		uint64_t mask1, mask2;

		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an
		 * unexpected TRCTRAP.
		 *
		 * The loop below checks the four 2-bit R/Wi fields
		 * (i = 0..3) for the invalid pattern 10 (binary).
		 *
		 * XXX Is 10 in LENi allowed when running in compatibility
		 * mode?  Pattern 10 in R/Wi might be used to indicate a
		 * breakpoint on I/O; further analysis should be carried out
		 * to decide whether it is safe and useful to provide access
		 * to that capability.
		 */
		for (i = 0, mask1 = 0x3 << 16, mask2 = 0x2 << 16; i < 4;
		     i++, mask1 <<= 4, mask2 <<= 4)
			if ((dbregs->dr[7] & mask1) == mask2)
				return (EINVAL);

		pcb = lp->lwp_thread->td_pcb;
		ucred = lp->lwp_proc->p_ucred;

		/*
		 * Don't let a process set a breakpoint that is not within
		 * the process's address space.  If a process could do this,
		 * it could halt the system by setting a breakpoint in the
		 * kernel (if ddb was enabled).  Thus, we need to check to
		 * make sure that no breakpoints are being enabled for
		 * addresses outside the process's address space, unless,
		 * perhaps, we were called by uid 0.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be
		 * generated from within kernel mode?
		 */

		if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) {
			if (dbregs->dr[7] & 0x3) {
				/* dr0 is enabled */
				if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<2)) {
				/* dr1 is enabled */
				if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<4)) {
				/* dr2 is enabled */
				if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<6)) {
				/* dr3 is enabled */
				if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		pcb->pcb_flags |= PCB_DBREGS;
	}

	return (0);
}

/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(void)
{
	u_int64_t dr7, dr6;	/* debug registers dr6 and dr7 */
	u_int64_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0xff) == 0) {
		/*
		 * all GE and LE bits in the dr7 register are zero,
		 * thus the trap couldn't have been caused by the
		 * hardware debug registers
		 */
		return 0;
	}

	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0xf;

	if (bp == 0) {
		/*
		 * None of the breakpoint bits are set, meaning this
		 * trap was not caused by any of the debug registers
		 */
		return 0;
	}

	/*
	 * at least one of the breakpoints was hit, check to see
	 * which ones and if any of them are user space addresses
	 */

	if (bp & 0x01) {
		addr[nbp++] = (caddr_t)rdr0();
	}
	if (bp & 0x02) {
		addr[nbp++] = (caddr_t)rdr1();
	}
	if (bp & 0x04) {
		addr[nbp++] = (caddr_t)rdr2();
	}
	if (bp & 0x08) {
		addr[nbp++] = (caddr_t)rdr3();
	}

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) {
			/*
			 * addr[i] is in user space
			 */
			return nbp;
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return 0;
}

#ifndef DDB
void
Debugger(const char *msg)
{
	kprintf("Debugger(\"%s\") called.\n", msg);
}
#endif /* no DDB */

#ifdef DDB

/*
 * Provide inb() and outb() as functions.  They are normally only
 * available as macros calling inlined functions, thus cannot be
 * called inside DDB.
 *
 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
 */

#undef inb
#undef outb

/* silence compiler warnings */
u_char inb(u_int);
void outb(u_int, u_char);

u_char
inb(u_int port)
{
	u_char data;
	/*
	 * We use %%dx and not %1 here because i/o is done at %dx and not
	 * at %edx, while gcc generates inferior code (movw instead of movl)
	 * if we tell it to load (u_short) port.
	 */
	__asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
	return (data);
}

void
outb(u_int port, u_char data)
{
	u_char al;
	/*
	 * Use an unnecessary assignment to help gcc's register allocator.
	 * This makes a large difference for gcc-1.40 and a tiny difference
	 * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
	 * best results.  gcc-2.6.0 can't handle this.
	 */
	al = data;
	__asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
}

#endif /* DDB */

/*
 * initialize all the SMP locks
 */

/* critical region when masking or unmasking interrupts */
struct spinlock_deprecated imen_spinlock;

/* locks com (tty) data/hardware accesses: a FASTINTR() */
struct spinlock_deprecated com_spinlock;

/* lock regions around the clock hardware */
struct spinlock_deprecated clock_spinlock;

static void
init_locks(void)
{
	/*
	 * Get the initial mplock with a count of 1 for the BSP.
	 * This uses a LOGICAL cpu ID, ie BSP == 0.
	 */
	cpu_get_initial_mplock();
	/* DEPRECATED */
	spin_init_deprecated(&imen_spinlock);
	spin_init_deprecated(&com_spinlock);
	spin_init_deprecated(&clock_spinlock);

	/* our token pool needs to work early */
	lwkt_token_pool_init();
}

boolean_t
cpu_mwait_hint_valid(uint32_t hint)
{
	int cx_idx, sub;

	cx_idx = MWAIT_EAX_TO_CX(hint);
	if (cx_idx >= CPU_MWAIT_CX_MAX)
		return FALSE;

	sub = MWAIT_EAX_TO_CX_SUB(hint);
	if (sub >= cpu_mwait_cx_info[cx_idx].subcnt)
		return FALSE;

	return TRUE;
}

void
cpu_mwait_cx_no_bmsts(void)
{
	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_STS);
}

void
cpu_mwait_cx_no_bmarb(void)
{
	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_ARB);
}

static int
cpu_mwait_cx_hint2name(int hint, char *name, int namelen, boolean_t allow_auto)
{
	int old_cx_idx, sub = 0;

	if (hint >= 0) {
		old_cx_idx = MWAIT_EAX_TO_CX(hint);
		sub = MWAIT_EAX_TO_CX_SUB(hint);
	} else if (hint == CPU_MWAIT_HINT_AUTO) {
		old_cx_idx = allow_auto ? CPU_MWAIT_C2 : CPU_MWAIT_CX_MAX;
	} else if (hint == CPU_MWAIT_HINT_AUTODEEP) {
		old_cx_idx = allow_auto ?
		    CPU_MWAIT_C3 : CPU_MWAIT_CX_MAX;
	} else {
		old_cx_idx = CPU_MWAIT_CX_MAX;
	}

	if (!CPU_MWAIT_HAS_CX)
		strlcpy(name, "NONE", namelen);
	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTO)
		strlcpy(name, "AUTO", namelen);
	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTODEEP)
		strlcpy(name, "AUTODEEP", namelen);
	else if (old_cx_idx >= CPU_MWAIT_CX_MAX ||
	    sub >= cpu_mwait_cx_info[old_cx_idx].subcnt)
		strlcpy(name, "INVALID", namelen);
	else
		ksnprintf(name, namelen, "C%d/%d", old_cx_idx, sub);

	return old_cx_idx;
}

static int
cpu_mwait_cx_name2hint(char *name, int *hint0, boolean_t allow_auto)
{
	int cx_idx, sub, hint;
	char *ptr, *start;

	if (allow_auto && strcmp(name, "AUTO") == 0) {
		hint = CPU_MWAIT_HINT_AUTO;
		cx_idx = CPU_MWAIT_C2;
		goto done;
	}
	if (allow_auto && strcmp(name, "AUTODEEP") == 0) {
		hint = CPU_MWAIT_HINT_AUTODEEP;
		cx_idx = CPU_MWAIT_C3;
		goto done;
	}

	if (strlen(name) < 4 || toupper(name[0]) != 'C')
		return -1;
	start = &name[1];
	ptr = NULL;

	cx_idx = strtol(start, &ptr, 10);
	if (ptr == start || *ptr != '/')
		return -1;
	if (cx_idx < 0 || cx_idx >= CPU_MWAIT_CX_MAX)
		return -1;

	start = ptr + 1;
	ptr = NULL;

	sub = strtol(start, &ptr, 10);
	if (*ptr != '\0')
		return -1;
	if (sub < 0 || sub >= cpu_mwait_cx_info[cx_idx].subcnt)
		return -1;

	hint = MWAIT_EAX_HINT(cx_idx, sub);
done:
	*hint0 = hint;
	return cx_idx;
}

static int
cpu_mwait_cx_transit(int old_cx_idx, int cx_idx)
{
	if (cx_idx >= CPU_MWAIT_C3 && cpu_mwait_c3_preamble)
		return EOPNOTSUPP;
	if (old_cx_idx < CPU_MWAIT_C3 && cx_idx >= CPU_MWAIT_C3) {
		int error;

		error = cputimer_intr_powersave_addreq();
		if (error)
			return error;
	} else if (old_cx_idx >= CPU_MWAIT_C3 && cx_idx < CPU_MWAIT_C3) {
		cputimer_intr_powersave_remreq();
	}
	return 0;
}

static int
cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS, int *hint0,
    boolean_t allow_auto)
{
	int error, cx_idx, old_cx_idx, hint;
	char name[CPU_MWAIT_CX_NAMELEN];

	hint = *hint0;
	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name),
					    allow_auto);

	error = sysctl_handle_string(oidp, name, sizeof(name), req);
	if (error != 0 || req->newptr == NULL)
		return error;

	if (!CPU_MWAIT_HAS_CX)
		return EOPNOTSUPP;

	cx_idx = cpu_mwait_cx_name2hint(name, &hint, allow_auto);
	if (cx_idx < 0)
		return EINVAL;

	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
	if (error)
		return error;

	*hint0 = hint;
	return 0;
}

static int
cpu_mwait_cx_setname(struct cpu_idle_stat *stat, const char *cx_name)
{
	int error, cx_idx, old_cx_idx, hint;
	char name[CPU_MWAIT_CX_NAMELEN];

	KASSERT(CPU_MWAIT_HAS_CX, ("cpu does not support mwait CX extension"));

	hint = stat->hint;
	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

	strlcpy(name, cx_name, sizeof(name));
	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
	if (cx_idx < 0)
		return EINVAL;

	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
	if (error)
		return error;

	stat->hint = hint;
	return 0;
}

static int
cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	int hint = cpu_mwait_halt_global;
	int error, cx_idx, cpu;
	char name[CPU_MWAIT_CX_NAMELEN], cx_name[CPU_MWAIT_CX_NAMELEN];

	cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

	error = sysctl_handle_string(oidp, name, sizeof(name), req);
	if (error != 0 || req->newptr == NULL)
		return error;

	if (!CPU_MWAIT_HAS_CX)
		return EOPNOTSUPP;

	/* Save name for later per-cpu CX configuration */
	strlcpy(cx_name, name, sizeof(cx_name));

	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
	if (cx_idx < 0)
		return EINVAL;

	/* Change per-cpu CX configuration */
	for (cpu = 0; cpu < ncpus; ++cpu) {
		error = cpu_mwait_cx_setname(&cpu_idle_stats[cpu], cx_name);
		if (error)
			return error;
	}

	cpu_mwait_halt_global = hint;
	return 0;
}

static int
cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct cpu_idle_stat *stat = arg1;
	int error;

	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
					   &stat->hint, TRUE);
	return error;
}

static int
cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
					   &cpu_mwait_spin, FALSE);
	return error;
}

/*
 * This manual debugging code is called unconditionally from Xtimer
 * (the per-cpu timer interrupt), whether or not the current thread is
 * in a critical section, and can be useful in tracking down lockups.
 *
 * NOTE: MANUAL DEBUG CODE
 */
#if 0
static int saveticks[SMP_MAXCPU];
static int savecounts[SMP_MAXCPU];
#endif

void
pcpu_timer_always(struct intrframe *frame)
{
#if 0
	globaldata_t gd = mycpu;
	int cpu = gd->gd_cpuid;
	char buf[64];
	short *gptr;
	int i;

	if (cpu <= 20) {
		gptr = (short *)0xFFFFFFFF800b8000 + 80 * cpu;
		*gptr = ((*gptr + 1) & 0x00FF) | 0x0700;
		++gptr;

		ksnprintf(buf, sizeof(buf), " %p %16s %d %16s ",
			  (void *)frame->if_rip, gd->gd_curthread->td_comm,
			  ticks, gd->gd_infomsg);
		for (i = 0; buf[i]; ++i) {
			gptr[i] = 0x0700 | (unsigned char)buf[i];
		}
	}
#if 0
	if (saveticks[gd->gd_cpuid] != ticks) {
		saveticks[gd->gd_cpuid] = ticks;
		savecounts[gd->gd_cpuid] = 0;
	}
	++savecounts[gd->gd_cpuid];
	if (savecounts[gd->gd_cpuid] > 2000 && panicstr == NULL) {
		panic("cpu %d panicking on ticks failure",
		      gd->gd_cpuid);
	}
	for (i = 0; i < ncpus; ++i) {
		int delta;
		if (saveticks[i] && panicstr == NULL) {
			delta = saveticks[i] - ticks;
			if (delta < -10 || delta > 10) {
				panic("cpu %d panicking on cpu %d watchdog",
				      gd->gd_cpuid, i);
			}
		}
	}
#endif
#endif
}

SET_DECLARE(smap_open, char);
SET_DECLARE(smap_close, char);

static void
cpu_implement_smap(void)
{
	char **scan;

	for (scan = SET_BEGIN(smap_open);		/* nop -> stac */
	     scan < SET_LIMIT(smap_open); ++scan) {
		(*scan)[0] = 0x0F;
		(*scan)[1] = 0x01;
		(*scan)[2] = 0xCB;
	}
	for (scan = SET_BEGIN(smap_close);		/* nop -> clac */
	     scan < SET_LIMIT(smap_close); ++scan) {
		(*scan)[0] = 0x0F;
		(*scan)[1] = 0x01;
		(*scan)[2] = 0xCA;
	}
}

/*
 * From a hard interrupt: report whether interrupt activity is in
 * progress (the current thread is an interrupt thread, or interrupts
 * are pending on this cpu).
 */
int
cpu_interrupt_running(struct thread *td)
{
	struct mdglobaldata *gd = mdcpu;

	if (clock_debug1 > 0) {
		--clock_debug1;
		kprintf("%d %016lx %016lx %016lx\n",
			((td->td_flags & TDF_INTTHREAD) != 0),
			gd->gd_ipending[0],
			gd->gd_ipending[1],
			gd->gd_ipending[2]);
		if (td->td_flags & TDF_CLKTHREAD) {
			kprintf("CLKTD %s PREEMPT %s\n",
				td->td_comm,
				(td->td_preempted ?
				 td->td_preempted->td_comm : ""));
		} else {
			kprintf("NORTD %s\n", td->td_comm);
		}
	}
	if ((td->td_flags & TDF_INTTHREAD) ||
	    gd->gd_ipending[0] ||
	    gd->gd_ipending[1] ||
	    gd->gd_ipending[2]) {
		return 1;
	} else {
		return 0;
	}
}