1 /*- 2 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 3 * Copyright (c) 1992 Terrence R. Lambert. 4 * Copyright (c) 2003 Peter Wemm. 5 * Copyright (c) 2008-2017 The DragonFly Project. 6 * All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * William Jolitz. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the University of 22 * California, Berkeley and its contributors. 23 * 4. Neither the name of the University nor the names of its contributors 24 * may be used to endorse or promote products derived from this software 25 * without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 37 * SUCH DAMAGE. 
38 * 39 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 40 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $ 41 */ 42 43 //#include "use_npx.h" 44 #include "use_isa.h" 45 #include "opt_cpu.h" 46 #include "opt_ddb.h" 47 #include "opt_inet.h" 48 #include "opt_msgbuf.h" 49 #include "opt_swap.h" 50 51 #include <sys/param.h> 52 #include <sys/systm.h> 53 #include <sys/sysmsg.h> 54 #include <sys/signalvar.h> 55 #include <sys/kernel.h> 56 #include <sys/linker.h> 57 #include <sys/malloc.h> 58 #include <sys/proc.h> 59 #include <sys/priv.h> 60 #include <sys/buf.h> 61 #include <sys/reboot.h> 62 #include <sys/mbuf.h> 63 #include <sys/msgbuf.h> 64 #include <sys/sysent.h> 65 #include <sys/sysctl.h> 66 #include <sys/vmmeter.h> 67 #include <sys/bus.h> 68 #include <sys/usched.h> 69 #include <sys/reg.h> 70 #include <sys/sbuf.h> 71 #include <sys/ctype.h> 72 #include <sys/serialize.h> 73 #include <sys/systimer.h> 74 75 #include <vm/vm.h> 76 #include <vm/vm_param.h> 77 #include <sys/lock.h> 78 #include <vm/vm_kern.h> 79 #include <vm/vm_object.h> 80 #include <vm/vm_page.h> 81 #include <vm/vm_map.h> 82 #include <vm/vm_pager.h> 83 #include <vm/vm_extern.h> 84 85 #include <sys/thread2.h> 86 #include <sys/mplock2.h> 87 88 #include <sys/exec.h> 89 #include <sys/cons.h> 90 91 #include <sys/efi.h> 92 93 #include <ddb/ddb.h> 94 95 #include <machine/cpu.h> 96 #include <machine/clock.h> 97 #include <machine/specialreg.h> 98 #if 0 /* JG */ 99 #include <machine/bootinfo.h> 100 #endif 101 #include <machine/md_var.h> 102 #include <machine/metadata.h> 103 #include <machine/pc/bios.h> 104 #include <machine/pcb_ext.h> 105 #include <machine/globaldata.h> /* CPU_prvspace */ 106 #include <machine/smp.h> 107 #include <machine/cputypes.h> 108 #include <machine/intr_machdep.h> 109 #include <machine/framebuffer.h> 110 111 #ifdef OLD_BUS_ARCH 112 #include <bus/isa/isa_device.h> 113 #endif 114 #include <machine_base/isa/isa_intr.h> 115 #include <bus/isa/rtc.h> 116 #include <sys/random.h> 117 #include <sys/ptrace.h> 118 #include <machine/sigframe.h> 119 120 #include <sys/machintr.h> 121 #include <machine_base/icu/icu_abi.h> 122 #include <machine_base/icu/elcr_var.h> 123 #include <machine_base/apic/lapic.h> 124 #include <machine_base/apic/ioapic.h> 125 #include <machine_base/apic/ioapic_abi.h> 126 #include <machine/mptable.h> 127 128 #define PHYSMAP_ENTRIES 10 129 #define MAXBUFSTRUCTSIZE ((size_t)512 * 1024 * 1024) 130 131 extern u_int64_t hammer_time(u_int64_t, u_int64_t); 132 133 extern void printcpuinfo(void); /* XXX header file */ 134 extern void identify_cpu(void); 135 extern void panicifcpuunsupported(void); 136 137 static void cpu_startup(void *); 138 static void pic_finish(void *); 139 static void cpu_finish(void *); 140 141 static void set_fpregs_xmm(struct save87 *, struct savexmm *); 142 static void fill_fpregs_xmm(struct savexmm *, struct save87 *); 143 static void init_locks(void); 144 145 extern void pcpu_timer_always(struct intrframe *); 146 147 SYSINIT(cpu, SI_BOOT2_START_CPU, SI_ORDER_FIRST, cpu_startup, NULL); 148 SYSINIT(pic_finish, SI_BOOT2_FINISH_PIC, SI_ORDER_FIRST, pic_finish, NULL); 149 SYSINIT(cpu_finish, SI_BOOT2_FINISH_CPU, SI_ORDER_FIRST, cpu_finish, NULL); 150 151 #ifdef DDB 152 extern vm_offset_t ksym_start, ksym_end; 153 #endif 154 155 struct privatespace CPU_prvspace_bsp __aligned(4096); 156 struct privatespace *CPU_prvspace[MAXCPU] = { &CPU_prvspace_bsp }; 157 158 vm_paddr_t efi_systbl_phys; 159 int _udatasel, _ucodesel, _ucode32sel; 160 u_long atdevbase; 161 int64_t 
tsc_offsets[MAXCPU]; 162 cpumask_t smp_idleinvl_mask; 163 cpumask_t smp_idleinvl_reqs; 164 165 /* MWAIT hint (EAX) or CPU_MWAIT_HINT_ */ 166 __read_mostly static int cpu_mwait_halt_global; 167 __read_mostly static int clock_debug1; 168 169 #if defined(SWTCH_OPTIM_STATS) 170 extern int swtch_optim_stats; 171 SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats, 172 CTLFLAG_RD, &swtch_optim_stats, 0, ""); 173 SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, 174 CTLFLAG_RD, &tlb_flush_count, 0, ""); 175 #endif 176 SYSCTL_INT(_debug, OID_AUTO, clock_debug1, 177 CTLFLAG_RW, &clock_debug1, 0, ""); 178 SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_halt, 179 CTLFLAG_RD, &cpu_mwait_halt_global, 0, ""); 180 SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_spin, 181 CTLFLAG_RD, &cpu_mwait_spin, 0, "monitor/mwait target state"); 182 183 #define CPU_MWAIT_HAS_CX \ 184 ((cpu_feature2 & CPUID2_MON) && \ 185 (cpu_mwait_feature & CPUID_MWAIT_EXT)) 186 187 #define CPU_MWAIT_CX_NAMELEN 16 188 189 #define CPU_MWAIT_C1 1 190 #define CPU_MWAIT_C2 2 191 #define CPU_MWAIT_C3 3 192 #define CPU_MWAIT_CX_MAX 8 193 194 #define CPU_MWAIT_HINT_AUTO -1 /* C1 and C2 */ 195 #define CPU_MWAIT_HINT_AUTODEEP -2 /* C3+ */ 196 197 SYSCTL_NODE(_machdep, OID_AUTO, mwait, CTLFLAG_RW, 0, "MWAIT features"); 198 SYSCTL_NODE(_machdep_mwait, OID_AUTO, CX, CTLFLAG_RW, 0, "MWAIT Cx settings"); 199 200 struct cpu_mwait_cx { 201 int subcnt; 202 char name[4]; 203 struct sysctl_ctx_list sysctl_ctx; 204 struct sysctl_oid *sysctl_tree; 205 }; 206 static struct cpu_mwait_cx cpu_mwait_cx_info[CPU_MWAIT_CX_MAX]; 207 static char cpu_mwait_cx_supported[256]; 208 209 static int cpu_mwait_c1_hints_cnt; 210 static int cpu_mwait_hints_cnt; 211 static int *cpu_mwait_hints; 212 213 static int cpu_mwait_deep_hints_cnt; 214 static int *cpu_mwait_deep_hints; 215 216 #define CPU_IDLE_REPEAT_DEFAULT 750 217 218 static u_int cpu_idle_repeat = CPU_IDLE_REPEAT_DEFAULT; 219 static u_long cpu_idle_repeat_max = CPU_IDLE_REPEAT_DEFAULT; 220 static u_int cpu_mwait_repeat_shift = 1; 221 222 #define CPU_MWAIT_C3_PREAMBLE_BM_ARB 0x1 223 #define CPU_MWAIT_C3_PREAMBLE_BM_STS 0x2 224 225 static int cpu_mwait_c3_preamble = 226 CPU_MWAIT_C3_PREAMBLE_BM_ARB | 227 CPU_MWAIT_C3_PREAMBLE_BM_STS; 228 229 SYSCTL_STRING(_machdep_mwait_CX, OID_AUTO, supported, CTLFLAG_RD, 230 cpu_mwait_cx_supported, 0, "MWAIT supported C states"); 231 SYSCTL_INT(_machdep_mwait_CX, OID_AUTO, c3_preamble, CTLFLAG_RD, 232 &cpu_mwait_c3_preamble, 0, "C3+ preamble mask"); 233 234 static int cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS, 235 int *, boolean_t); 236 static int cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS); 237 static int cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS); 238 static int cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS); 239 240 SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, idle, CTLTYPE_STRING|CTLFLAG_RW, 241 NULL, 0, cpu_mwait_cx_idle_sysctl, "A", ""); 242 SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, spin, CTLTYPE_STRING|CTLFLAG_RW, 243 NULL, 0, cpu_mwait_cx_spin_sysctl, "A", ""); 244 SYSCTL_UINT(_machdep_mwait_CX, OID_AUTO, repeat_shift, CTLFLAG_RW, 245 &cpu_mwait_repeat_shift, 0, ""); 246 247 long physmem = 0; 248 249 u_long ebda_addr = 0; 250 251 int imcr_present = 0; 252 253 int naps = 0; /* # of Applications processors */ 254 255 u_int base_memory; 256 257 static int 258 sysctl_hw_physmem(SYSCTL_HANDLER_ARGS) 259 { 260 u_long pmem = ctob(physmem); 261 int error; 262 263 error = sysctl_handle_long(oidp, &pmem, 0, req); 264 265 return (error); 266 } 267 268 SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, 
CTLTYPE_ULONG|CTLFLAG_RD, 269 0, 0, sysctl_hw_physmem, "LU", 270 "Total system memory in bytes (number of pages * page size)"); 271 272 static int 273 sysctl_hw_usermem(SYSCTL_HANDLER_ARGS) 274 { 275 u_long usermem = ctob(physmem - vmstats.v_wire_count); 276 int error; 277 278 error = sysctl_handle_long(oidp, &usermem, 0, req); 279 280 return (error); 281 } 282 283 SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_ULONG|CTLFLAG_RD, 284 0, 0, sysctl_hw_usermem, "LU", ""); 285 286 static int 287 sysctl_hw_availpages(SYSCTL_HANDLER_ARGS) 288 { 289 int error; 290 u_long availpages; 291 292 availpages = x86_64_btop(avail_end - avail_start); 293 error = sysctl_handle_long(oidp, &availpages, 0, req); 294 295 return (error); 296 } 297 298 SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_ULONG|CTLFLAG_RD, 299 0, 0, sysctl_hw_availpages, "LU", ""); 300 301 vm_paddr_t Maxmem; 302 vm_paddr_t Realmem; 303 304 /* 305 * The number of PHYSMAP entries must be one less than the number of 306 * PHYSSEG entries because the PHYSMAP entry that spans the largest 307 * physical address that is accessible by ISA DMA is split into two 308 * PHYSSEG entries. 309 */ 310 vm_phystable_t phys_avail[VM_PHYSSEG_MAX + 1]; 311 vm_phystable_t dump_avail[VM_PHYSSEG_MAX + 1]; 312 313 /* must be 1 less so 0 0 can signal end of chunks */ 314 #define PHYS_AVAIL_ARRAY_END (NELEM(phys_avail) - 1) 315 #define DUMP_AVAIL_ARRAY_END (NELEM(dump_avail) - 1) 316 317 static vm_offset_t buffer_sva, buffer_eva; 318 vm_offset_t clean_sva, clean_eva; 319 static vm_offset_t pager_sva, pager_eva; 320 static struct trapframe proc0_tf; 321 322 static void cpu_implement_smap(void); 323 324 static void 325 cpu_startup(void *dummy) 326 { 327 caddr_t v; 328 vm_size_t size = 0; 329 vm_offset_t firstaddr; 330 331 /* 332 * Good {morning,afternoon,evening,night}. 333 */ 334 kprintf("%s", version); 335 startrtclock(); 336 printcpuinfo(); 337 panicifcpuunsupported(); 338 if (cpu_stdext_feature & CPUID_STDEXT_SMAP) 339 cpu_implement_smap(); 340 341 kprintf("real memory = %ju (%ju MB)\n", 342 (intmax_t)Realmem, 343 (intmax_t)Realmem / 1024 / 1024); 344 /* 345 * Display any holes after the first chunk of extended memory. 346 */ 347 if (bootverbose) { 348 int indx; 349 350 kprintf("Physical memory chunk(s):\n"); 351 for (indx = 0; phys_avail[indx].phys_end != 0; ++indx) { 352 vm_paddr_t size1; 353 354 size1 = phys_avail[indx].phys_end - 355 phys_avail[indx].phys_beg; 356 357 kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n", 358 (intmax_t)phys_avail[indx].phys_beg, 359 (intmax_t)phys_avail[indx].phys_end - 1, 360 (intmax_t)size1, 361 (intmax_t)(size1 / PAGE_SIZE)); 362 } 363 } 364 365 /* 366 * Allocate space for system data structures. 367 * The first available kernel virtual address is in "v". 368 * As pages of kernel virtual memory are allocated, "v" is incremented. 369 * As pages of memory are allocated and cleared, 370 * "firstaddr" is incremented. 371 * An index into the kernel page table corresponding to the 372 * virtual memory address maintained in "v" is kept in "mapaddr". 373 */ 374 375 /* 376 * Make two passes. The first pass calculates how much memory is 377 * needed and allocates it. The second pass assigns virtual 378 * addresses to the various data structures. 
379 */ 380 firstaddr = 0; 381 again: 382 v = (caddr_t)firstaddr; 383 384 #define valloc(name, type, num) \ 385 (name) = (type *)v; v = (caddr_t)((name)+(num)) 386 #define valloclim(name, type, num, lim) \ 387 (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) 388 389 /* 390 * Calculate nbuf such that maxbufspace uses approximately 1/20 391 * of physical memory by default, with a minimum of 50 buffers. 392 * 393 * The calculation is made after discounting 128MB. 394 * 395 * NOTE: maxbufspace is (nbuf * NBUFCALCSIZE) (NBUFCALCSIZE ~= 16KB). 396 * nbuf = (kbytes / factor) would cover all of memory. 397 */ 398 if (nbuf == 0) { 399 long factor = NBUFCALCSIZE / 1024; /* KB/nbuf */ 400 long kbytes = physmem * (PAGE_SIZE / 1024); /* physmem */ 401 402 nbuf = 50; 403 if (kbytes > 128 * 1024) 404 nbuf += (kbytes - 128 * 1024) / (factor * 20); 405 if (maxbcache && nbuf > maxbcache / NBUFCALCSIZE) 406 nbuf = maxbcache / NBUFCALCSIZE; 407 if ((size_t)nbuf * sizeof(struct buf) > MAXBUFSTRUCTSIZE) { 408 kprintf("Warning: nbuf capped at %ld due to the " 409 "reasonability limit\n", nbuf); 410 nbuf = MAXBUFSTRUCTSIZE / sizeof(struct buf); 411 } 412 } 413 414 /* 415 * Do not allow the buffer_map to be more then 1/2 the size of the 416 * kernel_map. 417 */ 418 if (nbuf > (virtual_end - virtual_start + 419 virtual2_end - virtual2_start) / (MAXBSIZE * 2)) { 420 nbuf = (virtual_end - virtual_start + 421 virtual2_end - virtual2_start) / (MAXBSIZE * 2); 422 kprintf("Warning: nbufs capped at %ld due to kvm\n", nbuf); 423 } 424 425 /* 426 * Do not allow the buffer_map to use more than 50% of available 427 * physical-equivalent memory. Since the VM pages which back 428 * individual buffers are typically wired, having too many bufs 429 * can prevent the system from paging properly. 430 */ 431 if (nbuf > physmem * PAGE_SIZE / (NBUFCALCSIZE * 2)) { 432 nbuf = physmem * PAGE_SIZE / (NBUFCALCSIZE * 2); 433 kprintf("Warning: nbufs capped at %ld due to physmem\n", nbuf); 434 } 435 436 /* 437 * Do not allow the sizeof(struct buf) * nbuf to exceed 1/4 of 438 * the valloc space which is just the virtual_end - virtual_start 439 * section. This is typically ~2GB regardless of the amount of 440 * memory, so we use 500MB as a metric. 441 * 442 * This is because we use valloc() to allocate the buf header array. 443 * 444 * NOTE: buffer space in bytes is limited by vfs.*bufspace sysctls. 445 */ 446 if (nbuf > (virtual_end - virtual_start) / (sizeof(struct buf) * 4)) { 447 nbuf = (virtual_end - virtual_start) / 448 (sizeof(struct buf) * 4); 449 kprintf("Warning: nbufs capped at %ld due to " 450 "valloc considerations\n", 451 nbuf); 452 } 453 454 nswbuf_mem = lmax(lmin(nbuf / 32, 512), 8); 455 #ifdef NSWBUF_MIN 456 if (nswbuf_mem < NSWBUF_MIN) 457 nswbuf_mem = NSWBUF_MIN; 458 #endif 459 nswbuf_kva = lmax(lmin(nbuf / 4, 512), 16); 460 #ifdef NSWBUF_MIN 461 if (nswbuf_kva < NSWBUF_MIN) 462 nswbuf_kva = NSWBUF_MIN; 463 #endif 464 465 valloc(swbuf_mem, struct buf, nswbuf_mem); 466 valloc(swbuf_kva, struct buf, nswbuf_kva); 467 valloc(buf, struct buf, nbuf); 468 469 /* 470 * End of first pass, size has been calculated so allocate memory 471 */ 472 if (firstaddr == 0) { 473 size = (vm_size_t)(v - firstaddr); 474 firstaddr = kmem_alloc(&kernel_map, round_page(size), 475 VM_SUBSYS_BUF); 476 if (firstaddr == 0) 477 panic("startup: no room for tables"); 478 goto again; 479 } 480 481 /* 482 * End of second pass, addresses have been assigned 483 * 484 * nbuf is an int, make sure we don't overflow the field. 
485 * 486 * On 64-bit systems we always reserve maximal allocations for 487 * buffer cache buffers and there are no fragmentation issues, 488 * so the KVA segment does not have to be excessively oversized. 489 */ 490 if ((vm_size_t)(v - firstaddr) != size) 491 panic("startup: table size inconsistency"); 492 493 kmem_suballoc(&kernel_map, &clean_map, &clean_sva, &clean_eva, 494 ((vm_offset_t)(nbuf + 16) * MAXBSIZE) + 495 ((nswbuf_mem + nswbuf_kva) * MAXPHYS) + pager_map_size); 496 kmem_suballoc(&clean_map, &buffer_map, &buffer_sva, &buffer_eva, 497 ((vm_offset_t)(nbuf + 16) * MAXBSIZE)); 498 buffer_map.system_map = 1; 499 kmem_suballoc(&clean_map, &pager_map, &pager_sva, &pager_eva, 500 ((vm_offset_t)(nswbuf_mem + nswbuf_kva) * MAXPHYS) + 501 pager_map_size); 502 pager_map.system_map = 1; 503 kprintf("avail memory = %ju (%ju MB)\n", 504 (uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages), 505 (uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages) / 506 1024 / 1024); 507 } 508 509 struct cpu_idle_stat { 510 int hint; 511 int reserved; 512 u_long halt; 513 u_long spin; 514 u_long repeat; 515 u_long repeat_last; 516 u_long repeat_delta; 517 u_long mwait_cx[CPU_MWAIT_CX_MAX]; 518 } __cachealign; 519 520 #define CPU_IDLE_STAT_HALT -1 521 #define CPU_IDLE_STAT_SPIN -2 522 523 static struct cpu_idle_stat cpu_idle_stats[MAXCPU]; 524 525 static int 526 sysctl_cpu_idle_cnt(SYSCTL_HANDLER_ARGS) 527 { 528 int idx = arg2, cpu, error; 529 u_long val = 0; 530 531 if (idx == CPU_IDLE_STAT_HALT) { 532 for (cpu = 0; cpu < ncpus; ++cpu) 533 val += cpu_idle_stats[cpu].halt; 534 } else if (idx == CPU_IDLE_STAT_SPIN) { 535 for (cpu = 0; cpu < ncpus; ++cpu) 536 val += cpu_idle_stats[cpu].spin; 537 } else { 538 KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX, 539 ("invalid index %d", idx)); 540 for (cpu = 0; cpu < ncpus; ++cpu) 541 val += cpu_idle_stats[cpu].mwait_cx[idx]; 542 } 543 544 error = sysctl_handle_quad(oidp, &val, 0, req); 545 if (error || req->newptr == NULL) 546 return error; 547 548 if (idx == CPU_IDLE_STAT_HALT) { 549 for (cpu = 0; cpu < ncpus; ++cpu) 550 cpu_idle_stats[cpu].halt = 0; 551 cpu_idle_stats[0].halt = val; 552 } else if (idx == CPU_IDLE_STAT_SPIN) { 553 for (cpu = 0; cpu < ncpus; ++cpu) 554 cpu_idle_stats[cpu].spin = 0; 555 cpu_idle_stats[0].spin = val; 556 } else { 557 KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX, 558 ("invalid index %d", idx)); 559 for (cpu = 0; cpu < ncpus; ++cpu) 560 cpu_idle_stats[cpu].mwait_cx[idx] = 0; 561 cpu_idle_stats[0].mwait_cx[idx] = val; 562 } 563 return 0; 564 } 565 566 static void 567 cpu_mwait_attach(void) 568 { 569 struct sbuf sb; 570 int hint_idx, i; 571 572 if (!CPU_MWAIT_HAS_CX) 573 return; 574 575 if (cpu_vendor_id == CPU_VENDOR_INTEL && 576 (CPUID_TO_FAMILY(cpu_id) > 0xf || 577 (CPUID_TO_FAMILY(cpu_id) == 0x6 && 578 CPUID_TO_MODEL(cpu_id) >= 0xf))) { 579 int bm_sts = 1; 580 581 /* 582 * Pentium dual-core, Core 2 and beyond do not need any 583 * additional activities to enter deep C-state, i.e. C3(+). 
584 */ 585 cpu_mwait_cx_no_bmarb(); 586 587 TUNABLE_INT_FETCH("machdep.cpu.mwait.bm_sts", &bm_sts); 588 if (!bm_sts) 589 cpu_mwait_cx_no_bmsts(); 590 } 591 592 sbuf_new(&sb, cpu_mwait_cx_supported, 593 sizeof(cpu_mwait_cx_supported), SBUF_FIXEDLEN); 594 595 for (i = 0; i < CPU_MWAIT_CX_MAX; ++i) { 596 struct cpu_mwait_cx *cx = &cpu_mwait_cx_info[i]; 597 int sub; 598 599 ksnprintf(cx->name, sizeof(cx->name), "C%d", i); 600 601 sysctl_ctx_init(&cx->sysctl_ctx); 602 cx->sysctl_tree = SYSCTL_ADD_NODE(&cx->sysctl_ctx, 603 SYSCTL_STATIC_CHILDREN(_machdep_mwait), OID_AUTO, 604 cx->name, CTLFLAG_RW, NULL, "Cx control/info"); 605 if (cx->sysctl_tree == NULL) 606 continue; 607 608 cx->subcnt = CPUID_MWAIT_CX_SUBCNT(cpu_mwait_extemu, i); 609 SYSCTL_ADD_INT(&cx->sysctl_ctx, 610 SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO, 611 "subcnt", CTLFLAG_RD, &cx->subcnt, 0, 612 "sub-state count"); 613 SYSCTL_ADD_PROC(&cx->sysctl_ctx, 614 SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO, 615 "entered", (CTLTYPE_QUAD | CTLFLAG_RW), 0, 616 i, sysctl_cpu_idle_cnt, "Q", "# of times entered"); 617 618 for (sub = 0; sub < cx->subcnt; ++sub) 619 sbuf_printf(&sb, "C%d/%d ", i, sub); 620 } 621 sbuf_trim(&sb); 622 sbuf_finish(&sb); 623 624 /* 625 * Non-deep C-states 626 */ 627 cpu_mwait_c1_hints_cnt = cpu_mwait_cx_info[CPU_MWAIT_C1].subcnt; 628 for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i) 629 cpu_mwait_hints_cnt += cpu_mwait_cx_info[i].subcnt; 630 cpu_mwait_hints = kmalloc(sizeof(int) * cpu_mwait_hints_cnt, 631 M_DEVBUF, M_WAITOK); 632 633 hint_idx = 0; 634 for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i) { 635 int j, subcnt; 636 637 subcnt = cpu_mwait_cx_info[i].subcnt; 638 for (j = 0; j < subcnt; ++j) { 639 KASSERT(hint_idx < cpu_mwait_hints_cnt, 640 ("invalid mwait hint index %d", hint_idx)); 641 cpu_mwait_hints[hint_idx] = MWAIT_EAX_HINT(i, j); 642 ++hint_idx; 643 } 644 } 645 KASSERT(hint_idx == cpu_mwait_hints_cnt, 646 ("mwait hint count %d != index %d", 647 cpu_mwait_hints_cnt, hint_idx)); 648 649 if (bootverbose) { 650 kprintf("MWAIT hints (%d C1 hints):\n", cpu_mwait_c1_hints_cnt); 651 for (i = 0; i < cpu_mwait_hints_cnt; ++i) { 652 int hint = cpu_mwait_hints[i]; 653 654 kprintf(" C%d/%d hint 0x%04x\n", 655 MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint), 656 hint); 657 } 658 } 659 660 /* 661 * Deep C-states 662 */ 663 for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i) 664 cpu_mwait_deep_hints_cnt += cpu_mwait_cx_info[i].subcnt; 665 cpu_mwait_deep_hints = kmalloc(sizeof(int) * cpu_mwait_deep_hints_cnt, 666 M_DEVBUF, M_WAITOK); 667 668 hint_idx = 0; 669 for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i) { 670 int j, subcnt; 671 672 subcnt = cpu_mwait_cx_info[i].subcnt; 673 for (j = 0; j < subcnt; ++j) { 674 KASSERT(hint_idx < cpu_mwait_deep_hints_cnt, 675 ("invalid mwait deep hint index %d", hint_idx)); 676 cpu_mwait_deep_hints[hint_idx] = MWAIT_EAX_HINT(i, j); 677 ++hint_idx; 678 } 679 } 680 KASSERT(hint_idx == cpu_mwait_deep_hints_cnt, 681 ("mwait deep hint count %d != index %d", 682 cpu_mwait_deep_hints_cnt, hint_idx)); 683 684 if (bootverbose) { 685 kprintf("MWAIT deep hints:\n"); 686 for (i = 0; i < cpu_mwait_deep_hints_cnt; ++i) { 687 int hint = cpu_mwait_deep_hints[i]; 688 689 kprintf(" C%d/%d hint 0x%04x\n", 690 MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint), 691 hint); 692 } 693 } 694 cpu_idle_repeat_max = 256 * cpu_mwait_deep_hints_cnt; 695 696 for (i = 0; i < ncpus; ++i) { 697 char name[16]; 698 699 ksnprintf(name, sizeof(name), "idle%d", i); 700 SYSCTL_ADD_PROC(NULL, 701 
SYSCTL_STATIC_CHILDREN(_machdep_mwait_CX), OID_AUTO, 702 name, (CTLTYPE_STRING | CTLFLAG_RW), &cpu_idle_stats[i], 703 0, cpu_mwait_cx_pcpu_idle_sysctl, "A", ""); 704 } 705 } 706 707 static void 708 cpu_finish(void *dummy __unused) 709 { 710 cpu_setregs(); 711 cpu_mwait_attach(); 712 } 713 714 static void 715 pic_finish(void *dummy __unused) 716 { 717 /* Log ELCR information */ 718 elcr_dump(); 719 720 /* Log MPTABLE information */ 721 mptable_pci_int_dump(); 722 723 /* Finalize PCI */ 724 MachIntrABI.finalize(); 725 } 726 727 /* 728 * Send an interrupt to process. 729 * 730 * Stack is set up to allow sigcode stored 731 * at top to call routine, followed by kcall 732 * to sigreturn routine below. After sigreturn 733 * resets the signal mask, the stack, and the 734 * frame pointer, it returns to the user 735 * specified pc, psl. 736 */ 737 void 738 sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code) 739 { 740 struct lwp *lp = curthread->td_lwp; 741 struct proc *p = lp->lwp_proc; 742 struct trapframe *regs; 743 struct sigacts *psp = p->p_sigacts; 744 struct sigframe sf, *sfp; 745 int oonstack; 746 char *sp; 747 748 regs = lp->lwp_md.md_regs; 749 oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0; 750 751 /* Save user context */ 752 bzero(&sf, sizeof(struct sigframe)); 753 sf.sf_uc.uc_sigmask = *mask; 754 sf.sf_uc.uc_stack = lp->lwp_sigstk; 755 sf.sf_uc.uc_mcontext.mc_onstack = oonstack; 756 KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0); 757 /* gcc errors out on optimized bcopy */ 758 _bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe)); 759 760 /* Make the size of the saved context visible to userland */ 761 sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); 762 763 /* Allocate and validate space for the signal handler context. */ 764 if ((lp->lwp_flags & LWP_ALTSTACK) != 0 && !oonstack && 765 SIGISMEMBER(psp->ps_sigonstack, sig)) { 766 sp = (char *)lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size - 767 sizeof(struct sigframe); 768 lp->lwp_sigstk.ss_flags |= SS_ONSTACK; 769 } else { 770 /* We take red zone into account */ 771 sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128; 772 } 773 774 /* 775 * XXX AVX needs 64-byte alignment but sigframe has other fields and 776 * the embedded ucontext is not at the front, so aligning this won't 777 * help us. Fortunately we bcopy in/out of the sigframe, so the 778 * kernel is ok. 779 * 780 * The problem though is if userland winds up trying to use the 781 * context directly. 782 */ 783 sfp = (struct sigframe *)((intptr_t)sp & ~(intptr_t)0xF); 784 785 /* Translate the signal is appropriate */ 786 if (p->p_sysent->sv_sigtbl) { 787 if (sig <= p->p_sysent->sv_sigsize) 788 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; 789 } 790 791 /* 792 * Build the argument list for the signal handler. 793 * 794 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx) 795 */ 796 regs->tf_rdi = sig; /* argument 1 */ 797 regs->tf_rdx = (register_t)&sfp->sf_uc; /* argument 3 */ 798 799 if (SIGISMEMBER(psp->ps_siginfo, sig)) { 800 /* 801 * Signal handler installed with SA_SIGINFO. 
802 * 803 * action(signo, siginfo, ucontext) 804 */ 805 regs->tf_rsi = (register_t)&sfp->sf_si; /* argument 2 */ 806 regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */ 807 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; 808 809 /* fill siginfo structure */ 810 sf.sf_si.si_signo = sig; 811 sf.sf_si.si_pid = psp->ps_frominfo[sig].pid; 812 sf.sf_si.si_uid = psp->ps_frominfo[sig].uid; 813 sf.sf_si.si_code = code; 814 sf.sf_si.si_addr = (void *)regs->tf_addr; 815 } else { 816 /* 817 * Old FreeBSD-style arguments. 818 * 819 * handler (signo, code, [uc], addr) 820 */ 821 regs->tf_rsi = (register_t)code; /* argument 2 */ 822 regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */ 823 sf.sf_ahu.sf_handler = catcher; 824 } 825 826 /* 827 * If we're a vm86 process, we want to save the segment registers. 828 * We also change eflags to be our emulated eflags, not the actual 829 * eflags. 830 */ 831 #if 0 /* JG */ 832 if (regs->tf_eflags & PSL_VM) { 833 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 834 struct vm86_kernel *vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86; 835 836 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; 837 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; 838 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; 839 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; 840 841 if (vm86->vm86_has_vme == 0) 842 sf.sf_uc.uc_mcontext.mc_eflags = 843 (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | 844 (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); 845 846 /* 847 * Clear PSL_NT to inhibit T_TSSFLT faults on return from 848 * syscalls made by the signal handler. This just avoids 849 * wasting time for our lazy fixup of such faults. PSL_NT 850 * does nothing in vm86 mode, but vm86 programs can set it 851 * almost legitimately in probes for old cpu types. 852 */ 853 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); 854 } 855 #endif 856 857 /* 858 * Save the FPU state and reinit the FP unit 859 */ 860 npxpush(&sf.sf_uc.uc_mcontext); 861 862 /* 863 * Copy the sigframe out to the user's stack. 864 */ 865 if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) { 866 /* 867 * Something is wrong with the stack pointer. 868 * ...Kill the process. 869 */ 870 sigexit(lp, SIGILL); 871 } 872 873 regs->tf_rsp = (register_t)sfp; 874 regs->tf_rip = trunc_page64(PS_STRINGS - *(p->p_sysent->sv_szsigcode)); 875 regs->tf_rip -= SZSIGCODE_EXTRA_BYTES; 876 877 /* 878 * x86 abi specifies that the direction flag must be cleared 879 * on function entry 880 */ 881 regs->tf_rflags &= ~(PSL_T | PSL_D); 882 883 /* 884 * 64 bit mode has a code and stack selector but 885 * no data or extra selector. %fs and %gs are not 886 * stored in-context. 887 */ 888 regs->tf_cs = _ucodesel; 889 regs->tf_ss = _udatasel; 890 clear_quickret(); 891 } 892 893 /* 894 * Sanitize the trapframe for a virtual kernel passing control to a custom 895 * VM context. Remove any items that would otherwise create a privilage 896 * issue. 897 * 898 * XXX at the moment we allow userland to set the resume flag. Is this a 899 * bad idea? 900 */ 901 int 902 cpu_sanitize_frame(struct trapframe *frame) 903 { 904 frame->tf_cs = _ucodesel; 905 frame->tf_ss = _udatasel; 906 /* XXX VM (8086) mode not supported? */ 907 frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP); 908 frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I; 909 910 return(0); 911 } 912 913 /* 914 * Sanitize the tls so loading the descriptor does not blow up 915 * on us. For x86_64 we don't have to do anything. 
916 */ 917 int 918 cpu_sanitize_tls(struct savetls *tls) 919 { 920 return(0); 921 } 922 923 /* 924 * sigreturn(ucontext_t *sigcntxp) 925 * 926 * System call to cleanup state after a signal 927 * has been taken. Reset signal mask and 928 * stack state from context left by sendsig (above). 929 * Return to previous pc and psl as specified by 930 * context left by sendsig. Check carefully to 931 * make sure that the user has not modified the 932 * state to gain improper privileges. 933 * 934 * MPSAFE 935 */ 936 #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) 937 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) 938 939 int 940 sys_sigreturn(struct sysmsg *sysmsg, const struct sigreturn_args *uap) 941 { 942 struct lwp *lp = curthread->td_lwp; 943 struct trapframe *regs; 944 ucontext_t uc; 945 ucontext_t *ucp; 946 register_t rflags; 947 int cs; 948 int error; 949 950 /* 951 * We have to copy the information into kernel space so userland 952 * can't modify it while we are sniffing it. 953 */ 954 regs = lp->lwp_md.md_regs; 955 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 956 if (error) 957 return (error); 958 ucp = &uc; 959 rflags = ucp->uc_mcontext.mc_rflags; 960 961 /* VM (8086) mode not supported */ 962 rflags &= ~PSL_VM_UNSUPP; 963 964 #if 0 /* JG */ 965 if (eflags & PSL_VM) { 966 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; 967 struct vm86_kernel *vm86; 968 969 /* 970 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't 971 * set up the vm86 area, and we can't enter vm86 mode. 972 */ 973 if (lp->lwp_thread->td_pcb->pcb_ext == 0) 974 return (EINVAL); 975 vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86; 976 if (vm86->vm86_inited == 0) 977 return (EINVAL); 978 979 /* go back to user mode if both flags are set */ 980 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) 981 trapsignal(lp, SIGBUS, 0); 982 983 if (vm86->vm86_has_vme) { 984 eflags = (tf->tf_eflags & ~VME_USERCHANGE) | 985 (eflags & VME_USERCHANGE) | PSL_VM; 986 } else { 987 vm86->vm86_eflags = eflags; /* save VIF, VIP */ 988 eflags = (tf->tf_eflags & ~VM_USERCHANGE) | 989 (eflags & VM_USERCHANGE) | PSL_VM; 990 } 991 bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe)); 992 tf->tf_eflags = eflags; 993 tf->tf_vm86_ds = tf->tf_ds; 994 tf->tf_vm86_es = tf->tf_es; 995 tf->tf_vm86_fs = tf->tf_fs; 996 tf->tf_vm86_gs = tf->tf_gs; 997 tf->tf_ds = _udatasel; 998 tf->tf_es = _udatasel; 999 tf->tf_fs = _udatasel; 1000 tf->tf_gs = _udatasel; 1001 } else 1002 #endif 1003 { 1004 /* 1005 * Don't allow users to change privileged or reserved flags. 1006 */ 1007 /* 1008 * XXX do allow users to change the privileged flag PSL_RF. 1009 * The cpu sets PSL_RF in tf_eflags for faults. Debuggers 1010 * should sometimes set it there too. tf_eflags is kept in 1011 * the signal context during signal handling and there is no 1012 * other place to remember it, so the PSL_RF bit may be 1013 * corrupted by the signal handler without us knowing. 1014 * Corruption of the PSL_RF bit at worst causes one more or 1015 * one less debugger trap, so allowing it is fairly harmless. 1016 */ 1017 if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) { 1018 kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags); 1019 return(EINVAL); 1020 } 1021 1022 /* 1023 * Don't allow users to load a valid privileged %cs. Let the 1024 * hardware check for invalid selectors, excess privilege in 1025 * other selectors, invalid %eip's and invalid %esp's. 
1026 */ 1027 cs = ucp->uc_mcontext.mc_cs; 1028 if (!CS_SECURE(cs)) { 1029 kprintf("sigreturn: cs = 0x%x\n", cs); 1030 trapsignal(lp, SIGBUS, T_PROTFLT); 1031 return(EINVAL); 1032 } 1033 /* gcc errors out on optimized bcopy */ 1034 _bcopy(&ucp->uc_mcontext.mc_rdi, regs, 1035 sizeof(struct trapframe)); 1036 } 1037 1038 /* 1039 * Restore the FPU state from the frame 1040 */ 1041 crit_enter(); 1042 npxpop(&ucp->uc_mcontext); 1043 1044 if (ucp->uc_mcontext.mc_onstack & 1) 1045 lp->lwp_sigstk.ss_flags |= SS_ONSTACK; 1046 else 1047 lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK; 1048 1049 lp->lwp_sigmask = ucp->uc_sigmask; 1050 SIG_CANTMASK(lp->lwp_sigmask); 1051 clear_quickret(); 1052 crit_exit(); 1053 return(EJUSTRETURN); 1054 } 1055 1056 /* 1057 * Machine dependent boot() routine 1058 * 1059 * I haven't seen anything to put here yet 1060 * Possibly some stuff might be grafted back here from boot() 1061 */ 1062 void 1063 cpu_boot(int howto) 1064 { 1065 } 1066 1067 /* 1068 * Shutdown the CPU as much as possible 1069 */ 1070 void 1071 cpu_halt(void) 1072 { 1073 for (;;) 1074 __asm__ __volatile("hlt"); 1075 } 1076 1077 /* 1078 * cpu_idle() represents the idle LWKT. You cannot return from this function 1079 * (unless you want to blow things up!). Instead we look for runnable threads 1080 * and loop or halt as appropriate. Giant is not held on entry to the thread. 1081 * 1082 * The main loop is entered with a critical section held, we must release 1083 * the critical section before doing anything else. lwkt_switch() will 1084 * check for pending interrupts due to entering and exiting its own 1085 * critical section. 1086 * 1087 * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up. 1088 * However, there are cases where the idlethread will be entered with 1089 * the possibility that no IPI will occur and in such cases 1090 * lwkt_switch() sets TDF_IDLE_NOHLT. 1091 * 1092 * NOTE: cpu_idle_repeat determines how many entries into the idle thread 1093 * must occur before it starts using ACPI halt. 1094 * 1095 * NOTE: Value overridden in hammer_time(). 1096 */ 1097 static int cpu_idle_hlt = 2; 1098 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW, 1099 &cpu_idle_hlt, 0, "Idle loop HLT enable"); 1100 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_repeat, CTLFLAG_RW, 1101 &cpu_idle_repeat, 0, "Idle entries before acpi hlt"); 1102 1103 SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_hltcnt, (CTLTYPE_QUAD | CTLFLAG_RW), 1104 0, CPU_IDLE_STAT_HALT, sysctl_cpu_idle_cnt, "Q", "Idle loop entry halts"); 1105 SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_spincnt, (CTLTYPE_QUAD | CTLFLAG_RW), 1106 0, CPU_IDLE_STAT_SPIN, sysctl_cpu_idle_cnt, "Q", "Idle loop entry spins"); 1107 1108 static void 1109 cpu_idle_default_hook(void) 1110 { 1111 /* 1112 * We must guarentee that hlt is exactly the instruction 1113 * following the sti. 1114 */ 1115 __asm __volatile("sti; hlt"); 1116 } 1117 1118 /* Other subsystems (e.g., ACPI) can hook this later. 
*/ 1119 void (*cpu_idle_hook)(void) = cpu_idle_default_hook; 1120 1121 static __inline int 1122 cpu_mwait_cx_hint(struct cpu_idle_stat *stat) 1123 { 1124 int hint, cx_idx; 1125 u_int idx; 1126 1127 hint = stat->hint; 1128 if (hint >= 0) 1129 goto done; 1130 1131 idx = (stat->repeat + stat->repeat_last + stat->repeat_delta) >> 1132 cpu_mwait_repeat_shift; 1133 if (idx >= cpu_mwait_c1_hints_cnt) { 1134 /* Step up faster, once we walked through all C1 states */ 1135 stat->repeat_delta += 1 << (cpu_mwait_repeat_shift + 1); 1136 } 1137 if (hint == CPU_MWAIT_HINT_AUTODEEP) { 1138 if (idx >= cpu_mwait_deep_hints_cnt) 1139 idx = cpu_mwait_deep_hints_cnt - 1; 1140 hint = cpu_mwait_deep_hints[idx]; 1141 } else { 1142 if (idx >= cpu_mwait_hints_cnt) 1143 idx = cpu_mwait_hints_cnt - 1; 1144 hint = cpu_mwait_hints[idx]; 1145 } 1146 done: 1147 cx_idx = MWAIT_EAX_TO_CX(hint); 1148 if (cx_idx >= 0 && cx_idx < CPU_MWAIT_CX_MAX) 1149 stat->mwait_cx[cx_idx]++; 1150 return hint; 1151 } 1152 1153 void 1154 cpu_idle(void) 1155 { 1156 globaldata_t gd = mycpu; 1157 struct cpu_idle_stat *stat = &cpu_idle_stats[gd->gd_cpuid]; 1158 struct thread *td __debugvar = gd->gd_curthread; 1159 int reqflags; 1160 1161 stat->repeat = stat->repeat_last = cpu_idle_repeat_max; 1162 1163 crit_exit(); 1164 KKASSERT(td->td_critcount == 0); 1165 1166 for (;;) { 1167 /* 1168 * See if there are any LWKTs ready to go. 1169 */ 1170 lwkt_switch(); 1171 1172 /* 1173 * When halting inside a cli we must check for reqflags 1174 * races, particularly [re]schedule requests. Running 1175 * splz() does the job. 1176 * 1177 * cpu_idle_hlt: 1178 * 0 Never halt, just spin 1179 * 1180 * 1 Always use MONITOR/MWAIT if avail, HLT 1181 * otherwise. 1182 * 1183 * Better default for modern (Haswell+) Intel 1184 * cpus. 1185 * 1186 * 2 Use HLT/MONITOR/MWAIT up to a point and then 1187 * use the ACPI halt (default). This is a hybrid 1188 * approach. See machdep.cpu_idle_repeat. 1189 * 1190 * Better default for modern AMD cpus and older 1191 * Intel cpus. 1192 * 1193 * 3 Always use the ACPI halt. This typically 1194 * eats the least amount of power but the cpu 1195 * will be slow waking up. Slows down e.g. 1196 * compiles and other pipe/event oriented stuff. 1197 * 1198 * Usually the best default for AMD cpus. 1199 * 1200 * 4 Always use HLT. 1201 * 1202 * 5 Always spin. 1203 * 1204 * NOTE: Interrupts are enabled and we are not in a critical 1205 * section. 1206 * 1207 * NOTE: Preemptions do not reset gd_idle_repeat. Also we 1208 * don't bother capping gd_idle_repeat, it is ok if 1209 * it overflows (we do make it unsigned, however). 1210 * 1211 * Implement optimized invltlb operations when halted 1212 * in idle. By setting the bit in smp_idleinvl_mask 1213 * we inform other cpus that they can set _reqs to 1214 * request an invltlb. Current the code to do that 1215 * sets the bits in _reqs anyway, but then check _mask 1216 * to determine if they can assume the invltlb will execute. 1217 * 1218 * A critical section is required to ensure that interrupts 1219 * do not fully run until after we've had a chance to execute 1220 * the request. 1221 */ 1222 if (gd->gd_idle_repeat == 0) { 1223 stat->repeat = (stat->repeat + stat->repeat_last) >> 1; 1224 if (stat->repeat > cpu_idle_repeat_max) 1225 stat->repeat = cpu_idle_repeat_max; 1226 stat->repeat_last = 0; 1227 stat->repeat_delta = 0; 1228 } 1229 ++stat->repeat_last; 1230 1231 /* 1232 * General idle thread halt code 1233 * 1234 * IBRS NOTES - IBRS is a SPECTRE mitigation. 
When going 1235 * idle, disable IBRS to reduce hyperthread 1236 * overhead. 1237 */ 1238 ++gd->gd_idle_repeat; 1239 1240 switch(cpu_idle_hlt) { 1241 default: 1242 case 0: 1243 /* 1244 * Always spin 1245 */ 1246 ; 1247 do_spin: 1248 splz(); 1249 __asm __volatile("sti"); 1250 stat->spin++; 1251 crit_enter_gd(gd); 1252 crit_exit_gd(gd); 1253 break; 1254 case 2: 1255 /* 1256 * Use MONITOR/MWAIT (or HLT) for a few cycles, 1257 * then start using the ACPI halt code if we 1258 * continue to be idle. 1259 */ 1260 if (gd->gd_idle_repeat >= cpu_idle_repeat) 1261 goto do_acpi; 1262 /* FALL THROUGH */ 1263 case 1: 1264 /* 1265 * Always use MONITOR/MWAIT (will use HLT if 1266 * MONITOR/MWAIT not available). 1267 */ 1268 if (cpu_mi_feature & CPU_MI_MONITOR) { 1269 splz(); /* XXX */ 1270 reqflags = gd->gd_reqflags; 1271 if (reqflags & RQF_IDLECHECK_WK_MASK) 1272 goto do_spin; 1273 crit_enter_gd(gd); 1274 ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask, gd->gd_cpuid); 1275 /* 1276 * IBRS/STIBP 1277 */ 1278 if (pscpu->trampoline.tr_pcb_spec_ctrl[1] & 1279 SPEC_CTRL_DUMMY_ENABLE) { 1280 wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[1] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP)); 1281 } 1282 cpu_mmw_pause_int(&gd->gd_reqflags, reqflags, 1283 cpu_mwait_cx_hint(stat), 0); 1284 if (pscpu->trampoline.tr_pcb_spec_ctrl[0] & 1285 SPEC_CTRL_DUMMY_ENABLE) { 1286 wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[0] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP)); 1287 } 1288 stat->halt++; 1289 ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask, gd->gd_cpuid); 1290 if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs, 1291 gd->gd_cpuid)) { 1292 cpu_invltlb(); 1293 cpu_mfence(); 1294 } 1295 crit_exit_gd(gd); 1296 break; 1297 } 1298 /* FALLTHROUGH */ 1299 case 4: 1300 /* 1301 * Use HLT 1302 */ 1303 __asm __volatile("cli"); 1304 splz(); 1305 crit_enter_gd(gd); 1306 if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) { 1307 ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask, 1308 gd->gd_cpuid); 1309 if (pscpu->trampoline.tr_pcb_spec_ctrl[1] & 1310 SPEC_CTRL_DUMMY_ENABLE) { 1311 wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[1] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP)); 1312 } 1313 cpu_idle_default_hook(); 1314 if (pscpu->trampoline.tr_pcb_spec_ctrl[0] & 1315 SPEC_CTRL_DUMMY_ENABLE) { 1316 wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[0] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP)); 1317 } 1318 ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask, 1319 gd->gd_cpuid); 1320 if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs, 1321 gd->gd_cpuid)) { 1322 cpu_invltlb(); 1323 cpu_mfence(); 1324 } 1325 } 1326 __asm __volatile("sti"); 1327 stat->halt++; 1328 crit_exit_gd(gd); 1329 break; 1330 case 3: 1331 /* 1332 * Use ACPI halt 1333 */ 1334 ; 1335 do_acpi: 1336 __asm __volatile("cli"); 1337 splz(); 1338 crit_enter_gd(gd); 1339 if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) { 1340 ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask, 1341 gd->gd_cpuid); 1342 if (pscpu->trampoline.tr_pcb_spec_ctrl[1] & 1343 SPEC_CTRL_DUMMY_ENABLE) { 1344 wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[1] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP)); 1345 } 1346 cpu_idle_hook(); 1347 if (pscpu->trampoline.tr_pcb_spec_ctrl[0] & 1348 SPEC_CTRL_DUMMY_ENABLE) { 1349 wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[0] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP)); 1350 } 1351 ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask, 1352 gd->gd_cpuid); 1353 if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs, 1354 gd->gd_cpuid)) { 1355 cpu_invltlb(); 1356 cpu_mfence(); 1357 } 1358 } 1359 __asm __volatile("sti"); 1360 stat->halt++; 1361 
crit_exit_gd(gd); 1362 break; 1363 } 1364 } 1365 } 1366 1367 /* 1368 * Called from deep ACPI via cpu_idle_hook() (see above) to actually halt 1369 * the cpu in C1. ACPI might use other halt methods for deeper states 1370 * and not reach here. 1371 * 1372 * For now we always use HLT as we are not sure what ACPI may have actually 1373 * done. MONITOR/MWAIT might not be appropriate. 1374 * 1375 * NOTE: MONITOR/MWAIT does not appear to throttle AMD cpus, while HLT 1376 * does. On Intel, MONITOR/MWAIT does appear to throttle the cpu. 1377 */ 1378 void 1379 cpu_idle_halt(void) 1380 { 1381 globaldata_t gd; 1382 1383 gd = mycpu; 1384 #if 0 1385 /* DISABLED FOR NOW */ 1386 struct cpu_idle_stat *stat; 1387 int reqflags; 1388 1389 1390 if ((cpu_idle_hlt == 1 || cpu_idle_hlt == 2) && 1391 (cpu_mi_feature & CPU_MI_MONITOR) && 1392 cpu_vendor_id != CPU_VENDOR_AMD) { 1393 /* 1394 * Use MONITOR/MWAIT 1395 * 1396 * (NOTE: On ryzen, MWAIT does not throttle clocks, so we 1397 * have to use HLT) 1398 */ 1399 stat = &cpu_idle_stats[gd->gd_cpuid]; 1400 reqflags = gd->gd_reqflags; 1401 if ((reqflags & RQF_IDLECHECK_WK_MASK) == 0) { 1402 __asm __volatile("sti"); 1403 cpu_mmw_pause_int(&gd->gd_reqflags, reqflags, 1404 cpu_mwait_cx_hint(stat), 0); 1405 } else { 1406 __asm __volatile("sti; pause"); 1407 } 1408 } else 1409 #endif 1410 { 1411 /* 1412 * Use HLT 1413 */ 1414 if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) 1415 __asm __volatile("sti; hlt"); 1416 else 1417 __asm __volatile("sti; pause"); 1418 } 1419 } 1420 1421 1422 /* 1423 * Called in a loop indirectly via Xcpustop 1424 */ 1425 void 1426 cpu_smp_stopped(void) 1427 { 1428 globaldata_t gd = mycpu; 1429 volatile __uint64_t *ptr; 1430 __uint64_t ovalue; 1431 1432 ptr = CPUMASK_ADDR(started_cpus, gd->gd_cpuid); 1433 ovalue = *ptr; 1434 if ((ovalue & CPUMASK_SIMPLE(gd->gd_cpuid & 63)) == 0) { 1435 if (cpu_mi_feature & CPU_MI_MONITOR) { 1436 if (cpu_mwait_hints) { 1437 cpu_mmw_pause_long(__DEVOLATILE(void *, ptr), 1438 ovalue, 1439 cpu_mwait_hints[ 1440 cpu_mwait_hints_cnt - 1], 0); 1441 } else { 1442 cpu_mmw_pause_long(__DEVOLATILE(void *, ptr), 1443 ovalue, 0, 0); 1444 } 1445 } else { 1446 cpu_halt(); /* depend on lapic timer */ 1447 } 1448 } 1449 } 1450 1451 /* 1452 * This routine is called if a spinlock has been held through the 1453 * exponential backoff period and is seriously contested. On a real cpu 1454 * we let it spin. 1455 */ 1456 void 1457 cpu_spinlock_contested(void) 1458 { 1459 cpu_pause(); 1460 } 1461 1462 /* 1463 * Clear registers on exec 1464 */ 1465 void 1466 exec_setregs(u_long entry, u_long stack, u_long ps_strings) 1467 { 1468 struct thread *td = curthread; 1469 struct lwp *lp = td->td_lwp; 1470 struct pcb *pcb = td->td_pcb; 1471 struct trapframe *regs = lp->lwp_md.md_regs; 1472 1473 user_ldt_free(pcb); 1474 1475 clear_quickret(); 1476 bzero((char *)regs, sizeof(struct trapframe)); 1477 regs->tf_rip = entry; 1478 regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; /* align the stack */ 1479 regs->tf_rdi = stack; /* argv */ 1480 regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); 1481 regs->tf_ss = _udatasel; 1482 regs->tf_cs = _ucodesel; 1483 regs->tf_rbx = ps_strings; 1484 1485 /* 1486 * Reset the hardware debug registers if they were in use. 1487 * They won't have any meaning for the newly exec'd process. 1488 */ 1489 if (pcb->pcb_flags & PCB_DBREGS) { 1490 pcb->pcb_dr0 = 0; 1491 pcb->pcb_dr1 = 0; 1492 pcb->pcb_dr2 = 0; 1493 pcb->pcb_dr3 = 0; 1494 pcb->pcb_dr6 = 0; 1495 pcb->pcb_dr7 = 0; /* JG set bit 10? 
*/ 1496 if (pcb == td->td_pcb) { 1497 /* 1498 * Clear the debug registers on the running 1499 * CPU, otherwise they will end up affecting 1500 * the next process we switch to. 1501 */ 1502 reset_dbregs(); 1503 } 1504 pcb->pcb_flags &= ~PCB_DBREGS; 1505 } 1506 1507 /* 1508 * Initialize the math emulator (if any) for the current process. 1509 * Actually, just clear the bit that says that the emulator has 1510 * been initialized. Initialization is delayed until the process 1511 * traps to the emulator (if it is done at all) mainly because 1512 * emulators don't provide an entry point for initialization. 1513 */ 1514 pcb->pcb_flags &= ~FP_SOFTFP; 1515 1516 /* 1517 * NOTE: do not set CR0_TS here. npxinit() must do it after clearing 1518 * gd_npxthread. Otherwise a preemptive interrupt thread 1519 * may panic in npxdna(). 1520 */ 1521 crit_enter(); 1522 load_cr0(rcr0() | CR0_MP); 1523 1524 /* 1525 * NOTE: The MSR values must be correct so we can return to 1526 * userland. gd_user_fs/gs must be correct so the switch 1527 * code knows what the current MSR values are. 1528 */ 1529 pcb->pcb_fsbase = 0; /* Values loaded from PCB on switch */ 1530 pcb->pcb_gsbase = 0; 1531 mdcpu->gd_user_fs = 0; /* Cache of current MSR values */ 1532 mdcpu->gd_user_gs = 0; 1533 wrmsr(MSR_FSBASE, 0); /* Set MSR values for return to userland */ 1534 wrmsr(MSR_KGSBASE, 0); 1535 1536 /* Initialize the npx (if any) for the current process. */ 1537 npxinit(); 1538 crit_exit(); 1539 1540 pcb->pcb_ds = _udatasel; 1541 pcb->pcb_es = _udatasel; 1542 pcb->pcb_fs = _udatasel; 1543 pcb->pcb_gs = _udatasel; 1544 } 1545 1546 void 1547 cpu_setregs(void) 1548 { 1549 register_t cr0; 1550 1551 cr0 = rcr0(); 1552 cr0 |= CR0_NE; /* Done by npxinit() */ 1553 cr0 |= CR0_MP | CR0_TS; /* Done at every execve() too. 
*/ 1554 cr0 |= CR0_WP | CR0_AM; 1555 load_cr0(cr0); 1556 load_gs(_udatasel); 1557 } 1558 1559 static int 1560 sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS) 1561 { 1562 int error; 1563 error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, 1564 req); 1565 if (!error && req->newptr) 1566 resettodr(); 1567 return (error); 1568 } 1569 1570 SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, 1571 &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); 1572 1573 SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set, 1574 CTLFLAG_RW, &disable_rtc_set, 0, ""); 1575 1576 #if 0 /* JG */ 1577 SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo, 1578 CTLFLAG_RD, &bootinfo, bootinfo, ""); 1579 #endif 1580 1581 SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock, 1582 CTLFLAG_RW, &wall_cmos_clock, 0, ""); 1583 1584 static int 1585 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS) 1586 { 1587 struct efi_map_header *efihdr; 1588 caddr_t kmdp; 1589 uint32_t efisize; 1590 1591 kmdp = preload_search_by_type("elf kernel"); 1592 if (kmdp == NULL) 1593 kmdp = preload_search_by_type("elf64 kernel"); 1594 efihdr = (struct efi_map_header *)preload_search_info(kmdp, 1595 MODINFO_METADATA | MODINFOMD_EFI_MAP); 1596 if (efihdr == NULL) 1597 return (0); 1598 efisize = *((uint32_t *)efihdr - 1); 1599 return (SYSCTL_OUT(req, efihdr, efisize)); 1600 } 1601 SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0, 1602 efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map"); 1603 1604 /* 1605 * Initialize x86 and configure to run kernel 1606 */ 1607 1608 /* 1609 * Initialize segments & interrupt table 1610 */ 1611 1612 int _default_ldt; 1613 struct user_segment_descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ 1614 struct gate_descriptor idt_arr[MAXCPU][NIDT]; 1615 #if 0 /* JG */ 1616 union descriptor ldt[NLDT]; /* local descriptor table */ 1617 #endif 1618 1619 /* table descriptors - used to load tables by cpu */ 1620 struct region_descriptor r_gdt; 1621 struct region_descriptor r_idt_arr[MAXCPU]; 1622 1623 /* JG proc0paddr is a virtual address */ 1624 void *proc0paddr; 1625 /* JG alignment? 
*/ 1626 char proc0paddr_buff[LWKT_THREAD_STACK]; 1627 1628 1629 /* software prototypes -- in more palatable form */ 1630 struct soft_segment_descriptor gdt_segs[] = { 1631 /* GNULL_SEL 0 Null Descriptor */ 1632 { 0x0, /* segment base address */ 1633 0x0, /* length */ 1634 0, /* segment type */ 1635 0, /* segment descriptor priority level */ 1636 0, /* segment descriptor present */ 1637 0, /* long */ 1638 0, /* default 32 vs 16 bit size */ 1639 0 /* limit granularity (byte/page units)*/ }, 1640 /* GCODE_SEL 1 Code Descriptor for kernel */ 1641 { 0x0, /* segment base address */ 1642 0xfffff, /* length - all address space */ 1643 SDT_MEMERA, /* segment type */ 1644 SEL_KPL, /* segment descriptor priority level */ 1645 1, /* segment descriptor present */ 1646 1, /* long */ 1647 0, /* default 32 vs 16 bit size */ 1648 1 /* limit granularity (byte/page units)*/ }, 1649 /* GDATA_SEL 2 Data Descriptor for kernel */ 1650 { 0x0, /* segment base address */ 1651 0xfffff, /* length - all address space */ 1652 SDT_MEMRWA, /* segment type */ 1653 SEL_KPL, /* segment descriptor priority level */ 1654 1, /* segment descriptor present */ 1655 1, /* long */ 1656 0, /* default 32 vs 16 bit size */ 1657 1 /* limit granularity (byte/page units)*/ }, 1658 /* GUCODE32_SEL 3 32 bit Code Descriptor for user */ 1659 { 0x0, /* segment base address */ 1660 0xfffff, /* length - all address space */ 1661 SDT_MEMERA, /* segment type */ 1662 SEL_UPL, /* segment descriptor priority level */ 1663 1, /* segment descriptor present */ 1664 0, /* long */ 1665 1, /* default 32 vs 16 bit size */ 1666 1 /* limit granularity (byte/page units)*/ }, 1667 /* GUDATA_SEL 4 32/64 bit Data Descriptor for user */ 1668 { 0x0, /* segment base address */ 1669 0xfffff, /* length - all address space */ 1670 SDT_MEMRWA, /* segment type */ 1671 SEL_UPL, /* segment descriptor priority level */ 1672 1, /* segment descriptor present */ 1673 0, /* long */ 1674 1, /* default 32 vs 16 bit size */ 1675 1 /* limit granularity (byte/page units)*/ }, 1676 /* GUCODE_SEL 5 64 bit Code Descriptor for user */ 1677 { 0x0, /* segment base address */ 1678 0xfffff, /* length - all address space */ 1679 SDT_MEMERA, /* segment type */ 1680 SEL_UPL, /* segment descriptor priority level */ 1681 1, /* segment descriptor present */ 1682 1, /* long */ 1683 0, /* default 32 vs 16 bit size */ 1684 1 /* limit granularity (byte/page units)*/ }, 1685 /* GPROC0_SEL 6 Proc 0 Tss Descriptor */ 1686 { 1687 0x0, /* segment base address */ 1688 sizeof(struct x86_64tss)-1,/* length - all address space */ 1689 SDT_SYSTSS, /* segment type */ 1690 SEL_KPL, /* segment descriptor priority level */ 1691 1, /* segment descriptor present */ 1692 0, /* long */ 1693 0, /* unused - default 32 vs 16 bit size */ 1694 0 /* limit granularity (byte/page units)*/ }, 1695 /* Actually, the TSS is a system descriptor which is double size */ 1696 { 0x0, /* segment base address */ 1697 0x0, /* length */ 1698 0, /* segment type */ 1699 0, /* segment descriptor priority level */ 1700 0, /* segment descriptor present */ 1701 0, /* long */ 1702 0, /* default 32 vs 16 bit size */ 1703 0 /* limit granularity (byte/page units)*/ }, 1704 /* GUGS32_SEL 8 32 bit GS Descriptor for user */ 1705 { 0x0, /* segment base address */ 1706 0xfffff, /* length - all address space */ 1707 SDT_MEMRWA, /* segment type */ 1708 SEL_UPL, /* segment descriptor priority level */ 1709 1, /* segment descriptor present */ 1710 0, /* long */ 1711 1, /* default 32 vs 16 bit size */ 1712 1 /* limit granularity (byte/page units)*/ }, 1713 
}; 1714 1715 void 1716 setidt_global(int idx, inthand_t *func, int typ, int dpl, int ist) 1717 { 1718 int cpu; 1719 1720 for (cpu = 0; cpu < MAXCPU; ++cpu) { 1721 struct gate_descriptor *ip = &idt_arr[cpu][idx]; 1722 1723 ip->gd_looffset = (uintptr_t)func; 1724 ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL); 1725 ip->gd_ist = ist; 1726 ip->gd_xx = 0; 1727 ip->gd_type = typ; 1728 ip->gd_dpl = dpl; 1729 ip->gd_p = 1; 1730 ip->gd_hioffset = ((uintptr_t)func)>>16 ; 1731 } 1732 } 1733 1734 void 1735 setidt(int idx, inthand_t *func, int typ, int dpl, int ist, int cpu) 1736 { 1737 struct gate_descriptor *ip; 1738 1739 KASSERT(cpu >= 0 && cpu < ncpus, ("invalid cpu %d", cpu)); 1740 1741 ip = &idt_arr[cpu][idx]; 1742 ip->gd_looffset = (uintptr_t)func; 1743 ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL); 1744 ip->gd_ist = ist; 1745 ip->gd_xx = 0; 1746 ip->gd_type = typ; 1747 ip->gd_dpl = dpl; 1748 ip->gd_p = 1; 1749 ip->gd_hioffset = ((uintptr_t)func)>>16 ; 1750 } 1751 1752 #define IDTVEC(name) __CONCAT(X,name) 1753 1754 extern inthand_t 1755 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), 1756 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), 1757 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), 1758 IDTVEC(page), IDTVEC(mchk), IDTVEC(fpu), IDTVEC(align), 1759 IDTVEC(xmm), IDTVEC(dblfault), 1760 IDTVEC(fast_syscall), IDTVEC(fast_syscall32); 1761 1762 extern inthand_t 1763 IDTVEC(rsvd00), IDTVEC(rsvd01), IDTVEC(rsvd02), IDTVEC(rsvd03), 1764 IDTVEC(rsvd04), IDTVEC(rsvd05), IDTVEC(rsvd06), IDTVEC(rsvd07), 1765 IDTVEC(rsvd08), IDTVEC(rsvd09), IDTVEC(rsvd0a), IDTVEC(rsvd0b), 1766 IDTVEC(rsvd0c), IDTVEC(rsvd0d), IDTVEC(rsvd0e), IDTVEC(rsvd0f), 1767 IDTVEC(rsvd10), IDTVEC(rsvd11), IDTVEC(rsvd12), IDTVEC(rsvd13), 1768 IDTVEC(rsvd14), IDTVEC(rsvd15), IDTVEC(rsvd16), IDTVEC(rsvd17), 1769 IDTVEC(rsvd18), IDTVEC(rsvd19), IDTVEC(rsvd1a), IDTVEC(rsvd1b), 1770 IDTVEC(rsvd1c), IDTVEC(rsvd1d), IDTVEC(rsvd1e), IDTVEC(rsvd1f), 1771 IDTVEC(rsvd20), IDTVEC(rsvd21), IDTVEC(rsvd22), IDTVEC(rsvd23), 1772 IDTVEC(rsvd24), IDTVEC(rsvd25), IDTVEC(rsvd26), IDTVEC(rsvd27), 1773 IDTVEC(rsvd28), IDTVEC(rsvd29), IDTVEC(rsvd2a), IDTVEC(rsvd2b), 1774 IDTVEC(rsvd2c), IDTVEC(rsvd2d), IDTVEC(rsvd2e), IDTVEC(rsvd2f), 1775 IDTVEC(rsvd30), IDTVEC(rsvd31), IDTVEC(rsvd32), IDTVEC(rsvd33), 1776 IDTVEC(rsvd34), IDTVEC(rsvd35), IDTVEC(rsvd36), IDTVEC(rsvd37), 1777 IDTVEC(rsvd38), IDTVEC(rsvd39), IDTVEC(rsvd3a), IDTVEC(rsvd3b), 1778 IDTVEC(rsvd3c), IDTVEC(rsvd3d), IDTVEC(rsvd3e), IDTVEC(rsvd3f), 1779 IDTVEC(rsvd40), IDTVEC(rsvd41), IDTVEC(rsvd42), IDTVEC(rsvd43), 1780 IDTVEC(rsvd44), IDTVEC(rsvd45), IDTVEC(rsvd46), IDTVEC(rsvd47), 1781 IDTVEC(rsvd48), IDTVEC(rsvd49), IDTVEC(rsvd4a), IDTVEC(rsvd4b), 1782 IDTVEC(rsvd4c), IDTVEC(rsvd4d), IDTVEC(rsvd4e), IDTVEC(rsvd4f), 1783 IDTVEC(rsvd50), IDTVEC(rsvd51), IDTVEC(rsvd52), IDTVEC(rsvd53), 1784 IDTVEC(rsvd54), IDTVEC(rsvd55), IDTVEC(rsvd56), IDTVEC(rsvd57), 1785 IDTVEC(rsvd58), IDTVEC(rsvd59), IDTVEC(rsvd5a), IDTVEC(rsvd5b), 1786 IDTVEC(rsvd5c), IDTVEC(rsvd5d), IDTVEC(rsvd5e), IDTVEC(rsvd5f), 1787 IDTVEC(rsvd60), IDTVEC(rsvd61), IDTVEC(rsvd62), IDTVEC(rsvd63), 1788 IDTVEC(rsvd64), IDTVEC(rsvd65), IDTVEC(rsvd66), IDTVEC(rsvd67), 1789 IDTVEC(rsvd68), IDTVEC(rsvd69), IDTVEC(rsvd6a), IDTVEC(rsvd6b), 1790 IDTVEC(rsvd6c), IDTVEC(rsvd6d), IDTVEC(rsvd6e), IDTVEC(rsvd6f), 1791 IDTVEC(rsvd70), IDTVEC(rsvd71), IDTVEC(rsvd72), IDTVEC(rsvd73), 1792 IDTVEC(rsvd74), IDTVEC(rsvd75), IDTVEC(rsvd76), IDTVEC(rsvd77), 1793 IDTVEC(rsvd78), IDTVEC(rsvd79), IDTVEC(rsvd7a), IDTVEC(rsvd7b), 1794 
IDTVEC(rsvd7c), IDTVEC(rsvd7d), IDTVEC(rsvd7e), IDTVEC(rsvd7f), 1795 IDTVEC(rsvd80), IDTVEC(rsvd81), IDTVEC(rsvd82), IDTVEC(rsvd83), 1796 IDTVEC(rsvd84), IDTVEC(rsvd85), IDTVEC(rsvd86), IDTVEC(rsvd87), 1797 IDTVEC(rsvd88), IDTVEC(rsvd89), IDTVEC(rsvd8a), IDTVEC(rsvd8b), 1798 IDTVEC(rsvd8c), IDTVEC(rsvd8d), IDTVEC(rsvd8e), IDTVEC(rsvd8f), 1799 IDTVEC(rsvd90), IDTVEC(rsvd91), IDTVEC(rsvd92), IDTVEC(rsvd93), 1800 IDTVEC(rsvd94), IDTVEC(rsvd95), IDTVEC(rsvd96), IDTVEC(rsvd97), 1801 IDTVEC(rsvd98), IDTVEC(rsvd99), IDTVEC(rsvd9a), IDTVEC(rsvd9b), 1802 IDTVEC(rsvd9c), IDTVEC(rsvd9d), IDTVEC(rsvd9e), IDTVEC(rsvd9f), 1803 IDTVEC(rsvda0), IDTVEC(rsvda1), IDTVEC(rsvda2), IDTVEC(rsvda3), 1804 IDTVEC(rsvda4), IDTVEC(rsvda5), IDTVEC(rsvda6), IDTVEC(rsvda7), 1805 IDTVEC(rsvda8), IDTVEC(rsvda9), IDTVEC(rsvdaa), IDTVEC(rsvdab), 1806 IDTVEC(rsvdac), IDTVEC(rsvdad), IDTVEC(rsvdae), IDTVEC(rsvdaf), 1807 IDTVEC(rsvdb0), IDTVEC(rsvdb1), IDTVEC(rsvdb2), IDTVEC(rsvdb3), 1808 IDTVEC(rsvdb4), IDTVEC(rsvdb5), IDTVEC(rsvdb6), IDTVEC(rsvdb7), 1809 IDTVEC(rsvdb8), IDTVEC(rsvdb9), IDTVEC(rsvdba), IDTVEC(rsvdbb), 1810 IDTVEC(rsvdbc), IDTVEC(rsvdbd), IDTVEC(rsvdbe), IDTVEC(rsvdbf), 1811 IDTVEC(rsvdc0), IDTVEC(rsvdc1), IDTVEC(rsvdc2), IDTVEC(rsvdc3), 1812 IDTVEC(rsvdc4), IDTVEC(rsvdc5), IDTVEC(rsvdc6), IDTVEC(rsvdc7), 1813 IDTVEC(rsvdc8), IDTVEC(rsvdc9), IDTVEC(rsvdca), IDTVEC(rsvdcb), 1814 IDTVEC(rsvdcc), IDTVEC(rsvdcd), IDTVEC(rsvdce), IDTVEC(rsvdcf), 1815 IDTVEC(rsvdd0), IDTVEC(rsvdd1), IDTVEC(rsvdd2), IDTVEC(rsvdd3), 1816 IDTVEC(rsvdd4), IDTVEC(rsvdd5), IDTVEC(rsvdd6), IDTVEC(rsvdd7), 1817 IDTVEC(rsvdd8), IDTVEC(rsvdd9), IDTVEC(rsvdda), IDTVEC(rsvddb), 1818 IDTVEC(rsvddc), IDTVEC(rsvddd), IDTVEC(rsvdde), IDTVEC(rsvddf), 1819 IDTVEC(rsvde0), IDTVEC(rsvde1), IDTVEC(rsvde2), IDTVEC(rsvde3), 1820 IDTVEC(rsvde4), IDTVEC(rsvde5), IDTVEC(rsvde6), IDTVEC(rsvde7), 1821 IDTVEC(rsvde8), IDTVEC(rsvde9), IDTVEC(rsvdea), IDTVEC(rsvdeb), 1822 IDTVEC(rsvdec), IDTVEC(rsvded), IDTVEC(rsvdee), IDTVEC(rsvdef), 1823 IDTVEC(rsvdf0), IDTVEC(rsvdf1), IDTVEC(rsvdf2), IDTVEC(rsvdf3), 1824 IDTVEC(rsvdf4), IDTVEC(rsvdf5), IDTVEC(rsvdf6), IDTVEC(rsvdf7), 1825 IDTVEC(rsvdf8), IDTVEC(rsvdf9), IDTVEC(rsvdfa), IDTVEC(rsvdfb), 1826 IDTVEC(rsvdfc), IDTVEC(rsvdfd), IDTVEC(rsvdfe), IDTVEC(rsvdff); 1827 1828 inthand_t *rsvdary[NIDT] = { 1829 &IDTVEC(rsvd00), &IDTVEC(rsvd01), &IDTVEC(rsvd02), &IDTVEC(rsvd03), 1830 &IDTVEC(rsvd04), &IDTVEC(rsvd05), &IDTVEC(rsvd06), &IDTVEC(rsvd07), 1831 &IDTVEC(rsvd08), &IDTVEC(rsvd09), &IDTVEC(rsvd0a), &IDTVEC(rsvd0b), 1832 &IDTVEC(rsvd0c), &IDTVEC(rsvd0d), &IDTVEC(rsvd0e), &IDTVEC(rsvd0f), 1833 &IDTVEC(rsvd10), &IDTVEC(rsvd11), &IDTVEC(rsvd12), &IDTVEC(rsvd13), 1834 &IDTVEC(rsvd14), &IDTVEC(rsvd15), &IDTVEC(rsvd16), &IDTVEC(rsvd17), 1835 &IDTVEC(rsvd18), &IDTVEC(rsvd19), &IDTVEC(rsvd1a), &IDTVEC(rsvd1b), 1836 &IDTVEC(rsvd1c), &IDTVEC(rsvd1d), &IDTVEC(rsvd1e), &IDTVEC(rsvd1f), 1837 &IDTVEC(rsvd20), &IDTVEC(rsvd21), &IDTVEC(rsvd22), &IDTVEC(rsvd23), 1838 &IDTVEC(rsvd24), &IDTVEC(rsvd25), &IDTVEC(rsvd26), &IDTVEC(rsvd27), 1839 &IDTVEC(rsvd28), &IDTVEC(rsvd29), &IDTVEC(rsvd2a), &IDTVEC(rsvd2b), 1840 &IDTVEC(rsvd2c), &IDTVEC(rsvd2d), &IDTVEC(rsvd2e), &IDTVEC(rsvd2f), 1841 &IDTVEC(rsvd30), &IDTVEC(rsvd31), &IDTVEC(rsvd32), &IDTVEC(rsvd33), 1842 &IDTVEC(rsvd34), &IDTVEC(rsvd35), &IDTVEC(rsvd36), &IDTVEC(rsvd37), 1843 &IDTVEC(rsvd38), &IDTVEC(rsvd39), &IDTVEC(rsvd3a), &IDTVEC(rsvd3b), 1844 &IDTVEC(rsvd3c), &IDTVEC(rsvd3d), &IDTVEC(rsvd3e), &IDTVEC(rsvd3f), 1845 &IDTVEC(rsvd40), &IDTVEC(rsvd41), &IDTVEC(rsvd42), &IDTVEC(rsvd43), 
1846 &IDTVEC(rsvd44), &IDTVEC(rsvd45), &IDTVEC(rsvd46), &IDTVEC(rsvd47), 1847 &IDTVEC(rsvd48), &IDTVEC(rsvd49), &IDTVEC(rsvd4a), &IDTVEC(rsvd4b), 1848 &IDTVEC(rsvd4c), &IDTVEC(rsvd4d), &IDTVEC(rsvd4e), &IDTVEC(rsvd4f), 1849 &IDTVEC(rsvd50), &IDTVEC(rsvd51), &IDTVEC(rsvd52), &IDTVEC(rsvd53), 1850 &IDTVEC(rsvd54), &IDTVEC(rsvd55), &IDTVEC(rsvd56), &IDTVEC(rsvd57), 1851 &IDTVEC(rsvd58), &IDTVEC(rsvd59), &IDTVEC(rsvd5a), &IDTVEC(rsvd5b), 1852 &IDTVEC(rsvd5c), &IDTVEC(rsvd5d), &IDTVEC(rsvd5e), &IDTVEC(rsvd5f), 1853 &IDTVEC(rsvd60), &IDTVEC(rsvd61), &IDTVEC(rsvd62), &IDTVEC(rsvd63), 1854 &IDTVEC(rsvd64), &IDTVEC(rsvd65), &IDTVEC(rsvd66), &IDTVEC(rsvd67), 1855 &IDTVEC(rsvd68), &IDTVEC(rsvd69), &IDTVEC(rsvd6a), &IDTVEC(rsvd6b), 1856 &IDTVEC(rsvd6c), &IDTVEC(rsvd6d), &IDTVEC(rsvd6e), &IDTVEC(rsvd6f), 1857 &IDTVEC(rsvd70), &IDTVEC(rsvd71), &IDTVEC(rsvd72), &IDTVEC(rsvd73), 1858 &IDTVEC(rsvd74), &IDTVEC(rsvd75), &IDTVEC(rsvd76), &IDTVEC(rsvd77), 1859 &IDTVEC(rsvd78), &IDTVEC(rsvd79), &IDTVEC(rsvd7a), &IDTVEC(rsvd7b), 1860 &IDTVEC(rsvd7c), &IDTVEC(rsvd7d), &IDTVEC(rsvd7e), &IDTVEC(rsvd7f), 1861 &IDTVEC(rsvd80), &IDTVEC(rsvd81), &IDTVEC(rsvd82), &IDTVEC(rsvd83), 1862 &IDTVEC(rsvd84), &IDTVEC(rsvd85), &IDTVEC(rsvd86), &IDTVEC(rsvd87), 1863 &IDTVEC(rsvd88), &IDTVEC(rsvd89), &IDTVEC(rsvd8a), &IDTVEC(rsvd8b), 1864 &IDTVEC(rsvd8c), &IDTVEC(rsvd8d), &IDTVEC(rsvd8e), &IDTVEC(rsvd8f), 1865 &IDTVEC(rsvd90), &IDTVEC(rsvd91), &IDTVEC(rsvd92), &IDTVEC(rsvd93), 1866 &IDTVEC(rsvd94), &IDTVEC(rsvd95), &IDTVEC(rsvd96), &IDTVEC(rsvd97), 1867 &IDTVEC(rsvd98), &IDTVEC(rsvd99), &IDTVEC(rsvd9a), &IDTVEC(rsvd9b), 1868 &IDTVEC(rsvd9c), &IDTVEC(rsvd9d), &IDTVEC(rsvd9e), &IDTVEC(rsvd9f), 1869 &IDTVEC(rsvda0), &IDTVEC(rsvda1), &IDTVEC(rsvda2), &IDTVEC(rsvda3), 1870 &IDTVEC(rsvda4), &IDTVEC(rsvda5), &IDTVEC(rsvda6), &IDTVEC(rsvda7), 1871 &IDTVEC(rsvda8), &IDTVEC(rsvda9), &IDTVEC(rsvdaa), &IDTVEC(rsvdab), 1872 &IDTVEC(rsvdac), &IDTVEC(rsvdad), &IDTVEC(rsvdae), &IDTVEC(rsvdaf), 1873 &IDTVEC(rsvdb0), &IDTVEC(rsvdb1), &IDTVEC(rsvdb2), &IDTVEC(rsvdb3), 1874 &IDTVEC(rsvdb4), &IDTVEC(rsvdb5), &IDTVEC(rsvdb6), &IDTVEC(rsvdb7), 1875 &IDTVEC(rsvdb8), &IDTVEC(rsvdb9), &IDTVEC(rsvdba), &IDTVEC(rsvdbb), 1876 &IDTVEC(rsvdbc), &IDTVEC(rsvdbd), &IDTVEC(rsvdbe), &IDTVEC(rsvdbf), 1877 &IDTVEC(rsvdc0), &IDTVEC(rsvdc1), &IDTVEC(rsvdc2), &IDTVEC(rsvdc3), 1878 &IDTVEC(rsvdc4), &IDTVEC(rsvdc5), &IDTVEC(rsvdc6), &IDTVEC(rsvdc7), 1879 &IDTVEC(rsvdc8), &IDTVEC(rsvdc9), &IDTVEC(rsvdca), &IDTVEC(rsvdcb), 1880 &IDTVEC(rsvdcc), &IDTVEC(rsvdcd), &IDTVEC(rsvdce), &IDTVEC(rsvdcf), 1881 &IDTVEC(rsvdd0), &IDTVEC(rsvdd1), &IDTVEC(rsvdd2), &IDTVEC(rsvdd3), 1882 &IDTVEC(rsvdd4), &IDTVEC(rsvdd5), &IDTVEC(rsvdd6), &IDTVEC(rsvdd7), 1883 &IDTVEC(rsvdd8), &IDTVEC(rsvdd9), &IDTVEC(rsvdda), &IDTVEC(rsvddb), 1884 &IDTVEC(rsvddc), &IDTVEC(rsvddd), &IDTVEC(rsvdde), &IDTVEC(rsvddf), 1885 &IDTVEC(rsvde0), &IDTVEC(rsvde1), &IDTVEC(rsvde2), &IDTVEC(rsvde3), 1886 &IDTVEC(rsvde4), &IDTVEC(rsvde5), &IDTVEC(rsvde6), &IDTVEC(rsvde7), 1887 &IDTVEC(rsvde8), &IDTVEC(rsvde9), &IDTVEC(rsvdea), &IDTVEC(rsvdeb), 1888 &IDTVEC(rsvdec), &IDTVEC(rsvded), &IDTVEC(rsvdee), &IDTVEC(rsvdef), 1889 &IDTVEC(rsvdf0), &IDTVEC(rsvdf1), &IDTVEC(rsvdf2), &IDTVEC(rsvdf3), 1890 &IDTVEC(rsvdf4), &IDTVEC(rsvdf5), &IDTVEC(rsvdf6), &IDTVEC(rsvdf7), 1891 &IDTVEC(rsvdf8), &IDTVEC(rsvdf9), &IDTVEC(rsvdfa), &IDTVEC(rsvdfb), 1892 &IDTVEC(rsvdfc), &IDTVEC(rsvdfd), &IDTVEC(rsvdfe), &IDTVEC(rsvdff) 1893 }; 1894 1895 void 1896 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd) 1897 { 1898 
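/*
 * Reassemble the segment base and limit: the hardware descriptor keeps
 * them split across the lobase/hibase and lolimit/hilimit bit-fields,
 * while the soft descriptor stores each as a single value.
 */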
ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; 1899 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; 1900 ssd->ssd_type = sd->sd_type; 1901 ssd->ssd_dpl = sd->sd_dpl; 1902 ssd->ssd_p = sd->sd_p; 1903 ssd->ssd_def32 = sd->sd_def32; 1904 ssd->ssd_gran = sd->sd_gran; 1905 } 1906 1907 void 1908 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd) 1909 { 1910 1911 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 1912 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff; 1913 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 1914 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 1915 sd->sd_type = ssd->ssd_type; 1916 sd->sd_dpl = ssd->ssd_dpl; 1917 sd->sd_p = ssd->ssd_p; 1918 sd->sd_long = ssd->ssd_long; 1919 sd->sd_def32 = ssd->ssd_def32; 1920 sd->sd_gran = ssd->ssd_gran; 1921 } 1922 1923 void 1924 ssdtosyssd(struct soft_segment_descriptor *ssd, 1925 struct system_segment_descriptor *sd) 1926 { 1927 1928 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 1929 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful; 1930 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 1931 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 1932 sd->sd_type = ssd->ssd_type; 1933 sd->sd_dpl = ssd->ssd_dpl; 1934 sd->sd_p = ssd->ssd_p; 1935 sd->sd_gran = ssd->ssd_gran; 1936 } 1937 1938 /* 1939 * Populate the (physmap) array with base/bound pairs describing the 1940 * available physical memory in the system, then test this memory and 1941 * build the phys_avail array describing the actually-available memory. 1942 * 1943 * If we cannot accurately determine the physical memory map, then use 1944 * value from the 0xE801 call, and failing that, the RTC. 1945 * 1946 * Total memory size may be set by the kernel environment variable 1947 * hw.physmem or the compile-time define MAXMEM. 1948 * 1949 * Memory is aligned to PHYSMAP_ALIGN which must be a multiple 1950 * of PAGE_SIZE. This also greatly reduces the memory test time 1951 * which would otherwise be excessive on machines with > 8G of ram. 1952 * 1953 * XXX first should be vm_paddr_t. 1954 */ 1955 1956 #define PHYSMAP_ALIGN (vm_paddr_t)(128 * 1024) 1957 #define PHYSMAP_ALIGN_MASK (vm_paddr_t)(PHYSMAP_ALIGN - 1) 1958 #define PHYSMAP_SIZE VM_PHYSSEG_MAX 1959 1960 vm_paddr_t physmap[PHYSMAP_SIZE]; 1961 struct bios_smap *smapbase, *smap, *smapend; 1962 struct efi_map_header *efihdrbase; 1963 u_int32_t smapsize; 1964 1965 #define PHYSMAP_HANDWAVE (vm_paddr_t)(2 * 1024 * 1024) 1966 #define PHYSMAP_HANDWAVE_MASK (PHYSMAP_HANDWAVE - 1) 1967 1968 static void 1969 add_smap_entries(int *physmap_idx) 1970 { 1971 int i; 1972 1973 smapsize = *((u_int32_t *)smapbase - 1); 1974 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); 1975 1976 for (smap = smapbase; smap < smapend; smap++) { 1977 if (boothowto & RB_VERBOSE) 1978 kprintf("SMAP type=%02x base=%016lx len=%016lx\n", 1979 smap->type, smap->base, smap->length); 1980 1981 if (smap->type != SMAP_TYPE_MEMORY) 1982 continue; 1983 1984 if (smap->length == 0) 1985 continue; 1986 1987 for (i = 0; i <= *physmap_idx; i += 2) { 1988 if (smap->base < physmap[i + 1]) { 1989 if (boothowto & RB_VERBOSE) { 1990 kprintf("Overlapping or non-monotonic " 1991 "memory region, ignoring " 1992 "second region\n"); 1993 } 1994 break; 1995 } 1996 } 1997 if (i <= *physmap_idx) 1998 continue; 1999 2000 Realmem += smap->length; 2001 2002 /* 2003 * NOTE: This little bit of code initially expands 2004 * physmap[1] as well as later entries. 
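 * A segment whose base exactly abuts the previous physmap[] range is
 * merged into that range rather than consuming a new physmap[] slot.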
2005 */ 2006 if (smap->base == physmap[*physmap_idx + 1]) { 2007 physmap[*physmap_idx + 1] += smap->length; 2008 continue; 2009 } 2010 2011 *physmap_idx += 2; 2012 if (*physmap_idx == PHYSMAP_SIZE) { 2013 kprintf("Too many segments in the physical " 2014 "address map, giving up\n"); 2015 break; 2016 } 2017 physmap[*physmap_idx] = smap->base; 2018 physmap[*physmap_idx + 1] = smap->base + smap->length; 2019 } 2020 } 2021 2022 static void 2023 add_efi_map_entries(int *physmap_idx) 2024 { 2025 struct efi_md *map, *p; 2026 const char *type; 2027 size_t efisz; 2028 int i, ndesc; 2029 2030 static const char *types[] = { 2031 "Reserved", 2032 "LoaderCode", 2033 "LoaderData", 2034 "BootServicesCode", 2035 "BootServicesData", 2036 "RuntimeServicesCode", 2037 "RuntimeServicesData", 2038 "ConventionalMemory", 2039 "UnusableMemory", 2040 "ACPIReclaimMemory", 2041 "ACPIMemoryNVS", 2042 "MemoryMappedIO", 2043 "MemoryMappedIOPortSpace", 2044 "PalCode" 2045 }; 2046 2047 /* 2048 * Memory map data provided by UEFI via the GetMemoryMap 2049 * Boot Services API. 2050 */ 2051 efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf; 2052 map = (struct efi_md *)((uint8_t *)efihdrbase + efisz); 2053 2054 if (efihdrbase->descriptor_size == 0) 2055 return; 2056 ndesc = efihdrbase->memory_size / efihdrbase->descriptor_size; 2057 2058 if (boothowto & RB_VERBOSE) 2059 kprintf("%23s %12s %12s %8s %4s\n", 2060 "Type", "Physical", "Virtual", "#Pages", "Attr"); 2061 2062 for (i = 0, p = map; i < ndesc; i++, 2063 p = efi_next_descriptor(p, efihdrbase->descriptor_size)) { 2064 if (boothowto & RB_VERBOSE) { 2065 if (p->md_type <= EFI_MD_TYPE_PALCODE) 2066 type = types[p->md_type]; 2067 else 2068 type = "<INVALID>"; 2069 kprintf("%23s %012lx %12p %08lx ", type, p->md_phys, 2070 p->md_virt, p->md_pages); 2071 if (p->md_attr & EFI_MD_ATTR_UC) 2072 kprintf("UC "); 2073 if (p->md_attr & EFI_MD_ATTR_WC) 2074 kprintf("WC "); 2075 if (p->md_attr & EFI_MD_ATTR_WT) 2076 kprintf("WT "); 2077 if (p->md_attr & EFI_MD_ATTR_WB) 2078 kprintf("WB "); 2079 if (p->md_attr & EFI_MD_ATTR_UCE) 2080 kprintf("UCE "); 2081 if (p->md_attr & EFI_MD_ATTR_WP) 2082 kprintf("WP "); 2083 if (p->md_attr & EFI_MD_ATTR_RP) 2084 kprintf("RP "); 2085 if (p->md_attr & EFI_MD_ATTR_XP) 2086 kprintf("XP "); 2087 if (p->md_attr & EFI_MD_ATTR_RT) 2088 kprintf("RUNTIME"); 2089 kprintf("\n"); 2090 } 2091 2092 switch (p->md_type) { 2093 case EFI_MD_TYPE_CODE: 2094 case EFI_MD_TYPE_DATA: 2095 case EFI_MD_TYPE_BS_CODE: 2096 case EFI_MD_TYPE_BS_DATA: 2097 case EFI_MD_TYPE_FREE: 2098 /* 2099 * We're allowed to use any entry with these types. 2100 */ 2101 break; 2102 default: 2103 continue; 2104 } 2105 2106 Realmem += p->md_pages * PAGE_SIZE; 2107 2108 /* 2109 * NOTE: This little bit of code initially expands 2110 * physmap[1] as well as later entries. 
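 * md_pages is in PAGE_SIZE units; a descriptor starting right at the
 * end of the previous physmap[] range simply extends that range.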
2111 */ 2112 if (p->md_phys == physmap[*physmap_idx + 1]) { 2113 physmap[*physmap_idx + 1] += p->md_pages * PAGE_SIZE; 2114 continue; 2115 } 2116 2117 *physmap_idx += 2; 2118 if (*physmap_idx == PHYSMAP_SIZE) { 2119 kprintf("Too many segments in the physical " 2120 "address map, giving up\n"); 2121 break; 2122 } 2123 physmap[*physmap_idx] = p->md_phys; 2124 physmap[*physmap_idx + 1] = p->md_phys + p->md_pages * PAGE_SIZE; 2125 } 2126 } 2127 2128 struct fb_info efi_fb_info; 2129 static int have_efi_framebuffer = 0; 2130 2131 static void 2132 efi_fb_init_vaddr(int direct_map) 2133 { 2134 uint64_t sz; 2135 vm_offset_t addr, v; 2136 2137 v = efi_fb_info.vaddr; 2138 sz = efi_fb_info.stride * efi_fb_info.height; 2139 2140 if (direct_map) { 2141 addr = PHYS_TO_DMAP(efi_fb_info.paddr); 2142 if (addr >= DMAP_MIN_ADDRESS && addr + sz <= DMapMaxAddress) 2143 efi_fb_info.vaddr = addr; 2144 } else { 2145 efi_fb_info.vaddr = 2146 (vm_offset_t)pmap_mapdev_attr(efi_fb_info.paddr, 2147 sz, 2148 PAT_WRITE_COMBINING); 2149 } 2150 } 2151 2152 static u_int 2153 efifb_color_depth(struct efi_fb *efifb) 2154 { 2155 uint32_t mask; 2156 u_int depth; 2157 2158 mask = efifb->fb_mask_red | efifb->fb_mask_green | 2159 efifb->fb_mask_blue | efifb->fb_mask_reserved; 2160 if (mask == 0) 2161 return (0); 2162 for (depth = 1; mask != 1; depth++) 2163 mask >>= 1; 2164 return (depth); 2165 } 2166 2167 int 2168 probe_efi_fb(int early) 2169 { 2170 struct efi_fb *efifb; 2171 caddr_t kmdp; 2172 u_int depth; 2173 2174 if (have_efi_framebuffer) { 2175 if (!early && 2176 (efi_fb_info.vaddr == 0 || 2177 efi_fb_info.vaddr == PHYS_TO_DMAP(efi_fb_info.paddr))) 2178 efi_fb_init_vaddr(0); 2179 return 0; 2180 } 2181 2182 kmdp = preload_search_by_type("elf kernel"); 2183 if (kmdp == NULL) 2184 kmdp = preload_search_by_type("elf64 kernel"); 2185 efifb = (struct efi_fb *)preload_search_info(kmdp, 2186 MODINFO_METADATA | MODINFOMD_EFI_FB); 2187 if (efifb == NULL) 2188 return 1; 2189 2190 depth = efifb_color_depth(efifb); 2191 /* 2192 * Our bootloader should already notice, when we won't be able to 2193 * use the UEFI framebuffer. 2194 */ 2195 if (depth != 24 && depth != 32) 2196 return 1; 2197 2198 have_efi_framebuffer = 1; 2199 2200 efi_fb_info.is_vga_boot_display = 1; 2201 efi_fb_info.width = efifb->fb_width; 2202 efi_fb_info.height = efifb->fb_height; 2203 efi_fb_info.depth = depth; 2204 efi_fb_info.stride = efifb->fb_stride * (depth / 8); 2205 efi_fb_info.paddr = efifb->fb_addr; 2206 if (early) { 2207 efi_fb_info.vaddr = 0; 2208 } else { 2209 efi_fb_init_vaddr(0); 2210 } 2211 efi_fb_info.fbops.fb_set_par = NULL; 2212 efi_fb_info.fbops.fb_blank = NULL; 2213 efi_fb_info.fbops.fb_debug_enter = NULL; 2214 efi_fb_info.device = NULL; 2215 2216 return 0; 2217 } 2218 2219 static void 2220 efifb_startup(void *arg) 2221 { 2222 probe_efi_fb(0); 2223 } 2224 2225 SYSINIT(efi_fb_info, SI_BOOT1_POST, SI_ORDER_FIRST, efifb_startup, NULL); 2226 2227 static void 2228 getmemsize(caddr_t kmdp, u_int64_t first) 2229 { 2230 int off, physmap_idx, pa_indx, da_indx; 2231 int i, j; 2232 vm_paddr_t pa; 2233 vm_paddr_t msgbuf_size; 2234 u_long physmem_tunable; 2235 pt_entry_t *pte; 2236 quad_t dcons_addr, dcons_size; 2237 2238 bzero(physmap, sizeof(physmap)); 2239 physmap_idx = 0; 2240 2241 /* 2242 * get memory map from INT 15:E820, kindly supplied by the loader. 2243 * 2244 * subr_module.c says: 2245 * "Consumer may safely assume that size value precedes data." 2246 * ie: an int32_t immediately precedes smap. 
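 * add_smap_entries() relies on this and reads the SMAP size from the
 * u_int32_t located immediately before smapbase.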
2247 */ 2248 efihdrbase = (struct efi_map_header *)preload_search_info(kmdp, 2249 MODINFO_METADATA | MODINFOMD_EFI_MAP); 2250 smapbase = (struct bios_smap *)preload_search_info(kmdp, 2251 MODINFO_METADATA | MODINFOMD_SMAP); 2252 if (smapbase == NULL && efihdrbase == NULL) 2253 panic("No BIOS smap or EFI map info from loader!"); 2254 2255 if (efihdrbase == NULL) 2256 add_smap_entries(&physmap_idx); 2257 else 2258 add_efi_map_entries(&physmap_idx); 2259 2260 base_memory = physmap[1] / 1024; 2261 /* make hole for AP bootstrap code */ 2262 physmap[1] = mp_bootaddress(base_memory); 2263 2264 /* Save EBDA address, if any */ 2265 ebda_addr = (u_long)(*(u_short *)(KERNBASE + 0x40e)); 2266 ebda_addr <<= 4; 2267 2268 /* 2269 * Maxmem isn't the "maximum memory", it's one larger than the 2270 * highest page of the physical address space. It should be 2271 * called something like "Maxphyspage". We may adjust this 2272 * based on ``hw.physmem'' and the results of the memory test. 2273 */ 2274 Maxmem = atop(physmap[physmap_idx + 1]); 2275 2276 #ifdef MAXMEM 2277 Maxmem = MAXMEM / 4; 2278 #endif 2279 2280 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) 2281 Maxmem = atop(physmem_tunable); 2282 2283 /* 2284 * Don't allow MAXMEM or hw.physmem to extend the amount of memory 2285 * in the system. 2286 */ 2287 if (Maxmem > atop(physmap[physmap_idx + 1])) 2288 Maxmem = atop(physmap[physmap_idx + 1]); 2289 2290 /* 2291 * Blowing out the DMAP will blow up the system. 2292 */ 2293 if (Maxmem > atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS)) { 2294 kprintf("Limiting Maxmem due to DMAP size\n"); 2295 Maxmem = atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS); 2296 } 2297 2298 if (atop(physmap[physmap_idx + 1]) != Maxmem && 2299 (boothowto & RB_VERBOSE)) { 2300 kprintf("Physical memory use set to %ldK\n", Maxmem * 4); 2301 } 2302 2303 /* 2304 * Call pmap initialization to make new kernel address space 2305 * 2306 * Mask off page 0. 2307 */ 2308 pmap_bootstrap(&first); 2309 physmap[0] = PAGE_SIZE; 2310 2311 /* 2312 * Align the physmap to PHYSMAP_ALIGN and cut out anything 2313 * exceeding Maxmem. 2314 */ 2315 for (i = j = 0; i <= physmap_idx; i += 2) { 2316 if (physmap[i+1] > ptoa(Maxmem)) 2317 physmap[i+1] = ptoa(Maxmem); 2318 physmap[i] = (physmap[i] + PHYSMAP_ALIGN_MASK) & 2319 ~PHYSMAP_ALIGN_MASK; 2320 physmap[i+1] = physmap[i+1] & ~PHYSMAP_ALIGN_MASK; 2321 2322 physmap[j] = physmap[i]; 2323 physmap[j+1] = physmap[i+1]; 2324 2325 if (physmap[i] < physmap[i+1]) 2326 j += 2; 2327 } 2328 physmap_idx = j - 2; 2329 2330 /* 2331 * Align anything else used in the validation loop. 2332 * 2333 * Also make sure that our 2MB kernel text+data+bss mappings 2334 * do not overlap potentially allocatable space. 2335 */ 2336 first = (first + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK; 2337 2338 /* 2339 * Size up each available chunk of physical memory. 2340 */ 2341 pa_indx = 0; 2342 da_indx = 0; 2343 phys_avail[pa_indx].phys_beg = physmap[0]; 2344 phys_avail[pa_indx].phys_end = physmap[0]; 2345 dump_avail[da_indx].phys_beg = 0; 2346 dump_avail[da_indx].phys_end = physmap[0]; 2347 pte = CMAP1; 2348 2349 /* 2350 * Get dcons buffer address 2351 */ 2352 if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 || 2353 kgetenv_quad("dcons.size", &dcons_size) == 0) 2354 dcons_addr = 0; 2355 2356 /* 2357 * Validate the physical memory. The physical memory segments 2358 * have already been aligned to PHYSMAP_ALIGN which is a multiple 2359 * of PAGE_SIZE. 2360 * 2361 * We no longer perform an exhaustive memory test. 
Instead we 2362 * simply test the first and last word in each physmap[] 2363 * segment. 2364 */ 2365 for (i = 0; i <= physmap_idx; i += 2) { 2366 vm_paddr_t end; 2367 vm_paddr_t incr; 2368 2369 end = physmap[i + 1]; 2370 2371 for (pa = physmap[i]; pa < end; pa += incr) { 2372 int page_bad, full; 2373 volatile uint64_t *ptr = (uint64_t *)CADDR1; 2374 uint64_t tmp; 2375 2376 full = FALSE; 2377 2378 /* 2379 * Calculate incr. Just test the first and 2380 * last page in each physmap[] segment. 2381 */ 2382 if (pa == end - PAGE_SIZE) 2383 incr = PAGE_SIZE; 2384 else 2385 incr = end - pa - PAGE_SIZE; 2386 2387 /* 2388 * Make sure we don't skip blacked out areas. 2389 */ 2390 if (pa < 0x200000 && 0x200000 < end) { 2391 incr = 0x200000 - pa; 2392 } 2393 if (dcons_addr > 0 && 2394 pa < dcons_addr && 2395 dcons_addr < end) { 2396 incr = dcons_addr - pa; 2397 } 2398 2399 /* 2400 * Block out kernel memory as not available. 2401 */ 2402 if (pa >= 0x200000 && pa < first) { 2403 incr = first - pa; 2404 if (pa + incr > end) 2405 incr = end - pa; 2406 goto do_dump_avail; 2407 } 2408 2409 /* 2410 * Block out the dcons buffer if it exists. 2411 */ 2412 if (dcons_addr > 0 && 2413 pa >= trunc_page(dcons_addr) && 2414 pa < dcons_addr + dcons_size) { 2415 incr = dcons_addr + dcons_size - pa; 2416 incr = (incr + PAGE_MASK) & 2417 ~(vm_paddr_t)PAGE_MASK; 2418 if (pa + incr > end) 2419 incr = end - pa; 2420 goto do_dump_avail; 2421 } 2422 2423 page_bad = FALSE; 2424 2425 /* 2426 * Map the page non-cacheable for the memory 2427 * test. 2428 */ 2429 *pte = pa | 2430 kernel_pmap.pmap_bits[PG_V_IDX] | 2431 kernel_pmap.pmap_bits[PG_RW_IDX] | 2432 kernel_pmap.pmap_bits[PG_N_IDX]; 2433 cpu_invlpg(__DEVOLATILE(void *, ptr)); 2434 cpu_mfence(); 2435 2436 /* 2437 * Save original value for restoration later. 2438 */ 2439 tmp = *ptr; 2440 2441 /* 2442 * Test for alternating 1's and 0's 2443 */ 2444 *ptr = 0xaaaaaaaaaaaaaaaaLLU; 2445 cpu_mfence(); 2446 if (*ptr != 0xaaaaaaaaaaaaaaaaLLU) 2447 page_bad = TRUE; 2448 /* 2449 * Test for alternating 0's and 1's 2450 */ 2451 *ptr = 0x5555555555555555LLU; 2452 cpu_mfence(); 2453 if (*ptr != 0x5555555555555555LLU) 2454 page_bad = TRUE; 2455 /* 2456 * Test for all 1's 2457 */ 2458 *ptr = 0xffffffffffffffffLLU; 2459 cpu_mfence(); 2460 if (*ptr != 0xffffffffffffffffLLU) 2461 page_bad = TRUE; 2462 /* 2463 * Test for all 0's 2464 */ 2465 *ptr = 0x0; 2466 cpu_mfence(); 2467 if (*ptr != 0x0) 2468 page_bad = TRUE; 2469 2470 /* 2471 * Restore original value. 2472 */ 2473 *ptr = tmp; 2474 2475 /* 2476 * Adjust array of valid/good pages. 2477 */ 2478 if (page_bad == TRUE) { 2479 incr = PAGE_SIZE; 2480 continue; 2481 } 2482 2483 /* 2484 * Collapse page address into phys_avail[]. Do a 2485 * continuation of the current phys_avail[] index 2486 * when possible. 2487 */ 2488 if (phys_avail[pa_indx].phys_end == pa) { 2489 /* 2490 * Continuation 2491 */ 2492 phys_avail[pa_indx].phys_end += incr; 2493 } else if (phys_avail[pa_indx].phys_beg == 2494 phys_avail[pa_indx].phys_end) { 2495 /* 2496 * Current phys_avail is completely empty, 2497 * reuse the index. 2498 */ 2499 phys_avail[pa_indx].phys_beg = pa; 2500 phys_avail[pa_indx].phys_end = pa + incr; 2501 } else { 2502 /* 2503 * Allocate next phys_avail index. 
2504 */ 2505 ++pa_indx; 2506 if (pa_indx == PHYS_AVAIL_ARRAY_END) { 2507 kprintf( 2508 "Too many holes in the physical address space, giving up\n"); 2509 --pa_indx; 2510 full = TRUE; 2511 goto do_dump_avail; 2512 } 2513 phys_avail[pa_indx].phys_beg = pa; 2514 phys_avail[pa_indx].phys_end = pa + incr; 2515 } 2516 physmem += incr / PAGE_SIZE; 2517 2518 /* 2519 * pa available for dumping 2520 */ 2521 do_dump_avail: 2522 if (dump_avail[da_indx].phys_end == pa) { 2523 dump_avail[da_indx].phys_end += incr; 2524 } else { 2525 ++da_indx; 2526 if (da_indx == DUMP_AVAIL_ARRAY_END) { 2527 --da_indx; 2528 goto do_next; 2529 } 2530 dump_avail[da_indx].phys_beg = pa; 2531 dump_avail[da_indx].phys_end = pa + incr; 2532 } 2533 do_next: 2534 if (full) 2535 break; 2536 } 2537 } 2538 *pte = 0; 2539 cpu_invltlb(); 2540 cpu_mfence(); 2541 2542 /* 2543 * The last chunk must contain at least one page plus the message 2544 * buffer to avoid complicating other code (message buffer address 2545 * calculation, etc.). 2546 */ 2547 msgbuf_size = (MSGBUF_SIZE + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK; 2548 2549 while (phys_avail[pa_indx].phys_beg + PHYSMAP_ALIGN + msgbuf_size >= 2550 phys_avail[pa_indx].phys_end) { 2551 physmem -= atop(phys_avail[pa_indx].phys_end - 2552 phys_avail[pa_indx].phys_beg); 2553 phys_avail[pa_indx].phys_beg = 0; 2554 phys_avail[pa_indx].phys_end = 0; 2555 --pa_indx; 2556 } 2557 2558 Maxmem = atop(phys_avail[pa_indx].phys_end); 2559 2560 /* Trim off space for the message buffer. */ 2561 phys_avail[pa_indx].phys_end -= msgbuf_size; 2562 2563 avail_end = phys_avail[pa_indx].phys_end; 2564 2565 /* Map the message buffer. */ 2566 for (off = 0; off < msgbuf_size; off += PAGE_SIZE) { 2567 pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off); 2568 } 2569 2570 /* 2571 * Try to get EFI framebuffer working as early as possible. 2572 * 2573 * WARN: Some BIOSes do not list the EFI framebuffer memory, causing 2574 * the pmap probe code to create a DMAP that does not cover its 2575 * physical address space, efi_fb_init_vaddr(1) might not return 2576 * an initialized framebuffer base pointer. In this situation the 2577 * later efi_fb_init_vaddr(0) call will deal with it. 2578 */ 2579 if (have_efi_framebuffer) 2580 efi_fb_init_vaddr(1); 2581 } 2582 2583 struct machintr_abi MachIntrABI; 2584 2585 /* 2586 * IDT VECTORS: 2587 * 0 Divide by zero 2588 * 1 Debug 2589 * 2 NMI 2590 * 3 BreakPoint 2591 * 4 OverFlow 2592 * 5 Bound-Range 2593 * 6 Invalid OpCode 2594 * 7 Device Not Available (x87) 2595 * 8 Double-Fault 2596 * 9 Coprocessor Segment overrun (unsupported, reserved) 2597 * 10 Invalid-TSS 2598 * 11 Segment not present 2599 * 12 Stack 2600 * 13 General Protection 2601 * 14 Page Fault 2602 * 15 Reserved 2603 * 16 x87 FP Exception pending 2604 * 17 Alignment Check 2605 * 18 Machine Check 2606 * 19 SIMD floating point 2607 * 20-31 reserved 2608 * 32-255 INTn/external sources 2609 */ 2610 u_int64_t 2611 hammer_time(u_int64_t modulep, u_int64_t physfree) 2612 { 2613 caddr_t kmdp; 2614 int gsel_tss, x, cpu; 2615 #if 0 /* JG */ 2616 int metadata_missing, off; 2617 #endif 2618 struct mdglobaldata *gd; 2619 struct privatespace *ps; 2620 u_int64_t msr; 2621 2622 /* 2623 * Prevent lowering of the ipl if we call tsleep() early. 
2624 */ 2625 gd = &CPU_prvspace[0]->mdglobaldata; 2626 ps = (struct privatespace *)gd; 2627 bzero(gd, sizeof(*gd)); 2628 bzero(&ps->common_tss, sizeof(ps->common_tss)); 2629 2630 /* 2631 * Note: on both UP and SMP curthread must be set non-NULL 2632 * early in the boot sequence because the system assumes 2633 * that 'curthread' is never NULL. 2634 */ 2635 2636 gd->mi.gd_curthread = &thread0; 2637 thread0.td_gd = &gd->mi; 2638 2639 atdevbase = ISA_HOLE_START + PTOV_OFFSET; 2640 2641 #if 0 /* JG */ 2642 metadata_missing = 0; 2643 if (bootinfo.bi_modulep) { 2644 preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; 2645 preload_bootstrap_relocate(KERNBASE); 2646 } else { 2647 metadata_missing = 1; 2648 } 2649 if (bootinfo.bi_envp) 2650 kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; 2651 #endif 2652 2653 preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET); 2654 preload_bootstrap_relocate(PTOV_OFFSET); 2655 kmdp = preload_search_by_type("elf kernel"); 2656 if (kmdp == NULL) 2657 kmdp = preload_search_by_type("elf64 kernel"); 2658 boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); 2659 kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET; 2660 #ifdef DDB 2661 ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); 2662 ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); 2663 #endif 2664 efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t); 2665 2666 if (boothowto & RB_VERBOSE) 2667 bootverbose++; 2668 2669 /* 2670 * Default MachIntrABI to ICU 2671 */ 2672 MachIntrABI = MachIntrABI_ICU; 2673 2674 /* 2675 * start with one cpu. Note: with one cpu, ncpus_fit_mask remain 0. 2676 */ 2677 ncpus = 1; 2678 ncpus_fit = 1; 2679 /* Init basic tunables, hz etc */ 2680 init_param1(); 2681 2682 /* 2683 * make gdt memory segments 2684 */ 2685 gdt_segs[GPROC0_SEL].ssd_base = 2686 (uintptr_t) &CPU_prvspace[0]->common_tss; 2687 2688 gd->mi.gd_prvspace = CPU_prvspace[0]; 2689 2690 for (x = 0; x < NGDT; x++) { 2691 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1)) 2692 ssdtosd(&gdt_segs[x], &gdt[x]); 2693 } 2694 ssdtosyssd(&gdt_segs[GPROC0_SEL], 2695 (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); 2696 2697 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; 2698 r_gdt.rd_base = (long) gdt; 2699 lgdt(&r_gdt); 2700 2701 wrmsr(MSR_FSBASE, 0); /* User value */ 2702 wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi); 2703 wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */ 2704 2705 mi_gdinit(&gd->mi, 0); 2706 cpu_gdinit(gd, 0); 2707 proc0paddr = proc0paddr_buff; 2708 mi_proc0init(&gd->mi, proc0paddr); 2709 safepri = TDPRI_MAX; 2710 2711 /* spinlocks and the BGL */ 2712 init_locks(); 2713 2714 /* exceptions */ 2715 for (x = 0; x < NIDT; x++) 2716 setidt_global(x, rsvdary[x], SDT_SYSIGT, SEL_KPL, 0); 2717 setidt_global(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0); 2718 setidt_global(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 2); 2719 setidt_global(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1); 2720 setidt_global(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0); 2721 setidt_global(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0); 2722 setidt_global(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0); 2723 setidt_global(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0); 2724 setidt_global(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0); 2725 setidt_global(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1); 2726 setidt_global(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0); 2727 setidt_global(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0); 2728 setidt_global(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, 
SEL_KPL, 0); 2729 setidt_global(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0); 2730 setidt_global(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0); 2731 setidt_global(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0); 2732 setidt_global(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0); 2733 setidt_global(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0); 2734 setidt_global(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0); 2735 setidt_global(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0); 2736 2737 for (cpu = 0; cpu < MAXCPU; ++cpu) { 2738 r_idt_arr[cpu].rd_limit = sizeof(idt_arr[cpu]) - 1; 2739 r_idt_arr[cpu].rd_base = (long) &idt_arr[cpu][0]; 2740 } 2741 2742 lidt(&r_idt_arr[0]); 2743 2744 /* 2745 * Initialize the console before we print anything out. 2746 */ 2747 cninit(); 2748 2749 #if 0 /* JG */ 2750 if (metadata_missing) 2751 kprintf("WARNING: loader(8) metadata is missing!\n"); 2752 #endif 2753 2754 #if NISA >0 2755 elcr_probe(); 2756 isa_defaultirq(); 2757 #endif 2758 rand_initialize(); 2759 2760 /* 2761 * Initialize IRQ mapping 2762 * 2763 * NOTE: 2764 * SHOULD be after elcr_probe() 2765 */ 2766 MachIntrABI_ICU.initmap(); 2767 MachIntrABI_IOAPIC.initmap(); 2768 2769 #ifdef DDB 2770 kdb_init(); 2771 if (boothowto & RB_KDB) 2772 Debugger("Boot flags requested debugger"); 2773 #endif 2774 2775 identify_cpu(); /* Final stage of CPU initialization */ 2776 initializecpu(0); /* Initialize CPU registers */ 2777 2778 /* 2779 * On modern Intel cpus, haswell or later, cpu_idle_hlt=1 is better 2780 * because the cpu does significant power management in MWAIT 2781 * (also suggested is to set sysctl machdep.mwait.CX.idle=AUTODEEP). 2782 * 2783 * On many AMD cpus cpu_idle_hlt=3 is better, because the cpu does 2784 * significant power management only when using ACPI halt mode. 2785 * (However, on Ryzen, mode 4 (HLT) also does power management). 2786 * 2787 * On older AMD or Intel cpus, cpu_idle_hlt=2 is better because ACPI 2788 * is needed to reduce power consumption, but wakeup times are often 2789 * too long. 2790 */ 2791 if (cpu_vendor_id == CPU_VENDOR_INTEL && 2792 CPUID_TO_MODEL(cpu_id) >= 0x3C) { /* Haswell or later */ 2793 cpu_idle_hlt = 1; 2794 } 2795 if (cpu_vendor_id == CPU_VENDOR_AMD) { 2796 if (CPUID_TO_FAMILY(cpu_id) >= 0x17) { 2797 /* Ryzen or later */ 2798 cpu_idle_hlt = 3; 2799 } else if (CPUID_TO_FAMILY(cpu_id) >= 0x14) { 2800 /* Bobcat or later */ 2801 cpu_idle_hlt = 3; 2802 } 2803 } 2804 2805 TUNABLE_INT_FETCH("hw.apic_io_enable", &ioapic_enable); /* for compat */ 2806 TUNABLE_INT_FETCH("hw.ioapic_enable", &ioapic_enable); 2807 TUNABLE_INT_FETCH("hw.lapic_enable", &lapic_enable); 2808 TUNABLE_INT_FETCH("machdep.cpu_idle_hlt", &cpu_idle_hlt); 2809 2810 /* 2811 * Some of the virtual machines do not work w/ I/O APIC 2812 * enabled. If the user does not explicitly enable or 2813 * disable the I/O APIC (ioapic_enable < 0), then we 2814 * disable I/O APIC on all virtual machines. 2815 * 2816 * NOTE: 2817 * This must be done after identify_cpu(), which sets 2818 * 'cpu_feature2' 2819 */ 2820 if (ioapic_enable < 0) { 2821 if (cpu_feature2 & CPUID2_VMM) 2822 ioapic_enable = 0; 2823 else 2824 ioapic_enable = 1; 2825 } 2826 2827 /* 2828 * TSS entry point for interrupts, traps, and exceptions 2829 * (sans NMI). This will always go to near the top of the pcpu 2830 * trampoline area. Hardware-pushed data will be copied into 2831 * the trap-frame on entry, and (if necessary) returned to the 2832 * trampoline on exit. 
2833 * 2834 * We store some pcb data for the trampoline code above the 2835 * stack the cpu hw pushes into, and arrange things so the 2836 * address of tr_pcb_rsp is the same as the desired top of 2837 * stack. 2838 */ 2839 ps->common_tss.tss_rsp0 = (register_t)&ps->trampoline.tr_pcb_rsp; 2840 ps->trampoline.tr_pcb_rsp = ps->common_tss.tss_rsp0; 2841 ps->trampoline.tr_pcb_gs_kernel = (register_t)gd; 2842 ps->trampoline.tr_pcb_cr3 = KPML4phys; /* adj to user cr3 live */ 2843 ps->dbltramp.tr_pcb_gs_kernel = (register_t)gd; 2844 ps->dbltramp.tr_pcb_cr3 = KPML4phys; 2845 ps->dbgtramp.tr_pcb_gs_kernel = (register_t)gd; 2846 ps->dbgtramp.tr_pcb_cr3 = KPML4phys; 2847 2848 /* double fault stack */ 2849 ps->common_tss.tss_ist1 = (register_t)&ps->dbltramp.tr_pcb_rsp; 2850 /* #DB debugger needs its own stack */ 2851 ps->common_tss.tss_ist2 = (register_t)&ps->dbgtramp.tr_pcb_rsp; 2852 2853 /* Set the IO permission bitmap (empty due to tss seg limit) */ 2854 ps->common_tss.tss_iobase = sizeof(struct x86_64tss); 2855 2856 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); 2857 gd->gd_tss_gdt = &gdt[GPROC0_SEL]; 2858 gd->gd_common_tssd = *gd->gd_tss_gdt; 2859 ltr(gsel_tss); 2860 2861 /* Set up the fast syscall stuff */ 2862 msr = rdmsr(MSR_EFER) | EFER_SCE; 2863 wrmsr(MSR_EFER, msr); 2864 wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall)); 2865 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); 2866 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | 2867 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); 2868 wrmsr(MSR_STAR, msr); 2869 wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL|PSL_AC); 2870 2871 getmemsize(kmdp, physfree); 2872 init_param2(physmem); 2873 2874 /* now running on new page tables, configured,and u/iom is accessible */ 2875 2876 /* Map the message buffer. */ 2877 #if 0 /* JG */ 2878 for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) 2879 pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off); 2880 #endif 2881 2882 msgbufinit(msgbufp, MSGBUF_SIZE); 2883 2884 2885 /* transfer to user mode */ 2886 2887 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); 2888 _udatasel = GSEL(GUDATA_SEL, SEL_UPL); 2889 _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL); 2890 2891 load_ds(_udatasel); 2892 load_es(_udatasel); 2893 load_fs(_udatasel); 2894 2895 /* setup proc 0's pcb */ 2896 thread0.td_pcb->pcb_flags = 0; 2897 thread0.td_pcb->pcb_cr3 = KPML4phys; 2898 thread0.td_pcb->pcb_cr3_iso = 0; 2899 thread0.td_pcb->pcb_ext = NULL; 2900 lwp0.lwp_md.md_regs = &proc0_tf; /* XXX needed? */ 2901 2902 /* Location of kernel stack for locore */ 2903 return ((u_int64_t)thread0.td_pcb); 2904 } 2905 2906 /* 2907 * Initialize machine-dependant portions of the global data structure. 2908 * Note that the global data area and cpu0's idlestack in the private 2909 * data space were allocated in locore. 2910 * 2911 * Note: the idlethread's cpl is 0 2912 * 2913 * WARNING! Called from early boot, 'mycpu' may not work yet. 
2914 */ 2915 void 2916 cpu_gdinit(struct mdglobaldata *gd, int cpu) 2917 { 2918 if (cpu) 2919 gd->mi.gd_curthread = &gd->mi.gd_idlethread; 2920 2921 lwkt_init_thread(&gd->mi.gd_idlethread, 2922 gd->mi.gd_prvspace->idlestack, 2923 sizeof(gd->mi.gd_prvspace->idlestack), 2924 0, &gd->mi); 2925 lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu); 2926 gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch; 2927 gd->mi.gd_idlethread.td_sp -= sizeof(void *); 2928 *(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore; 2929 } 2930 2931 /* 2932 * We only have to check for DMAP bounds, the globaldata space is 2933 * actually part of the kernel_map so we don't have to waste time 2934 * checking CPU_prvspace[*]. 2935 */ 2936 int 2937 is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr) 2938 { 2939 #if 0 2940 if (saddr >= (vm_offset_t)&CPU_prvspace[0] && 2941 eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) { 2942 return (TRUE); 2943 } 2944 #endif 2945 if (saddr >= DMAP_MIN_ADDRESS && eaddr <= DMAP_MAX_ADDRESS) 2946 return (TRUE); 2947 return (FALSE); 2948 } 2949 2950 struct globaldata * 2951 globaldata_find(int cpu) 2952 { 2953 KKASSERT(cpu >= 0 && cpu < ncpus); 2954 return(&CPU_prvspace[cpu]->mdglobaldata.mi); 2955 } 2956 2957 /* 2958 * This path should be safe from the SYSRET issue because only stopped threads 2959 * can have their %rip adjusted this way (and all heavy weight thread switches 2960 * clear QUICKREF and thus do not use SYSRET). However, the code path is 2961 * convoluted so add a safety by forcing %rip to be cannonical. 2962 */ 2963 int 2964 ptrace_set_pc(struct lwp *lp, unsigned long addr) 2965 { 2966 if (addr & 0x0000800000000000LLU) 2967 lp->lwp_md.md_regs->tf_rip = addr | 0xFFFF000000000000LLU; 2968 else 2969 lp->lwp_md.md_regs->tf_rip = addr & 0x0000FFFFFFFFFFFFLLU; 2970 return (0); 2971 } 2972 2973 int 2974 ptrace_single_step(struct lwp *lp) 2975 { 2976 lp->lwp_md.md_regs->tf_rflags |= PSL_T; 2977 return (0); 2978 } 2979 2980 int 2981 fill_regs(struct lwp *lp, struct reg *regs) 2982 { 2983 struct trapframe *tp; 2984 2985 if ((tp = lp->lwp_md.md_regs) == NULL) 2986 return EINVAL; 2987 bcopy(&tp->tf_rdi, ®s->r_rdi, sizeof(*regs)); 2988 return (0); 2989 } 2990 2991 int 2992 set_regs(struct lwp *lp, struct reg *regs) 2993 { 2994 struct trapframe *tp; 2995 2996 tp = lp->lwp_md.md_regs; 2997 if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) || 2998 !CS_SECURE(regs->r_cs)) 2999 return (EINVAL); 3000 bcopy(®s->r_rdi, &tp->tf_rdi, sizeof(*regs)); 3001 clear_quickret(); 3002 return (0); 3003 } 3004 3005 static void 3006 fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87) 3007 { 3008 struct env87 *penv_87 = &sv_87->sv_env; 3009 struct envxmm *penv_xmm = &sv_xmm->sv_env; 3010 int i; 3011 3012 /* FPU control/status */ 3013 penv_87->en_cw = penv_xmm->en_cw; 3014 penv_87->en_sw = penv_xmm->en_sw; 3015 penv_87->en_tw = penv_xmm->en_tw; 3016 penv_87->en_fip = penv_xmm->en_fip; 3017 penv_87->en_fcs = penv_xmm->en_fcs; 3018 penv_87->en_opcode = penv_xmm->en_opcode; 3019 penv_87->en_foo = penv_xmm->en_foo; 3020 penv_87->en_fos = penv_xmm->en_fos; 3021 3022 /* FPU registers */ 3023 for (i = 0; i < 8; ++i) 3024 sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc; 3025 } 3026 3027 static void 3028 set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm) 3029 { 3030 struct env87 *penv_87 = &sv_87->sv_env; 3031 struct envxmm *penv_xmm = &sv_xmm->sv_env; 3032 int i; 3033 3034 /* FPU control/status */ 3035 penv_xmm->en_cw = penv_87->en_cw; 3036 penv_xmm->en_sw = penv_87->en_sw; 3037 penv_xmm->en_tw = 
penv_87->en_tw; 3038 penv_xmm->en_fip = penv_87->en_fip; 3039 penv_xmm->en_fcs = penv_87->en_fcs; 3040 penv_xmm->en_opcode = penv_87->en_opcode; 3041 penv_xmm->en_foo = penv_87->en_foo; 3042 penv_xmm->en_fos = penv_87->en_fos; 3043 3044 /* FPU registers */ 3045 for (i = 0; i < 8; ++i) 3046 sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i]; 3047 } 3048 3049 int 3050 fill_fpregs(struct lwp *lp, struct fpreg *fpregs) 3051 { 3052 if (lp->lwp_thread == NULL || lp->lwp_thread->td_pcb == NULL) 3053 return EINVAL; 3054 if (cpu_fxsr) { 3055 fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm, 3056 (struct save87 *)fpregs); 3057 return (0); 3058 } 3059 bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs); 3060 return (0); 3061 } 3062 3063 int 3064 set_fpregs(struct lwp *lp, struct fpreg *fpregs) 3065 { 3066 if (cpu_fxsr) { 3067 set_fpregs_xmm((struct save87 *)fpregs, 3068 &lp->lwp_thread->td_pcb->pcb_save.sv_xmm); 3069 return (0); 3070 } 3071 bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs); 3072 return (0); 3073 } 3074 3075 int 3076 fill_dbregs(struct lwp *lp, struct dbreg *dbregs) 3077 { 3078 struct pcb *pcb; 3079 3080 if (lp == NULL) { 3081 dbregs->dr[0] = rdr0(); 3082 dbregs->dr[1] = rdr1(); 3083 dbregs->dr[2] = rdr2(); 3084 dbregs->dr[3] = rdr3(); 3085 dbregs->dr[4] = rdr4(); 3086 dbregs->dr[5] = rdr5(); 3087 dbregs->dr[6] = rdr6(); 3088 dbregs->dr[7] = rdr7(); 3089 return (0); 3090 } 3091 if (lp->lwp_thread == NULL || (pcb = lp->lwp_thread->td_pcb) == NULL) 3092 return EINVAL; 3093 dbregs->dr[0] = pcb->pcb_dr0; 3094 dbregs->dr[1] = pcb->pcb_dr1; 3095 dbregs->dr[2] = pcb->pcb_dr2; 3096 dbregs->dr[3] = pcb->pcb_dr3; 3097 dbregs->dr[4] = 0; 3098 dbregs->dr[5] = 0; 3099 dbregs->dr[6] = pcb->pcb_dr6; 3100 dbregs->dr[7] = pcb->pcb_dr7; 3101 return (0); 3102 } 3103 3104 int 3105 set_dbregs(struct lwp *lp, struct dbreg *dbregs) 3106 { 3107 if (lp == NULL) { 3108 load_dr0(dbregs->dr[0]); 3109 load_dr1(dbregs->dr[1]); 3110 load_dr2(dbregs->dr[2]); 3111 load_dr3(dbregs->dr[3]); 3112 load_dr4(dbregs->dr[4]); 3113 load_dr5(dbregs->dr[5]); 3114 load_dr6(dbregs->dr[6]); 3115 load_dr7(dbregs->dr[7]); 3116 } else { 3117 struct pcb *pcb; 3118 struct ucred *ucred; 3119 int i; 3120 uint64_t mask1, mask2; 3121 3122 /* 3123 * Don't let an illegal value for dr7 get set. Specifically, 3124 * check for undefined settings. Setting these bit patterns 3125 * result in undefined behaviour and can lead to an unexpected 3126 * TRCTRAP. 3127 */ 3128 /* JG this loop looks unreadable */ 3129 /* Check 4 2-bit fields for invalid patterns. 3130 * These fields are R/Wi, for i = 0..3 3131 */ 3132 /* Is 10 in LENi allowed when running in compatibility mode? */ 3133 /* Pattern 10 in R/Wi might be used to indicate 3134 * breakpoint on I/O. Further analysis should be 3135 * carried to decide if it is safe and useful to 3136 * provide access to that capability 3137 */ 3138 for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4; 3139 i++, mask1 <<= 4, mask2 <<= 4) 3140 if ((dbregs->dr[7] & mask1) == mask2) 3141 return (EINVAL); 3142 3143 pcb = lp->lwp_thread->td_pcb; 3144 ucred = lp->lwp_proc->p_ucred; 3145 3146 /* 3147 * Don't let a process set a breakpoint that is not within the 3148 * process's address space. If a process could do this, it 3149 * could halt the system by setting a breakpoint in the kernel 3150 * (if ddb was enabled). 
Thus, we need to check to make sure 3151 * that no breakpoints are being enabled for addresses outside 3152 * process's address space, unless, perhaps, we were called by 3153 * uid 0. 3154 * 3155 * XXX - what about when the watched area of the user's 3156 * address space is written into from within the kernel 3157 * ... wouldn't that still cause a breakpoint to be generated 3158 * from within kernel mode? 3159 */ 3160 3161 if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) { 3162 if (dbregs->dr[7] & 0x3) { 3163 /* dr0 is enabled */ 3164 if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS) 3165 return (EINVAL); 3166 } 3167 3168 if (dbregs->dr[7] & (0x3<<2)) { 3169 /* dr1 is enabled */ 3170 if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS) 3171 return (EINVAL); 3172 } 3173 3174 if (dbregs->dr[7] & (0x3<<4)) { 3175 /* dr2 is enabled */ 3176 if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS) 3177 return (EINVAL); 3178 } 3179 3180 if (dbregs->dr[7] & (0x3<<6)) { 3181 /* dr3 is enabled */ 3182 if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS) 3183 return (EINVAL); 3184 } 3185 } 3186 3187 pcb->pcb_dr0 = dbregs->dr[0]; 3188 pcb->pcb_dr1 = dbregs->dr[1]; 3189 pcb->pcb_dr2 = dbregs->dr[2]; 3190 pcb->pcb_dr3 = dbregs->dr[3]; 3191 pcb->pcb_dr6 = dbregs->dr[6]; 3192 pcb->pcb_dr7 = dbregs->dr[7]; 3193 3194 pcb->pcb_flags |= PCB_DBREGS; 3195 } 3196 3197 return (0); 3198 } 3199 3200 /* 3201 * Return > 0 if a hardware breakpoint has been hit, and the 3202 * breakpoint was in user space. Return 0, otherwise. 3203 */ 3204 int 3205 user_dbreg_trap(void) 3206 { 3207 u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */ 3208 u_int64_t bp; /* breakpoint bits extracted from dr6 */ 3209 int nbp; /* number of breakpoints that triggered */ 3210 caddr_t addr[4]; /* breakpoint addresses */ 3211 int i; 3212 3213 dr7 = rdr7(); 3214 if ((dr7 & 0xff) == 0) { 3215 /* 3216 * all GE and LE bits in the dr7 register are zero, 3217 * thus the trap couldn't have been caused by the 3218 * hardware debug registers 3219 */ 3220 return 0; 3221 } 3222 3223 nbp = 0; 3224 dr6 = rdr6(); 3225 bp = dr6 & 0xf; 3226 3227 if (bp == 0) { 3228 /* 3229 * None of the breakpoint bits are set meaning this 3230 * trap was not caused by any of the debug registers 3231 */ 3232 return 0; 3233 } 3234 3235 /* 3236 * at least one of the breakpoints were hit, check to see 3237 * which ones and if any of them are user space addresses 3238 */ 3239 3240 if (bp & 0x01) { 3241 addr[nbp++] = (caddr_t)rdr0(); 3242 } 3243 if (bp & 0x02) { 3244 addr[nbp++] = (caddr_t)rdr1(); 3245 } 3246 if (bp & 0x04) { 3247 addr[nbp++] = (caddr_t)rdr2(); 3248 } 3249 if (bp & 0x08) { 3250 addr[nbp++] = (caddr_t)rdr3(); 3251 } 3252 3253 for (i = 0; i < nbp; i++) { 3254 if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) { 3255 /* 3256 * addr[i] is in user space 3257 */ 3258 return nbp; 3259 } 3260 } 3261 3262 /* 3263 * None of the breakpoints are in user space. 3264 */ 3265 return 0; 3266 } 3267 3268 3269 #ifndef DDB 3270 void 3271 Debugger(const char *msg) 3272 { 3273 kprintf("Debugger(\"%s\") called.\n", msg); 3274 } 3275 #endif /* no DDB */ 3276 3277 #ifdef DDB 3278 3279 /* 3280 * Provide inb() and outb() as functions. They are normally only 3281 * available as macros calling inlined functions, thus cannot be 3282 * called inside DDB. 3283 * 3284 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined. 
3285 */ 3286 3287 #undef inb 3288 #undef outb 3289 3290 /* silence compiler warnings */ 3291 u_char inb(u_int); 3292 void outb(u_int, u_char); 3293 3294 u_char 3295 inb(u_int port) 3296 { 3297 u_char data; 3298 /* 3299 * We use %%dx and not %1 here because i/o is done at %dx and not at 3300 * %edx, while gcc generates inferior code (movw instead of movl) 3301 * if we tell it to load (u_short) port. 3302 */ 3303 __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); 3304 return (data); 3305 } 3306 3307 void 3308 outb(u_int port, u_char data) 3309 { 3310 u_char al; 3311 /* 3312 * Use an unnecessary assignment to help gcc's register allocator. 3313 * This make a large difference for gcc-1.40 and a tiny difference 3314 * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for 3315 * best results. gcc-2.6.0 can't handle this. 3316 */ 3317 al = data; 3318 __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); 3319 } 3320 3321 #endif /* DDB */ 3322 3323 3324 3325 /* 3326 * initialize all the SMP locks 3327 */ 3328 3329 /* critical region when masking or unmasking interupts */ 3330 struct spinlock_deprecated imen_spinlock; 3331 3332 /* locks com (tty) data/hardware accesses: a FASTINTR() */ 3333 struct spinlock_deprecated com_spinlock; 3334 3335 /* lock regions around the clock hardware */ 3336 struct spinlock_deprecated clock_spinlock; 3337 3338 static void 3339 init_locks(void) 3340 { 3341 /* 3342 * Get the initial mplock with a count of 1 for the BSP. 3343 * This uses a LOGICAL cpu ID, ie BSP == 0. 3344 */ 3345 cpu_get_initial_mplock(); 3346 /* DEPRECATED */ 3347 spin_init_deprecated(&imen_spinlock); 3348 spin_init_deprecated(&com_spinlock); 3349 spin_init_deprecated(&clock_spinlock); 3350 3351 /* our token pool needs to work early */ 3352 lwkt_token_pool_init(); 3353 } 3354 3355 boolean_t 3356 cpu_mwait_hint_valid(uint32_t hint) 3357 { 3358 int cx_idx, sub; 3359 3360 cx_idx = MWAIT_EAX_TO_CX(hint); 3361 if (cx_idx >= CPU_MWAIT_CX_MAX) 3362 return FALSE; 3363 3364 sub = MWAIT_EAX_TO_CX_SUB(hint); 3365 if (sub >= cpu_mwait_cx_info[cx_idx].subcnt) 3366 return FALSE; 3367 3368 return TRUE; 3369 } 3370 3371 void 3372 cpu_mwait_cx_no_bmsts(void) 3373 { 3374 atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_STS); 3375 } 3376 3377 void 3378 cpu_mwait_cx_no_bmarb(void) 3379 { 3380 atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_ARB); 3381 } 3382 3383 static int 3384 cpu_mwait_cx_hint2name(int hint, char *name, int namelen, boolean_t allow_auto) 3385 { 3386 int old_cx_idx, sub = 0; 3387 3388 if (hint >= 0) { 3389 old_cx_idx = MWAIT_EAX_TO_CX(hint); 3390 sub = MWAIT_EAX_TO_CX_SUB(hint); 3391 } else if (hint == CPU_MWAIT_HINT_AUTO) { 3392 old_cx_idx = allow_auto ? CPU_MWAIT_C2 : CPU_MWAIT_CX_MAX; 3393 } else if (hint == CPU_MWAIT_HINT_AUTODEEP) { 3394 old_cx_idx = allow_auto ? 
CPU_MWAIT_C3 : CPU_MWAIT_CX_MAX; 3395 } else { 3396 old_cx_idx = CPU_MWAIT_CX_MAX; 3397 } 3398 3399 if (!CPU_MWAIT_HAS_CX) 3400 strlcpy(name, "NONE", namelen); 3401 else if (allow_auto && hint == CPU_MWAIT_HINT_AUTO) 3402 strlcpy(name, "AUTO", namelen); 3403 else if (allow_auto && hint == CPU_MWAIT_HINT_AUTODEEP) 3404 strlcpy(name, "AUTODEEP", namelen); 3405 else if (old_cx_idx >= CPU_MWAIT_CX_MAX || 3406 sub >= cpu_mwait_cx_info[old_cx_idx].subcnt) 3407 strlcpy(name, "INVALID", namelen); 3408 else 3409 ksnprintf(name, namelen, "C%d/%d", old_cx_idx, sub); 3410 3411 return old_cx_idx; 3412 } 3413 3414 static int 3415 cpu_mwait_cx_name2hint(char *name, int *hint0, boolean_t allow_auto) 3416 { 3417 int cx_idx, sub, hint; 3418 char *ptr, *start; 3419 3420 if (allow_auto && strcmp(name, "AUTO") == 0) { 3421 hint = CPU_MWAIT_HINT_AUTO; 3422 cx_idx = CPU_MWAIT_C2; 3423 goto done; 3424 } 3425 if (allow_auto && strcmp(name, "AUTODEEP") == 0) { 3426 hint = CPU_MWAIT_HINT_AUTODEEP; 3427 cx_idx = CPU_MWAIT_C3; 3428 goto done; 3429 } 3430 3431 if (strlen(name) < 4 || toupper(name[0]) != 'C') 3432 return -1; 3433 start = &name[1]; 3434 ptr = NULL; 3435 3436 cx_idx = strtol(start, &ptr, 10); 3437 if (ptr == start || *ptr != '/') 3438 return -1; 3439 if (cx_idx < 0 || cx_idx >= CPU_MWAIT_CX_MAX) 3440 return -1; 3441 3442 start = ptr + 1; 3443 ptr = NULL; 3444 3445 sub = strtol(start, &ptr, 10); 3446 if (*ptr != '\0') 3447 return -1; 3448 if (sub < 0 || sub >= cpu_mwait_cx_info[cx_idx].subcnt) 3449 return -1; 3450 3451 hint = MWAIT_EAX_HINT(cx_idx, sub); 3452 done: 3453 *hint0 = hint; 3454 return cx_idx; 3455 } 3456 3457 static int 3458 cpu_mwait_cx_transit(int old_cx_idx, int cx_idx) 3459 { 3460 if (cx_idx >= CPU_MWAIT_C3 && cpu_mwait_c3_preamble) 3461 return EOPNOTSUPP; 3462 if (old_cx_idx < CPU_MWAIT_C3 && cx_idx >= CPU_MWAIT_C3) { 3463 int error; 3464 3465 error = cputimer_intr_powersave_addreq(); 3466 if (error) 3467 return error; 3468 } else if (old_cx_idx >= CPU_MWAIT_C3 && cx_idx < CPU_MWAIT_C3) { 3469 cputimer_intr_powersave_remreq(); 3470 } 3471 return 0; 3472 } 3473 3474 static int 3475 cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS, int *hint0, 3476 boolean_t allow_auto) 3477 { 3478 int error, cx_idx, old_cx_idx, hint; 3479 char name[CPU_MWAIT_CX_NAMELEN]; 3480 3481 hint = *hint0; 3482 old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), 3483 allow_auto); 3484 3485 error = sysctl_handle_string(oidp, name, sizeof(name), req); 3486 if (error != 0 || req->newptr == NULL) 3487 return error; 3488 3489 if (!CPU_MWAIT_HAS_CX) 3490 return EOPNOTSUPP; 3491 3492 cx_idx = cpu_mwait_cx_name2hint(name, &hint, allow_auto); 3493 if (cx_idx < 0) 3494 return EINVAL; 3495 3496 error = cpu_mwait_cx_transit(old_cx_idx, cx_idx); 3497 if (error) 3498 return error; 3499 3500 *hint0 = hint; 3501 return 0; 3502 } 3503 3504 static int 3505 cpu_mwait_cx_setname(struct cpu_idle_stat *stat, const char *cx_name) 3506 { 3507 int error, cx_idx, old_cx_idx, hint; 3508 char name[CPU_MWAIT_CX_NAMELEN]; 3509 3510 KASSERT(CPU_MWAIT_HAS_CX, ("cpu does not support mwait CX extension")); 3511 3512 hint = stat->hint; 3513 old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE); 3514 3515 strlcpy(name, cx_name, sizeof(name)); 3516 cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE); 3517 if (cx_idx < 0) 3518 return EINVAL; 3519 3520 error = cpu_mwait_cx_transit(old_cx_idx, cx_idx); 3521 if (error) 3522 return error; 3523 3524 stat->hint = hint; 3525 return 0; 3526 } 3527 3528 static int 3529 
cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS) 3530 { 3531 int hint = cpu_mwait_halt_global; 3532 int error, cx_idx, cpu; 3533 char name[CPU_MWAIT_CX_NAMELEN], cx_name[CPU_MWAIT_CX_NAMELEN]; 3534 3535 cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE); 3536 3537 error = sysctl_handle_string(oidp, name, sizeof(name), req); 3538 if (error != 0 || req->newptr == NULL) 3539 return error; 3540 3541 if (!CPU_MWAIT_HAS_CX) 3542 return EOPNOTSUPP; 3543 3544 /* Save name for later per-cpu CX configuration */ 3545 strlcpy(cx_name, name, sizeof(cx_name)); 3546 3547 cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE); 3548 if (cx_idx < 0) 3549 return EINVAL; 3550 3551 /* Change per-cpu CX configuration */ 3552 for (cpu = 0; cpu < ncpus; ++cpu) { 3553 error = cpu_mwait_cx_setname(&cpu_idle_stats[cpu], cx_name); 3554 if (error) 3555 return error; 3556 } 3557 3558 cpu_mwait_halt_global = hint; 3559 return 0; 3560 } 3561 3562 static int 3563 cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS) 3564 { 3565 struct cpu_idle_stat *stat = arg1; 3566 int error; 3567 3568 error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req, 3569 &stat->hint, TRUE); 3570 return error; 3571 } 3572 3573 static int 3574 cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS) 3575 { 3576 int error; 3577 3578 error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req, 3579 &cpu_mwait_spin, FALSE); 3580 return error; 3581 } 3582 3583 /* 3584 * This manual debugging code is called unconditionally from Xtimer 3585 * (the per-cpu timer interrupt) whether the current thread is in a 3586 * critical section or not) and can be useful in tracking down lockups. 3587 * 3588 * NOTE: MANUAL DEBUG CODE 3589 */ 3590 #if 0 3591 static int saveticks[SMP_MAXCPU]; 3592 static int savecounts[SMP_MAXCPU]; 3593 #endif 3594 3595 void 3596 pcpu_timer_always(struct intrframe *frame) 3597 { 3598 #if 0 3599 globaldata_t gd = mycpu; 3600 int cpu = gd->gd_cpuid; 3601 char buf[64]; 3602 short *gptr; 3603 int i; 3604 3605 if (cpu <= 20) { 3606 gptr = (short *)0xFFFFFFFF800b8000 + 80 * cpu; 3607 *gptr = ((*gptr + 1) & 0x00FF) | 0x0700; 3608 ++gptr; 3609 3610 ksnprintf(buf, sizeof(buf), " %p %16s %d %16s ", 3611 (void *)frame->if_rip, gd->gd_curthread->td_comm, ticks, 3612 gd->gd_infomsg); 3613 for (i = 0; buf[i]; ++i) { 3614 gptr[i] = 0x0700 | (unsigned char)buf[i]; 3615 } 3616 } 3617 #if 0 3618 if (saveticks[gd->gd_cpuid] != ticks) { 3619 saveticks[gd->gd_cpuid] = ticks; 3620 savecounts[gd->gd_cpuid] = 0; 3621 } 3622 ++savecounts[gd->gd_cpuid]; 3623 if (savecounts[gd->gd_cpuid] > 2000 && panicstr == NULL) { 3624 panic("cpud %d panicing on ticks failure", 3625 gd->gd_cpuid); 3626 } 3627 for (i = 0; i < ncpus; ++i) { 3628 int delta; 3629 if (saveticks[i] && panicstr == NULL) { 3630 delta = saveticks[i] - ticks; 3631 if (delta < -10 || delta > 10) { 3632 panic("cpu %d panicing on cpu %d watchdog", 3633 gd->gd_cpuid, i); 3634 } 3635 } 3636 } 3637 #endif 3638 #endif 3639 } 3640 3641 SET_DECLARE(smap_open, char); 3642 SET_DECLARE(smap_close, char); 3643 3644 static void 3645 cpu_implement_smap(void) 3646 { 3647 char **scan; 3648 3649 for (scan = SET_BEGIN(smap_open); /* nop -> stac */ 3650 scan < SET_LIMIT(smap_open); ++scan) { 3651 (*scan)[0] = 0x0F; 3652 (*scan)[1] = 0x01; 3653 (*scan)[2] = 0xCB; 3654 } 3655 for (scan = SET_BEGIN(smap_close); /* nop -> clac */ 3656 scan < SET_LIMIT(smap_close); ++scan) { 3657 (*scan)[0] = 0x0F; 3658 (*scan)[1] = 0x01; 3659 (*scan)[2] = 0xCA; 3660 } 3661 } 3662 3663 /* 3664 * From a hard interrupt 3665 */ 3666 int 3667 
cpu_interrupt_running(struct thread *td) 3668 { 3669 struct mdglobaldata *gd = mdcpu; 3670 3671 if (clock_debug1 > 0) { 3672 --clock_debug1; 3673 kprintf("%d %016lx %016lx %016lx\n", 3674 ((td->td_flags & TDF_INTTHREAD) != 0), 3675 gd->gd_ipending[0], 3676 gd->gd_ipending[1], 3677 gd->gd_ipending[2]); 3678 if (td->td_flags & TDF_CLKTHREAD) { 3679 kprintf("CLKTD %s PREEMPT %s\n", 3680 td->td_comm, 3681 (td->td_preempted ? 3682 td->td_preempted->td_comm : "")); 3683 } else { 3684 kprintf("NORTD %s\n", td->td_comm); 3685 } 3686 } 3687 if ((td->td_flags & TDF_INTTHREAD) || 3688 gd->gd_ipending[0] || 3689 gd->gd_ipending[1] || 3690 gd->gd_ipending[2]) { 3691 return 1; 3692 } else { 3693 return 0; 3694 } 3695 } 3696