/*-
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 2008 The DragonFly Project.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
 */

//#include "use_npx.h"
#include "use_isa.h"
#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_directio.h"
#include "opt_inet.h"
#include "opt_msgbuf.h"
#include "opt_swap.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/priv.h>
#include <sys/buf.h>
#include <sys/reboot.h>
#include <sys/mbuf.h>
#include <sys/msgbuf.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/bus.h>
#include <sys/usched.h>
#include <sys/reg.h>
#include <sys/sbuf.h>
#include <sys/ctype.h>
#include <sys/serialize.h>
#include <sys/systimer.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>
#include <sys/mutex2.h>

#include <sys/user.h>
#include <sys/exec.h>
#include <sys/cons.h>

#include <sys/efi.h>

#include <ddb/ddb.h>

#include <machine/cpu.h>
#include <machine/clock.h>
#include <machine/specialreg.h>
#if 0 /* JG */
#include <machine/bootinfo.h>
#endif
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/pcb_ext.h>		/* pcb.h included via sys/user.h */
#include <machine/globaldata.h>		/* CPU_prvspace */
#include <machine/smp.h>
#ifdef PERFMON
#include <machine/perfmon.h>
#endif
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <machine/framebuffer.h>

#ifdef OLD_BUS_ARCH
#include <bus/isa/isa_device.h>
#endif
#include <machine_base/isa/isa_intr.h>
#include <bus/isa/rtc.h>
#include <sys/random.h>
#include <sys/ptrace.h>
#include <machine/sigframe.h>

#include <sys/machintr.h>
#include <machine_base/icu/icu_abi.h>
#include <machine_base/icu/elcr_var.h>
#include <machine_base/apic/lapic.h>
#include <machine_base/apic/ioapic.h>
#include <machine_base/apic/ioapic_abi.h>
#include <machine/mptable.h>

#define PHYSMAP_ENTRIES		10

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

extern void printcpuinfo(void);	/* XXX header file */
extern void identify_cpu(void);
#if 0 /* JG */
extern void finishidentcpu(void);
#endif
extern void panicifcpuunsupported(void);

static void cpu_startup(void *);
static void pic_finish(void *);
static void cpu_finish(void *);

static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
#ifdef DIRECTIO
extern void ffs_rawread_setup(void);
#endif /* DIRECTIO */
static void init_locks(void);

SYSINIT(cpu, SI_BOOT2_START_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
SYSINIT(pic_finish, SI_BOOT2_FINISH_PIC, SI_ORDER_FIRST, pic_finish, NULL);
SYSINIT(cpu_finish, SI_BOOT2_FINISH_CPU, SI_ORDER_FIRST, cpu_finish, NULL);

#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif
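
/*
 * Private per-cpu space for the boot processor, statically allocated so
 * it is usable before the VM system comes up.  The AP entries of
 * CPU_prvspace[] are filled in later, during SMP startup; until then the
 * array points only at the BSP's space.
 */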
struct privatespace CPU_prvspace_bsp __aligned(4096);
struct privatespace *CPU_prvspace[MAXCPU] = { &CPU_prvspace_bsp };

int	_udatasel, _ucodesel, _ucode32sel;
u_long	atdevbase;
int64_t tsc_offsets[MAXCPU];

static int cpu_mwait_halt_global; /* MWAIT hint (EAX) or CPU_MWAIT_HINT_ */

#if defined(SWTCH_OPTIM_STATS)
extern int swtch_optim_stats;
SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
	CTLFLAG_RD, &swtch_optim_stats, 0, "");
SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
	CTLFLAG_RD, &tlb_flush_count, 0, "");
#endif
SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_halt,
	CTLFLAG_RD, &cpu_mwait_halt_global, 0, "");
SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_spin, CTLFLAG_RD, &cpu_mwait_spin, 0,
	"monitor/mwait target state");

#define CPU_MWAIT_HAS_CX	\
	((cpu_feature2 & CPUID2_MON) && \
	 (cpu_mwait_feature & CPUID_MWAIT_EXT))

#define CPU_MWAIT_CX_NAMELEN	16

#define CPU_MWAIT_C1		1
#define CPU_MWAIT_C2		2
#define CPU_MWAIT_C3		3
#define CPU_MWAIT_CX_MAX	8

#define CPU_MWAIT_HINT_AUTO	-1	/* C1 and C2 */
#define CPU_MWAIT_HINT_AUTODEEP	-2	/* C3+ */

SYSCTL_NODE(_machdep, OID_AUTO, mwait, CTLFLAG_RW, 0, "MWAIT features");
SYSCTL_NODE(_machdep_mwait, OID_AUTO, CX, CTLFLAG_RW, 0, "MWAIT Cx settings");

struct cpu_mwait_cx {
	int			subcnt;
	char			name[4];
	struct sysctl_ctx_list	sysctl_ctx;
	struct sysctl_oid	*sysctl_tree;
};
static struct cpu_mwait_cx cpu_mwait_cx_info[CPU_MWAIT_CX_MAX];
static char cpu_mwait_cx_supported[256];

static int cpu_mwait_c1_hints_cnt;
static int cpu_mwait_hints_cnt;
static int *cpu_mwait_hints;

static int cpu_mwait_deep_hints_cnt;
static int *cpu_mwait_deep_hints;

#define CPU_IDLE_REPEAT_DEFAULT	750

static u_int cpu_idle_repeat = CPU_IDLE_REPEAT_DEFAULT;
static u_long cpu_idle_repeat_max = CPU_IDLE_REPEAT_DEFAULT;
static u_int cpu_mwait_repeat_shift = 1;

#define CPU_MWAIT_C3_PREAMBLE_BM_ARB	0x1
#define CPU_MWAIT_C3_PREAMBLE_BM_STS	0x2

static int cpu_mwait_c3_preamble =
	CPU_MWAIT_C3_PREAMBLE_BM_ARB |
	CPU_MWAIT_C3_PREAMBLE_BM_STS;

SYSCTL_STRING(_machdep_mwait_CX, OID_AUTO, supported, CTLFLAG_RD,
	cpu_mwait_cx_supported, 0, "MWAIT supported C states");
SYSCTL_INT(_machdep_mwait_CX, OID_AUTO, c3_preamble, CTLFLAG_RD,
	&cpu_mwait_c3_preamble, 0, "C3+ preamble mask");

static int cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS,
	int *, boolean_t);
static int cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS);
static int cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS);

SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, idle, CTLTYPE_STRING|CTLFLAG_RW,
	NULL, 0, cpu_mwait_cx_idle_sysctl, "A", "");
SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, spin, CTLTYPE_STRING|CTLFLAG_RW,
	NULL, 0, cpu_mwait_cx_spin_sysctl, "A", "");
SYSCTL_UINT(_machdep_mwait_CX, OID_AUTO, repeat_shift, CTLFLAG_RW,
	&cpu_mwait_repeat_shift, 0, "");

long physmem = 0;

u_long ebda_addr = 0;

int imcr_present = 0;

int naps = 0;			/* # of application processors */

u_int base_memory;
struct mtx dt_lock;		/* lock for GDT and LDT */

static int
sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
{
	u_long pmem = ctob(physmem);

	int error = sysctl_handle_long(oidp, &pmem, 0, req);
	return (error);
}
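
/*
 * Userland reads the total memory size exported below via the
 * hw.physmem sysctl, e.g. "sysctl hw.physmem" from the shell or
 * sysctlbyname("hw.physmem", &val, &len, NULL, 0) from C.
 */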
SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG|CTLFLAG_RD,
	0, 0, sysctl_hw_physmem, "LU",
	"Total system memory in bytes (number of pages * page size)");

static int
sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
{
	int error = sysctl_handle_int(oidp, 0,
		ctob(physmem - vmstats.v_wire_count), req);
	return (error);
}

SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD,
	0, 0, sysctl_hw_usermem, "IU", "");

static int
sysctl_hw_availpages(SYSCTL_HANDLER_ARGS)
{
	int error = sysctl_handle_int(oidp, 0,
		x86_64_btop(avail_end - avail_start), req);
	return (error);
}

SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD,
	0, 0, sysctl_hw_availpages, "I", "");

vm_paddr_t Maxmem;
vm_paddr_t Realmem;

/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
#define PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))

vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* must be 2 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END	(NELEM(phys_avail) - 2)
#define DUMP_AVAIL_ARRAY_END	(NELEM(dump_avail) - 2)

static vm_offset_t buffer_sva, buffer_eva;
vm_offset_t clean_sva, clean_eva;
static vm_offset_t pager_sva, pager_eva;
static struct trapframe proc0_tf;

static void
cpu_startup(void *dummy)
{
	caddr_t v;
	vm_size_t size = 0;
	vm_offset_t firstaddr;

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	kprintf("%s", version);
	startrtclock();
	printcpuinfo();
	panicifcpuunsupported();
#ifdef PERFMON
	perfmon_init();
#endif
	kprintf("real memory = %ju (%ju MB)\n",
		(intmax_t)Realmem,
		(intmax_t)Realmem / 1024 / 1024);
	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		kprintf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size1 =
				phys_avail[indx + 1] - phys_avail[indx];

			kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
				(intmax_t)phys_avail[indx],
				(intmax_t)phys_avail[indx + 1] - 1,
				(intmax_t)size1,
				(intmax_t)(size1 / PAGE_SIZE));
		}
	}

	/*
	 * Allocate space for system data structures.
	 * The first available kernel virtual address is in "v".
	 * As pages of kernel virtual memory are allocated, "v" is incremented.
	 * As pages of memory are allocated and cleared,
	 * "firstaddr" is incremented.
	 * An index into the kernel page table corresponding to the
	 * virtual memory address maintained in "v" is kept in "mapaddr".
	 */

	/*
	 * Make two passes.  The first pass calculates how much memory is
	 * needed and allocates it.  The second pass assigns virtual
	 * addresses to the various data structures.
	 */
	firstaddr = 0;
again:
	v = (caddr_t)firstaddr;

#define	valloc(name, type, num) \
	    (name) = (type *)v; v = (caddr_t)((name)+(num))
#define	valloclim(name, type, num, lim) \
	    (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))

	/*
	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
	 * For the first 64MB of ram nominally allocate sufficient buffers to
	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
	 * buffers to cover 1/20 of our ram over 64MB.  When auto-sizing
	 * the buffer cache we limit the eventual kva reservation to
	 * maxbcache bytes.
	 *
	 * factor represents the 1/4 x ram conversion.
	 */
	if (nbuf == 0) {
		long factor = 4 * BKVASIZE / 1024;
		long kbytes = physmem * (PAGE_SIZE / 1024);

		nbuf = 50;
		if (kbytes > 4096)
			nbuf += min((kbytes - 4096) / factor, 65536 / factor);
		if (kbytes > 65536)
			nbuf += (kbytes - 65536) * 2 / (factor * 5);
		if (maxbcache && nbuf > maxbcache / BKVASIZE)
			nbuf = maxbcache / BKVASIZE;
	}
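
	/*
	 * Worked example of the auto-sizing above (a sketch, assuming a
	 * 16KB BKVASIZE so factor == 64): with 8GB of ram,
	 * kbytes == 8388608, giving
	 *
	 *	nbuf = 50 + 65536/64 + (8388608 - 65536) * 2 / 320
	 *	     = 50 + 1024 + 52019 ~= 53093 buffers
	 *
	 * before the kvm, physmem, and valloc caps below are applied.
	 */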

	/*
	 * Do not allow the buffer_map to be more than 1/2 the size of the
	 * kernel_map.
	 */
	if (nbuf > (virtual_end - virtual_start +
		    virtual2_end - virtual2_start) / (BKVASIZE * 2)) {
		nbuf = (virtual_end - virtual_start +
			virtual2_end - virtual2_start) / (BKVASIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to kvm\n", nbuf);
	}

	/*
	 * Do not allow the buffer_map to use more than 50% of available
	 * physical-equivalent memory.  Since the VM pages which back
	 * individual buffers are typically wired, having too many bufs
	 * can prevent the system from paging properly.
	 */
	if (nbuf > physmem * PAGE_SIZE / (BKVASIZE * 2)) {
		nbuf = physmem * PAGE_SIZE / (BKVASIZE * 2);
		kprintf("Warning: nbufs capped at %ld due to physmem\n", nbuf);
	}

	/*
	 * Do not allow the sizeof(struct buf) * nbuf to exceed half of
	 * the valloc space which is just the virtual_end - virtual_start
	 * section.  We use valloc() to allocate the buf header array.
	 */
	if (nbuf > (virtual_end - virtual_start) / sizeof(struct buf) / 2) {
		nbuf = (virtual_end - virtual_start) /
		       sizeof(struct buf) / 2;
		kprintf("Warning: nbufs capped at %ld due to valloc "
			"considerations", nbuf);
	}

	nswbuf = lmax(lmin(nbuf / 4, 256), 16);
#ifdef NSWBUF_MIN
	if (nswbuf < NSWBUF_MIN)
		nswbuf = NSWBUF_MIN;
#endif
#ifdef DIRECTIO
	ffs_rawread_setup();
#endif

	valloc(swbuf, struct buf, nswbuf);
	valloc(buf, struct buf, nbuf);

	/*
	 * End of first pass, size has been calculated so allocate memory
	 */
	if (firstaddr == 0) {
		size = (vm_size_t)(v - firstaddr);
		firstaddr = kmem_alloc(&kernel_map, round_page(size));
		if (firstaddr == 0)
			panic("startup: no room for tables");
		goto again;
	}

	/*
	 * End of second pass, addresses have been assigned
	 *
	 * nbuf is an int, make sure we don't overflow the field.
	 *
	 * On 64-bit systems we always reserve maximal allocations for
	 * buffer cache buffers and there are no fragmentation issues,
	 * so the KVA segment does not have to be excessively oversized.
	 */
	if ((vm_size_t)(v - firstaddr) != size)
		panic("startup: table size inconsistency");

	kmem_suballoc(&kernel_map, &clean_map, &clean_sva, &clean_eva,
		      ((vm_offset_t)(nbuf + 16) * BKVASIZE) +
		      (nswbuf * MAXPHYS) + pager_map_size);
	kmem_suballoc(&clean_map, &buffer_map, &buffer_sva, &buffer_eva,
		      ((vm_offset_t)(nbuf + 16) * BKVASIZE));
	buffer_map.system_map = 1;
	kmem_suballoc(&clean_map, &pager_map, &pager_sva, &pager_eva,
		      ((vm_offset_t)nswbuf * MAXPHYS) + pager_map_size);
	pager_map.system_map = 1;
	kprintf("avail memory = %ju (%ju MB)\n",
		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages),
		(uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages) /
		1024 / 1024);
}

struct cpu_idle_stat {
	int	hint;
	int	reserved;
	u_long	halt;
	u_long	spin;
	u_long	repeat;
	u_long	repeat_last;
	u_long	repeat_delta;
	u_long	mwait_cx[CPU_MWAIT_CX_MAX];
} __cachealign;

#define CPU_IDLE_STAT_HALT	-1
#define CPU_IDLE_STAT_SPIN	-2

static struct cpu_idle_stat cpu_idle_stats[MAXCPU];

static int
sysctl_cpu_idle_cnt(SYSCTL_HANDLER_ARGS)
{
	int idx = arg2, cpu, error;
	u_long val = 0;

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].halt;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].spin;
	} else {
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			val += cpu_idle_stats[cpu].mwait_cx[idx];
	}

	error = sysctl_handle_quad(oidp, &val, 0, req);
	if (error || req->newptr == NULL)
		return error;

	if (idx == CPU_IDLE_STAT_HALT) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].halt = 0;
		cpu_idle_stats[0].halt = val;
	} else if (idx == CPU_IDLE_STAT_SPIN) {
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].spin = 0;
		cpu_idle_stats[0].spin = val;
	} else {
		KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
		    ("invalid index %d", idx));
		for (cpu = 0; cpu < ncpus; ++cpu)
			cpu_idle_stats[cpu].mwait_cx[idx] = 0;
		cpu_idle_stats[0].mwait_cx[idx] = val;
	}
	return 0;
}

static void
cpu_mwait_attach(void)
{
	struct sbuf sb;
	int hint_idx, i;

	if (!CPU_MWAIT_HAS_CX)
		return;

	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    (CPUID_TO_FAMILY(cpu_id) > 0xf ||
	     (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
	      CPUID_TO_MODEL(cpu_id) >= 0xf))) {
		int bm_sts = 1;

		/*
		 * Pentium dual-core, Core 2 and beyond do not need any
		 * additional activities to enter deep C-state, i.e. C3(+).
		 */
		cpu_mwait_cx_no_bmarb();

		TUNABLE_INT_FETCH("machdep.cpu.mwait.bm_sts", &bm_sts);
		if (!bm_sts)
			cpu_mwait_cx_no_bmsts();
	}

	sbuf_new(&sb, cpu_mwait_cx_supported,
	    sizeof(cpu_mwait_cx_supported), SBUF_FIXEDLEN);

	for (i = 0; i < CPU_MWAIT_CX_MAX; ++i) {
		struct cpu_mwait_cx *cx = &cpu_mwait_cx_info[i];
		int sub;

		ksnprintf(cx->name, sizeof(cx->name), "C%d", i);

		sysctl_ctx_init(&cx->sysctl_ctx);
		cx->sysctl_tree = SYSCTL_ADD_NODE(&cx->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait), OID_AUTO,
		    cx->name, CTLFLAG_RW, NULL, "Cx control/info");
		if (cx->sysctl_tree == NULL)
			continue;

		cx->subcnt = CPUID_MWAIT_CX_SUBCNT(cpu_mwait_extemu, i);
		SYSCTL_ADD_INT(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "subcnt", CTLFLAG_RD, &cx->subcnt, 0,
		    "sub-state count");
		SYSCTL_ADD_PROC(&cx->sysctl_ctx,
		    SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
		    "entered", (CTLTYPE_QUAD | CTLFLAG_RW), 0,
		    i, sysctl_cpu_idle_cnt, "Q", "# of times entered");

		for (sub = 0; sub < cx->subcnt; ++sub)
			sbuf_printf(&sb, "C%d/%d ", i, sub);
	}
	sbuf_trim(&sb);
	sbuf_finish(&sb);

	/*
	 * Non-deep C-states
	 */
	cpu_mwait_c1_hints_cnt = cpu_mwait_cx_info[CPU_MWAIT_C1].subcnt;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i)
		cpu_mwait_hints_cnt += cpu_mwait_cx_info[i].subcnt;
	cpu_mwait_hints = kmalloc(sizeof(int) * cpu_mwait_hints_cnt,
	    M_DEVBUF, M_WAITOK);

	hint_idx = 0;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i) {
		int j, subcnt;

		subcnt = cpu_mwait_cx_info[i].subcnt;
		for (j = 0; j < subcnt; ++j) {
			KASSERT(hint_idx < cpu_mwait_hints_cnt,
			    ("invalid mwait hint index %d", hint_idx));
			cpu_mwait_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
			++hint_idx;
		}
	}
	KASSERT(hint_idx == cpu_mwait_hints_cnt,
	    ("mwait hint count %d != index %d",
	     cpu_mwait_hints_cnt, hint_idx));

	if (bootverbose) {
		kprintf("MWAIT hints (%d C1 hints):\n", cpu_mwait_c1_hints_cnt);
		for (i = 0; i < cpu_mwait_hints_cnt; ++i) {
			int hint = cpu_mwait_hints[i];

			kprintf("  C%d/%d hint 0x%04x\n",
			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
			    hint);
		}
	}

	/*
	 * Deep C-states
	 */
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i)
		cpu_mwait_deep_hints_cnt += cpu_mwait_cx_info[i].subcnt;
	cpu_mwait_deep_hints = kmalloc(sizeof(int) * cpu_mwait_deep_hints_cnt,
	    M_DEVBUF, M_WAITOK);

	hint_idx = 0;
	for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i) {
		int j, subcnt;

		subcnt = cpu_mwait_cx_info[i].subcnt;
		for (j = 0; j < subcnt; ++j) {
			KASSERT(hint_idx < cpu_mwait_deep_hints_cnt,
			    ("invalid mwait deep hint index %d", hint_idx));
			cpu_mwait_deep_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
			++hint_idx;
		}
	}
	KASSERT(hint_idx == cpu_mwait_deep_hints_cnt,
	    ("mwait deep hint count %d != index %d",
	     cpu_mwait_deep_hints_cnt, hint_idx));

	if (bootverbose) {
		kprintf("MWAIT deep hints:\n");
		for (i = 0; i < cpu_mwait_deep_hints_cnt; ++i) {
			int hint = cpu_mwait_deep_hints[i];

			kprintf("  C%d/%d hint 0x%04x\n",
			    MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
			    hint);
		}
	}
	cpu_idle_repeat_max = 256 * cpu_mwait_deep_hints_cnt;

	for (i = 0; i < ncpus; ++i) {
		char name[16];

		ksnprintf(name, sizeof(name), "idle%d", i);
		SYSCTL_ADD_PROC(NULL,
		    SYSCTL_STATIC_CHILDREN(_machdep_mwait_CX), OID_AUTO,
		    name, (CTLTYPE_STRING | CTLFLAG_RW), &cpu_idle_stats[i],
		    0, cpu_mwait_cx_pcpu_idle_sysctl, "A", "");
	}
}

static void
cpu_finish(void *dummy __unused)
{
	cpu_setregs();
	cpu_mwait_attach();
}

static void
pic_finish(void *dummy __unused)
{
	/* Log ELCR information */
	elcr_dump();

	/* Log MPTABLE information */
	mptable_pci_int_dump();

	/* Finalize PCI */
	MachIntrABI.finalize();
}

/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
void
sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = lp->lwp_proc;
	struct trapframe *regs;
	struct sigacts *psp = p->p_sigacts;
	struct sigframe sf, *sfp;
	int oonstack;
	char *sp;

	regs = lp->lwp_md.md_regs;
	oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;

	/* Save user context */
	bzero(&sf, sizeof(struct sigframe));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = lp->lwp_sigstk;
	sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
	KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0);
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe));

	/* Make the size of the saved context visible to userland */
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);

	/* Allocate and validate space for the signal handler context. */
	if ((lp->lwp_flags & LWP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)(lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size -
			      sizeof(struct sigframe));
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	} else {
		/* We take red zone into account */
		sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
	}

	/*
	 * XXX AVX needs 64-byte alignment but sigframe has other fields and
	 * the embedded ucontext is not at the front, so aligning this won't
	 * help us.  Fortunately we bcopy in/out of the sigframe, so the
	 * kernel is ok.
	 *
	 * The problem though is if userland winds up trying to use the
	 * context directly.
	 */
	sfp = (struct sigframe *)((intptr_t)sp & ~(intptr_t)0xF);

	/* Translate the signal if appropriate */
	if (p->p_sysent->sv_sigtbl) {
		if (sig <= p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
	}

	/*
	 * Build the argument list for the signal handler.
	 *
	 * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
	 */
	regs->tf_rdi = sig;				/* argument 1 */
	regs->tf_rdx = (register_t)&sfp->sf_uc;		/* argument 3 */

	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/*
		 * Signal handler installed with SA_SIGINFO.
		 *
		 * action(signo, siginfo, ucontext)
		 */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* fill siginfo structure */
		sf.sf_si.si_signo = sig;
		sf.sf_si.si_code = code;
		sf.sf_si.si_addr = (void *)regs->tf_addr;
	} else {
		/*
		 * Old FreeBSD-style arguments.
		 *
		 * handler (signo, code, [uc], addr)
		 */
		regs->tf_rsi = (register_t)code;	/* argument 2 */
		regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
		sf.sf_ahu.sf_handler = catcher;
	}

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
#if 0 /* JG */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 =
			&lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}
#endif

	/*
	 * Save the FPU state and reinit the FP unit
	 */
	npxpush(&sf.sf_uc.uc_mcontext);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
		/*
		 * Something is wrong with the stack pointer.
		 * ...Kill the process.
		 */
		sigexit(lp, SIGILL);
	}

	regs->tf_rsp = (register_t)sfp;
	regs->tf_rip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);

	/*
	 * i386 abi specifies that the direction flag must be cleared
	 * on function entry
	 */
	regs->tf_rflags &= ~(PSL_T|PSL_D);

	/*
	 * 64 bit mode has a code and stack selector but
	 * no data or extra selector.  %fs and %gs are not
	 * stored in-context.
	 */
	regs->tf_cs = _ucodesel;
	regs->tf_ss = _udatasel;
	clear_quickret();
}

/*
 * Sanitize the trapframe for a virtual kernel passing control to a custom
 * VM context.  Remove any items that would otherwise create a privilege
 * issue.
 *
 * XXX at the moment we allow userland to set the resume flag.  Is this a
 * bad idea?
 */
int
cpu_sanitize_frame(struct trapframe *frame)
{
	frame->tf_cs = _ucodesel;
	frame->tf_ss = _udatasel;
	/* XXX VM (8086) mode not supported? */
	frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP);
	frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I;

	return(0);
}

/*
 * Sanitize the tls so loading the descriptor does not blow up
 * on us.  For x86_64 we don't have to do anything.
 */
int
cpu_sanitize_tls(struct savetls *tls)
{
	return(0);
}

/*
 * sigreturn(ucontext_t *sigcntxp)
 *
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)

int
sys_sigreturn(struct sigreturn_args *uap)
{
	struct lwp *lp = curthread->td_lwp;
	struct trapframe *regs;
	ucontext_t uc;
	ucontext_t *ucp;
	register_t rflags;
	int cs;
	int error;

	/*
	 * We have to copy the information into kernel space so userland
	 * can't modify it while we are sniffing it.
	 */
	regs = lp->lwp_md.md_regs;
	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error)
		return (error);
	ucp = &uc;
	rflags = ucp->uc_mcontext.mc_rflags;

	/* VM (8086) mode not supported */
	rflags &= ~PSL_VM_UNSUPP;

#if 0 /* JG */
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (lp->lwp_thread->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* go back to user mode if both flags are set */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(lp, SIGBUS, 0);

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = tf->tf_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
		tf->tf_gs = _udatasel;
	} else
#endif
	{
		/*
		 * Don't allow users to change privileged or reserved flags.
		 */
		/*
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
			kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags);
			return(EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
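		/*
		 * (CS_SECURE() above accepts only selectors whose requested
		 * privilege level is SEL_UPL, i.e. ring 3.)
		 */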
		cs = ucp->uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			kprintf("sigreturn: cs = 0x%x\n", cs);
			trapsignal(lp, SIGBUS, T_PROTFLT);
			return(EINVAL);
		}
		bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(struct trapframe));
	}

	/*
	 * Restore the FPU state from the frame
	 */
	crit_enter();
	npxpop(&ucp->uc_mcontext);

	if (ucp->uc_mcontext.mc_onstack & 1)
		lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
	else
		lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;

	lp->lwp_sigmask = ucp->uc_sigmask;
	SIG_CANTMASK(lp->lwp_sigmask);
	clear_quickret();
	crit_exit();
	return(EJUSTRETURN);
}

/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */
void
cpu_boot(int howto)
{
}

/*
 * Shutdown the CPU as much as possible
 */
void
cpu_halt(void)
{
	for (;;)
		__asm__ __volatile("hlt");
}

/*
 * cpu_idle() represents the idle LWKT.  You cannot return from this function
 * (unless you want to blow things up!).  Instead we look for runnable threads
 * and loop or halt as appropriate.  Giant is not held on entry to the thread.
 *
 * The main loop is entered with a critical section held, we must release
 * the critical section before doing anything else.  lwkt_switch() will
 * check for pending interrupts due to entering and exiting its own
 * critical section.
 *
 * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up.
 *	 However, there are cases where the idlethread will be entered with
 *	 the possibility that no IPI will occur and in such cases
 *	 lwkt_switch() sets TDF_IDLE_NOHLT.
 *
 * NOTE: cpu_idle_repeat determines how many entries into the idle thread
 *	 must occur before it starts using ACPI halt.
 *
 * NOTE: Value overridden in hammer_time().
 */
static int cpu_idle_hlt = 2;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_repeat, CTLFLAG_RW,
    &cpu_idle_repeat, 0, "Idle entries before acpi hlt");

SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_hltcnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_HALT, sysctl_cpu_idle_cnt, "Q", "Idle loop entry halts");
SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_spincnt, (CTLTYPE_QUAD | CTLFLAG_RW),
    0, CPU_IDLE_STAT_SPIN, sysctl_cpu_idle_cnt, "Q", "Idle loop entry spins");

static void
cpu_idle_default_hook(void)
{
	/*
	 * We must guarantee that hlt is exactly the instruction
	 * following the sti.
	 */
	__asm __volatile("sti; hlt");
}

/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = cpu_idle_default_hook;

static __inline int
cpu_mwait_cx_hint(struct cpu_idle_stat *stat)
{
	int hint, cx_idx;
	u_int idx;

	hint = stat->hint;
	if (hint >= 0)
		goto done;

	idx = (stat->repeat + stat->repeat_last + stat->repeat_delta) >>
	    cpu_mwait_repeat_shift;
	if (idx >= cpu_mwait_c1_hints_cnt) {
		/* Step up faster, once we walked through all C1 states */
		stat->repeat_delta += 1 << (cpu_mwait_repeat_shift + 1);
	}
	if (hint == CPU_MWAIT_HINT_AUTODEEP) {
		if (idx >= cpu_mwait_deep_hints_cnt)
			idx = cpu_mwait_deep_hints_cnt - 1;
		hint = cpu_mwait_deep_hints[idx];
	} else {
		if (idx >= cpu_mwait_hints_cnt)
			idx = cpu_mwait_hints_cnt - 1;
		hint = cpu_mwait_hints[idx];
	}
done:
	cx_idx = MWAIT_EAX_TO_CX(hint);
	if (cx_idx >= 0 && cx_idx < CPU_MWAIT_CX_MAX)
		stat->mwait_cx[cx_idx]++;
	return hint;
}

void
cpu_idle(void)
{
	globaldata_t gd = mycpu;
	struct cpu_idle_stat *stat = &cpu_idle_stats[gd->gd_cpuid];
	struct thread *td __debugvar = gd->gd_curthread;
	int reqflags;
	int quick;

	stat->repeat = stat->repeat_last = cpu_idle_repeat_max;

	crit_exit();
	KKASSERT(td->td_critcount == 0);

	for (;;) {
		/*
		 * See if there are any LWKTs ready to go.
		 */
		lwkt_switch();

		/*
		 * When halting inside a cli we must check for reqflags
		 * races, particularly [re]schedule requests.  Running
		 * splz() does the job.
		 *
		 * cpu_idle_hlt:
		 *	0	Never halt, just spin
		 *
		 *	1	Always use HLT (or MONITOR/MWAIT if avail).
		 *
		 *		Better default for modern (Haswell+) Intel
		 *		cpus.
		 *
		 *	2	Use HLT/MONITOR/MWAIT up to a point and then
		 *		use the ACPI halt (default).  This is a hybrid
		 *		approach.  See machdep.cpu_idle_repeat.
		 *
		 *		Better default for modern AMD cpus and older
		 *		Intel cpus.
		 *
		 *	3	Always use the ACPI halt.  This typically
		 *		eats the least amount of power but the cpu
		 *		will be slow waking up.  Slows down e.g.
		 *		compiles and other pipe/event oriented stuff.
		 *
		 *	4	Always use HLT.
		 *
		 * NOTE: Interrupts are enabled and we are not in a critical
		 *	 section.
		 *
		 * NOTE: Preemptions do not reset gd_idle_repeat.  Also we
		 *	 don't bother capping gd_idle_repeat, it is ok if
		 *	 it overflows.
		 */
		if (gd->gd_idle_repeat == 0) {
			stat->repeat = (stat->repeat + stat->repeat_last) >> 1;
			if (stat->repeat > cpu_idle_repeat_max)
				stat->repeat = cpu_idle_repeat_max;
			stat->repeat_last = 0;
			stat->repeat_delta = 0;
		}
		++stat->repeat_last;

		++gd->gd_idle_repeat;
		reqflags = gd->gd_reqflags;
		quick = (cpu_idle_hlt == 1) ||
			(cpu_idle_hlt < 3 &&
			 gd->gd_idle_repeat < cpu_idle_repeat);

		if (quick && (cpu_mi_feature & CPU_MI_MONITOR) &&
		    (reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
			splz(); /* XXX */
			cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
			    cpu_mwait_cx_hint(stat), 0);
			stat->halt++;
		} else if (cpu_idle_hlt) {
			__asm __volatile("cli");
			splz();
			if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
				if (quick)
					cpu_idle_default_hook();
				else
					cpu_idle_hook();
			}
			__asm __volatile("sti");
			stat->halt++;
		} else {
			splz();
			__asm __volatile("sti");
			stat->spin++;
		}
	}
}

/*
 * This routine is called if a spinlock has been held through the
 * exponential backoff period and is seriously contested.  On a real cpu
 * we let it spin.
 */
void
cpu_spinlock_contested(void)
{
	cpu_pause();
}

/*
 * Clear registers on exec
 */
void
exec_setregs(u_long entry, u_long stack, u_long ps_strings)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct pcb *pcb = td->td_pcb;
	struct trapframe *regs = lp->lwp_md.md_regs;

	/* was i386_user_cleanup() in NetBSD */
	user_ldt_free(pcb);

	clear_quickret();
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = entry;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; /* align the stack */
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_rbx = ps_strings;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0; /* JG set bit 10? */
		if (pcb == td->td_pcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		pcb->pcb_flags &= ~PCB_DBREGS;
	}

	/*
	 * Initialize the math emulator (if any) for the current process.
	 * Actually, just clear the bit that says that the emulator has
	 * been initialized.  Initialization is delayed until the process
	 * traps to the emulator (if it is done at all) mainly because
	 * emulators don't provide an entry point for initialization.
	 */
	pcb->pcb_flags &= ~FP_SOFTFP;

	/*
	 * NOTE: do not set CR0_TS here.  npxinit() must do it after clearing
	 *	 gd_npxthread.  Otherwise a preemptive interrupt thread
	 *	 may panic in npxdna().
	 */
	crit_enter();
	load_cr0(rcr0() | CR0_MP);

	/*
	 * NOTE: The MSR values must be correct so we can return to
	 *	 userland.  gd_user_fs/gs must be correct so the switch
	 *	 code knows what the current MSR values are.
	 */
	pcb->pcb_fsbase = 0;	/* Values loaded from PCB on switch */
	pcb->pcb_gsbase = 0;
	mdcpu->gd_user_fs = 0;	/* Cache of current MSR values */
	mdcpu->gd_user_gs = 0;
	wrmsr(MSR_FSBASE, 0);	/* Set MSR values for return to userland */
	wrmsr(MSR_KGSBASE, 0);

	/* Initialize the npx (if any) for the current process. */
	npxinit();
	crit_exit();

	pcb->pcb_ds = _udatasel;
	pcb->pcb_es = _udatasel;
	pcb->pcb_fs = _udatasel;
	pcb->pcb_gs = _udatasel;
}

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	cr0 |= CR0_NE;			/* Done by npxinit() */
	cr0 |= CR0_MP | CR0_TS;		/* Done at every execve() too. */
	cr0 |= CR0_WP | CR0_AM;
	load_cr0(cr0);
	load_gs(_udatasel);
}

static int
sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
{
	int error;
	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
		req);
	if (!error && req->newptr)
		resettodr();
	return (error);
}

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
	&adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
	CTLFLAG_RW, &disable_rtc_set, 0, "");

#if 0 /* JG */
SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
	CTLFLAG_RD, &bootinfo, bootinfo, "");
#endif

SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
	CTLFLAG_RW, &wall_cmos_clock, 0, "");

extern u_long bootdev;		/* not a cdev_t - encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
	CTLFLAG_RD, &bootdev, 0, "Boot device (not in cdev_t format)");

/*
 * Initialize 386 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

int _default_ldt;
struct user_segment_descriptor gdt[NGDT * MAXCPU];	/* global descriptor table */
struct gate_descriptor idt_arr[MAXCPU][NIDT];
#if 0 /* JG */
union descriptor ldt[NLDT];		/* local descriptor table */
#endif

/* table descriptors - used to load tables by cpu */
struct region_descriptor r_gdt;
struct region_descriptor r_idt_arr[MAXCPU];

/* JG proc0paddr is a virtual address */
void *proc0paddr;
/* JG alignment? */
char proc0paddr_buff[LWKT_THREAD_STACK];


/* software prototypes -- in more palatable form */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	0x0,			/* segment base address  */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0,			/* long */
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GCODE_SEL	1 Code Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_KPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	1,			/* long */
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GDATA_SEL	2 Data Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_KPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	1,			/* long */
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUCODE32_SEL	3 32 bit Code Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0,			/* long */
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUDATA_SEL	4 32/64 bit Data Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0,			/* long */
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUCODE_SEL	5 64 bit Code Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	1,			/* long */
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GPROC0_SEL	6 Proc 0 Tss Descriptor */
{
	0x0,			/* segment base address */
	sizeof(struct x86_64tss)-1,/* length - all address space */
	SDT_SYSTSS,		/* segment type */
	SEL_KPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0,			/* long */
	0,			/* unused - default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* Actually, the TSS is a system descriptor which is double size */
{	0x0,			/* segment base address  */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0,			/* long */
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GUGS32_SEL	8 32 bit GS Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0,			/* long */
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
};
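
/*
 * Note that in 64-bit mode the hardware ignores the base and limit of
 * the long-mode code and data descriptors above; effectively only the
 * type, DPL, present, and L bits matter.  The 32-bit user descriptors
 * keep real base/limit values for compatibility-mode execution, and the
 * user %fs/%gs bases are loaded from MSRs rather than from the GDT.
 */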

void
setidt_global(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	int cpu;

	for (cpu = 0; cpu < MAXCPU; ++cpu) {
		struct gate_descriptor *ip = &idt_arr[cpu][idx];

		ip->gd_looffset = (uintptr_t)func;
		ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
		ip->gd_ist = ist;
		ip->gd_xx = 0;
		ip->gd_type = typ;
		ip->gd_dpl = dpl;
		ip->gd_p = 1;
		ip->gd_hioffset = ((uintptr_t)func)>>16 ;
	}
}

void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist, int cpu)
{
	struct gate_descriptor *ip;

	KASSERT(cpu >= 0 && cpu < ncpus, ("invalid cpu %d", cpu));

	ip = &idt_arr[cpu][idx];
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
}

#define	IDTVEC(name)	__CONCAT(X,name)

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);

void
sdtossd(struct user_segment_descriptor *sd,
	struct soft_segment_descriptor *ssd)
{
	ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type = sd->sd_type;
	ssd->ssd_dpl = sd->sd_dpl;
	ssd->ssd_p = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran = sd->sd_gran;
}

void
ssdtosd(struct soft_segment_descriptor *ssd,
	struct user_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_long = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran = ssd->ssd_gran;
}

void
ssdtosyssd(struct soft_segment_descriptor *ssd,
	struct system_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_gran = ssd->ssd_gran;
}

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * If we cannot accurately determine the physical memory map, then use
 * the value from the 0xE801 call, and failing that, the RTC.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * Memory is aligned to PHYSMAP_ALIGN which must be a multiple
 * of PAGE_SIZE.  This also greatly reduces the memory test time
 * which would otherwise be excessive on machines with > 8G of ram.
 *
 * XXX first should be vm_paddr_t.
 */
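
/*
 * physmap[] is filled with {base, end} pairs.  As an illustrative,
 * made-up example, a machine with 640KB of base memory and 2GB of
 * extended memory might wind up with (after page 0 is masked off):
 *
 *	physmap[0] = 0x0000000000001000  physmap[1] = 0x00000000000a0000
 *	physmap[2] = 0x0000000000100000  physmap[3] = 0x0000000080000000
 */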

#define PHYSMAP_ALIGN		(vm_paddr_t)(128 * 1024)
#define PHYSMAP_ALIGN_MASK	(vm_paddr_t)(PHYSMAP_ALIGN - 1)
vm_paddr_t physmap[PHYSMAP_SIZE];
struct bios_smap *smapbase, *smap, *smapend;
struct efi_map_header *efihdrbase;
u_int32_t smapsize;

static void
add_smap_entries(int *physmap_idx)
{
	int i;

	smapsize = *((u_int32_t *)smapbase - 1);
	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			kprintf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (smap->length == 0)
			continue;

		for (i = 0; i <= *physmap_idx; i += 2) {
			if (smap->base < physmap[i + 1]) {
				if (boothowto & RB_VERBOSE) {
					kprintf("Overlapping or non-monotonic "
						"memory region, ignoring "
						"second region\n");
				}
				break;
			}
		}
		if (i <= *physmap_idx)
			continue;

		Realmem += smap->length;

		if (smap->base == physmap[*physmap_idx + 1]) {
			physmap[*physmap_idx + 1] += smap->length;
			continue;
		}

		*physmap_idx += 2;
		if (*physmap_idx == PHYSMAP_SIZE) {
			kprintf("Too many segments in the physical "
				"address map, giving up\n");
			break;
		}
		physmap[*physmap_idx] = smap->base;
		physmap[*physmap_idx + 1] = smap->base + smap->length;
	}
}

#define efi_next_descriptor(ptr, size) \
	((struct efi_md *)(((uint8_t *) ptr) + size))

static void
add_efi_map_entries(int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int i, ndesc;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
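	/*
	 * Note that the descriptors must be walked using the
	 * firmware-reported descriptor_size (see efi_next_descriptor()
	 * above) rather than sizeof(struct efi_md): the UEFI spec allows
	 * firmware to use descriptors larger than the structure we know.
	 */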
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdrbase + efisz);

	if (efihdrbase->descriptor_size == 0)
		return;
	ndesc = efihdrbase->memory_size / efihdrbase->descriptor_size;

	if (boothowto & RB_VERBOSE)
		kprintf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdrbase->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type <= EFI_MD_TYPE_PALCODE)
				type = types[p->md_type];
			else
				type = "<INVALID>";
			kprintf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				kprintf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				kprintf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				kprintf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				kprintf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				kprintf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				kprintf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				kprintf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				kprintf("XP ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				kprintf("RUNTIME");
			kprintf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		Realmem += p->md_pages * PAGE_SIZE;

		if (p->md_phys == physmap[*physmap_idx + 1]) {
			physmap[*physmap_idx + 1] += p->md_pages * PAGE_SIZE;
			continue;
		}

		*physmap_idx += 2;
		if (*physmap_idx == PHYSMAP_SIZE) {
			kprintf("Too many segments in the physical "
				"address map, giving up\n");
			break;
		}
		physmap[*physmap_idx] = p->md_phys;
		physmap[*physmap_idx + 1] = p->md_phys + p->md_pages * PAGE_SIZE;
	}
}

struct fb_info efi_fb_info;
static int have_efi_framebuffer = 0;

static void
efi_fb_init_vaddr(int direct_map)
{
	uint64_t sz;
	vm_offset_t addr, v;

	v = efi_fb_info.vaddr;
	sz = efi_fb_info.stride * efi_fb_info.height;

	if (direct_map) {
		addr = PHYS_TO_DMAP(efi_fb_info.paddr);
		if (addr >= DMAP_MIN_ADDRESS && addr + sz < DMAP_MAX_ADDRESS)
			efi_fb_info.vaddr = addr;
	} else {
		efi_fb_info.vaddr = (vm_offset_t)pmap_mapdev_attr(
		    efi_fb_info.paddr, sz, PAT_WRITE_COMBINING);
	}

	if (v == 0 && efi_fb_info.vaddr != 0)
		memset((void *)efi_fb_info.vaddr, 0x77, sz);
}

int
probe_efi_fb(int early)
{
	struct efi_fb *efifb;
	caddr_t kmdp;

	if (have_efi_framebuffer) {
		if (!early &&
		    (efi_fb_info.vaddr == 0 ||
		     efi_fb_info.vaddr == PHYS_TO_DMAP(efi_fb_info.paddr)))
			efi_fb_init_vaddr(0);
		return 0;
	}

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efifb = (struct efi_fb *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_FB);
	if (efifb == NULL)
		return 1;

	have_efi_framebuffer = 1;

	efi_fb_info.is_vga_boot_display = 1;
	efi_fb_info.width = efifb->fb_width;
	efi_fb_info.height = efifb->fb_height;
	efi_fb_info.stride = efifb->fb_stride * 4;
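	/*
	 * (fb_stride counts pixels per scan line while efi_fb_info.stride
	 * is in bytes, hence the *4 above for the 32bpp framebuffer
	 * recorded below.)
	 */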
	efi_fb_info.depth = 32;
	efi_fb_info.paddr = efifb->fb_addr;
	if (early) {
		efi_fb_info.vaddr = 0;
	} else {
		efi_fb_init_vaddr(0);
	}
	efi_fb_info.restore = NULL;
	efi_fb_info.device = NULL;

	return 0;
}

static void
efifb_startup(void *arg)
{
	probe_efi_fb(0);
}

SYSINIT(efi_fb_info, SI_BOOT1_POST, SI_ORDER_FIRST, efifb_startup, NULL);

static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int off, physmap_idx, pa_indx, da_indx;
	int i, j;
	vm_paddr_t pa;
	vm_paddr_t msgbuf_size;
	u_long physmem_tunable;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	/*
	 * get memory map from INT 15:E820, kindly supplied by the loader.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */
	efihdrbase = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL && efihdrbase == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdrbase == NULL)
		add_smap_entries(&physmap_idx);
	else
		add_efi_map_entries(&physmap_idx);

	base_memory = physmap[1] / 1024;
	/* make hole for AP bootstrap code */
	physmap[1] = mp_bootaddress(base_memory);

	/* Save EBDA address, if any */
	ebda_addr = (u_long)(*(u_short *)(KERNBASE + 0x40e));
	ebda_addr <<= 4;

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	/*
	 * Blowing out the DMAP will blow up the system.
	 */
	if (Maxmem > atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS)) {
		kprintf("Limiting Maxmem due to DMAP size\n");
		Maxmem = atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS);
	}

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE)) {
		kprintf("Physical memory use set to %ldK\n", Maxmem * 4);
	}

	/*
	 * Call pmap initialization to make new kernel address space
	 *
	 * Mask off page 0.
	 */
	pmap_bootstrap(&first);
	physmap[0] = PAGE_SIZE;

	/*
	 * Align the physmap to PHYSMAP_ALIGN and cut out anything
	 * exceeding Maxmem.
	 */
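	/*
	 * Worked example of the alignment below (128KB PHYSMAP_ALIGN):
	 * a segment {0x0000a123, 0x7fff0000} becomes
	 * {0x00020000, 0x7ffe0000}; bases round up, ends round down, and
	 * segments that become empty are squeezed out.
	 */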
	 */
	for (i = j = 0; i <= physmap_idx; i += 2) {
		if (physmap[i+1] > ptoa(Maxmem))
			physmap[i+1] = ptoa(Maxmem);
		physmap[i] = (physmap[i] + PHYSMAP_ALIGN_MASK) &
		    ~PHYSMAP_ALIGN_MASK;
		physmap[i+1] = physmap[i+1] & ~PHYSMAP_ALIGN_MASK;

		physmap[j] = physmap[i];
		physmap[j+1] = physmap[i+1];

		if (physmap[i] < physmap[i+1])
			j += 2;
	}
	physmap_idx = j - 2;

	/*
	 * Align anything else used in the validation loop.
	 */
	first = (first + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;

	/*
	 * Size up each available chunk of physical memory.
	 */
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    kgetenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * Validate the physical memory.  The physical memory segments
	 * have already been aligned to PHYSMAP_ALIGN which is a multiple
	 * of PAGE_SIZE.
	 */
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = physmap[i + 1];

		for (pa = physmap[i]; pa < end; pa += PHYSMAP_ALIGN) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= 0x200000 && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size) {
				goto do_dump_avail;
			}

			page_bad = FALSE;

			/*
			 * map page into kernel: valid, read/write,
			 * non-cacheable
			 */
			*pte = pa |
			    kernel_pmap.pmap_bits[PG_V_IDX] |
			    kernel_pmap.pmap_bits[PG_RW_IDX] |
			    kernel_pmap.pmap_bits[PG_N_IDX];
			cpu_invltlb();

			tmp = *ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			cpu_mfence();
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			cpu_mfence();
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			cpu_mfence();
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			cpu_mfence();
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*ptr = tmp;

			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that the ranges are half-open, i.e. "end"
			 * points one past the last valid byte, making the
			 * range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we are at or past the end, bump up
			 * Maxmem so that we keep going.  The first bad
			 * page will terminate the loop.
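			 *
			 * NOTE: phys_avail[] is kept as {start,end} pairs,
			 * so extending a run only bumps phys_avail[pa_indx];
			 * e.g. two disjoint good ranges end up as
			 * { s0, e0, s1, e1 } with pa_indx left pointing at
			 * the final end entry.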
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PHYSMAP_ALIGN;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					kprintf(
				"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;
				phys_avail[pa_indx] = pa + PHYSMAP_ALIGN;
			}
			physmem += PHYSMAP_ALIGN / PAGE_SIZE;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PHYSMAP_ALIGN;
			} else {
				da_indx++;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa;
				dump_avail[da_indx] = pa + PHYSMAP_ALIGN;
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	cpu_invltlb();

	/*
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	msgbuf_size = (MSGBUF_SIZE + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;

	while (phys_avail[pa_indx - 1] + PHYSMAP_ALIGN +
	    msgbuf_size >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= msgbuf_size;

	avail_end = phys_avail[pa_indx];

	/* Map the message buffer. */
	for (off = 0; off < msgbuf_size; off += PAGE_SIZE) {
		pmap_kenter((vm_offset_t)msgbufp + off,
		    phys_avail[pa_indx] + off);
	}
	/* Try to get EFI framebuffer working as early as possible */
	if (have_efi_framebuffer)
		efi_fb_init_vaddr(1);
}

struct machintr_abi MachIntrABI;

/*
 * IDT VECTORS:
 *	0	Divide by zero
 *	1	Debug
 *	2	NMI
 *	3	BreakPoint
 *	4	OverFlow
 *	5	Bound-Range
 *	6	Invalid OpCode
 *	7	Device Not Available (x87)
 *	8	Double-Fault
 *	9	Coprocessor Segment overrun (unsupported, reserved)
 *	10	Invalid-TSS
 *	11	Segment not present
 *	12	Stack
 *	13	General Protection
 *	14	Page Fault
 *	15	Reserved
 *	16	x87 FP Exception pending
 *	17	Alignment Check
 *	18	Machine Check
 *	19	SIMD floating point
 *	20-31	reserved
 *	32-255	INTn/external sources
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x, cpu;
#if 0 /* JG */
	int metadata_missing, off;
#endif
	struct mdglobaldata *gd;
	u_int64_t msr;

	/*
	 * Prevent lowering of the ipl if we call tsleep() early.
	 */
	gd = &CPU_prvspace[0]->mdglobaldata;
	bzero(gd, sizeof(*gd));

	/*
	 * Note: on both UP and SMP curthread must be set non-NULL
	 * early in the boot sequence because the system assumes
	 * that 'curthread' is never NULL.
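	 * (e.g. the LWKT code and the critical-section macros are
	 * presumed to dereference curthread unconditionally, so even
	 * the pre-scheduler boot path must see a valid thread pointer.)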
2142 */ 2143 2144 gd->mi.gd_curthread = &thread0; 2145 thread0.td_gd = &gd->mi; 2146 2147 atdevbase = ISA_HOLE_START + PTOV_OFFSET; 2148 2149 #if 0 /* JG */ 2150 metadata_missing = 0; 2151 if (bootinfo.bi_modulep) { 2152 preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; 2153 preload_bootstrap_relocate(KERNBASE); 2154 } else { 2155 metadata_missing = 1; 2156 } 2157 if (bootinfo.bi_envp) 2158 kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; 2159 #endif 2160 2161 preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET); 2162 preload_bootstrap_relocate(PTOV_OFFSET); 2163 kmdp = preload_search_by_type("elf kernel"); 2164 if (kmdp == NULL) 2165 kmdp = preload_search_by_type("elf64 kernel"); 2166 boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); 2167 kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET; 2168 #ifdef DDB 2169 ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); 2170 ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); 2171 #endif 2172 2173 if (boothowto & RB_VERBOSE) 2174 bootverbose++; 2175 2176 /* 2177 * Default MachIntrABI to ICU 2178 */ 2179 MachIntrABI = MachIntrABI_ICU; 2180 2181 /* 2182 * start with one cpu. Note: with one cpu, ncpus2_shift, ncpus2_mask, 2183 * and ncpus_fit_mask remain 0. 2184 */ 2185 ncpus = 1; 2186 ncpus2 = 1; 2187 ncpus_fit = 1; 2188 /* Init basic tunables, hz etc */ 2189 init_param1(); 2190 2191 /* 2192 * make gdt memory segments 2193 */ 2194 gdt_segs[GPROC0_SEL].ssd_base = 2195 (uintptr_t) &CPU_prvspace[0]->mdglobaldata.gd_common_tss; 2196 2197 gd->mi.gd_prvspace = CPU_prvspace[0]; 2198 2199 for (x = 0; x < NGDT; x++) { 2200 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1)) 2201 ssdtosd(&gdt_segs[x], &gdt[x]); 2202 } 2203 ssdtosyssd(&gdt_segs[GPROC0_SEL], 2204 (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); 2205 2206 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; 2207 r_gdt.rd_base = (long) gdt; 2208 lgdt(&r_gdt); 2209 2210 wrmsr(MSR_FSBASE, 0); /* User value */ 2211 wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi); 2212 wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */ 2213 2214 mi_gdinit(&gd->mi, 0); 2215 cpu_gdinit(gd, 0); 2216 proc0paddr = proc0paddr_buff; 2217 mi_proc0init(&gd->mi, proc0paddr); 2218 safepri = TDPRI_MAX; 2219 2220 /* spinlocks and the BGL */ 2221 init_locks(); 2222 2223 /* exceptions */ 2224 for (x = 0; x < NIDT; x++) 2225 setidt_global(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); 2226 setidt_global(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0); 2227 setidt_global(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0); 2228 setidt_global(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1); 2229 setidt_global(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0); 2230 setidt_global(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0); 2231 setidt_global(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0); 2232 setidt_global(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0); 2233 setidt_global(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0); 2234 setidt_global(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1); 2235 setidt_global(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0); 2236 setidt_global(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0); 2237 setidt_global(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0); 2238 setidt_global(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0); 2239 setidt_global(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0); 2240 setidt_global(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0); 2241 setidt_global(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0); 2242 setidt_global(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0); 
	setidt_global(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0);
	setidt_global(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);

	for (cpu = 0; cpu < MAXCPU; ++cpu) {
		r_idt_arr[cpu].rd_limit = sizeof(idt_arr[cpu]) - 1;
		r_idt_arr[cpu].rd_base = (long) &idt_arr[cpu][0];
	}

	lidt(&r_idt_arr[0]);

	/*
	 * Initialize the console before we print anything out.
	 */
	cninit();

#if 0 /* JG */
	if (metadata_missing)
		kprintf("WARNING: loader(8) metadata is missing!\n");
#endif

#if NISA > 0
	elcr_probe();
	isa_defaultirq();
#endif
	rand_initialize();

	/*
	 * Initialize IRQ mapping
	 *
	 * NOTE:
	 * SHOULD be after elcr_probe()
	 */
	MachIntrABI_ICU.initmap();
	MachIntrABI_IOAPIC.initmap();

#ifdef DDB
	kdb_init();
	if (boothowto & RB_KDB)
		Debugger("Boot flags requested debugger");
#endif

#if 0 /* JG */
	finishidentcpu();	/* Final stage of CPU initialization */
	setidt(6, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
	setidt(13, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
#endif
	identify_cpu();		/* Final stage of CPU initialization */
	initializecpu(0);	/* Initialize CPU registers */

	/*
	 * On modern Intel cpus, Haswell or later, cpu_idle_hlt=1 is better
	 * because the cpu does significant power management in HLT
	 * (also suggested is to set sysctl machdep.mwait.CX.idle=AUTODEEP).
	 *
	 * On modern AMD cpus or on any older AMD or Intel cpu,
	 * cpu_idle_hlt=2 is better because ACPI is needed to reduce power
	 * consumption.
	 */
	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    CPUID_TO_MODEL(cpu_id) >= 0x3C) {	/* Haswell or later */
		cpu_idle_hlt = 1;
	}

	TUNABLE_INT_FETCH("hw.apic_io_enable", &ioapic_enable); /* for compat */
	TUNABLE_INT_FETCH("hw.ioapic_enable", &ioapic_enable);
	TUNABLE_INT_FETCH("hw.lapic_enable", &lapic_enable);
	TUNABLE_INT_FETCH("machdep.cpu_idle_hlt", &cpu_idle_hlt);

	/*
	 * Some of the virtual machines do not work w/ I/O APIC
	 * enabled.  If the user does not explicitly enable or
	 * disable the I/O APIC (ioapic_enable < 0), then we
	 * disable the I/O APIC on all virtual machines.
	 *
	 * NOTE:
	 * This must be done after identify_cpu(), which sets
	 * 'cpu_feature2'
	 */
	if (ioapic_enable < 0) {
		if (cpu_feature2 & CPUID2_VMM)
			ioapic_enable = 0;
		else
			ioapic_enable = 1;
	}

	/*
	 * make an initial tss so cpu can get interrupt stack on syscall!
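	 *
	 * NOTE: on x86_64 the CPU loads tss_rsp0 into %rsp on a
	 * CPL 3 -> CPL 0 interrupt or exception transition, so it is
	 * pointed just below the pcb at the top of thread0's kernel
	 * stack and then aligned below.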
	 */
	gd->gd_common_tss.tss_rsp0 =
	    (register_t)(thread0.td_kstack +
	    KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb));
	/* Ensure the stack is aligned to 16 bytes */
	gd->gd_common_tss.tss_rsp0 &= ~(register_t)0xF;

	/* double fault stack */
	gd->gd_common_tss.tss_ist1 =
	    (long)&gd->mi.gd_prvspace->idlestack[
	    sizeof(gd->mi.gd_prvspace->idlestack)];

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	gd->gd_common_tss.tss_iobase = sizeof(struct x86_64tss);

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	gd->gd_tss_gdt = &gdt[GPROC0_SEL];
	gd->gd_common_tssd = *gd->gd_tss_gdt;
	ltr(gsel_tss);

	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL);

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

	/* Map the message buffer. */
#if 0 /* JG */
	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
#endif

	msgbufinit(msgbufp, MSGBUF_SIZE);

	/* transfer to user mode */
	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_udatasel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_pcb->pcb_cr3 = KPML4phys;
	thread0.td_pcb->pcb_ext = NULL;
	lwp0.lwp_md.md_regs = &proc0_tf;	/* XXX needed? */

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}

/*
 * Initialize machine-dependent portions of the global data structure.
 * Note that the global data area and cpu0's idlestack in the private
 * data space were allocated in locore.
 *
 * Note: the idlethread's cpl is 0
 *
 * WARNING!  Called from early boot, 'mycpu' may not work yet.
 */
void
cpu_gdinit(struct mdglobaldata *gd, int cpu)
{
	if (cpu)
		gd->mi.gd_curthread = &gd->mi.gd_idlethread;

	lwkt_init_thread(&gd->mi.gd_idlethread,
	    gd->mi.gd_prvspace->idlestack,
	    sizeof(gd->mi.gd_prvspace->idlestack),
	    0, &gd->mi);
	lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
	gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
	gd->mi.gd_idlethread.td_sp -= sizeof(void *);
	*(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
}

/*
 * We only have to check for DMAP bounds, the globaldata space is
 * actually part of the kernel_map so we don't have to waste time
 * checking CPU_prvspace[*].
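 *
 * i.e. a [saddr, eaddr) range is treated as globaldata space iff it
 * lies entirely inside the direct-map window checked below.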
 */
int
is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
{
#if 0
	if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
	    eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
		return (TRUE);
	}
#endif
	if (saddr >= DMAP_MIN_ADDRESS && eaddr <= DMAP_MAX_ADDRESS)
		return (TRUE);
	return (FALSE);
}

struct globaldata *
globaldata_find(int cpu)
{
	KKASSERT(cpu >= 0 && cpu < ncpus);
	return(&CPU_prvspace[cpu]->mdglobaldata.mi);
}

/*
 * This path should be safe from the SYSRET issue because only stopped threads
 * can have their %rip adjusted this way (and all heavy weight thread switches
 * clear QUICKREF and thus do not use SYSRET).  However, the code path is
 * convoluted so add a safety by forcing %rip to be canonical.
 */
int
ptrace_set_pc(struct lwp *lp, unsigned long addr)
{
	if (addr & 0x0000800000000000LLU)
		lp->lwp_md.md_regs->tf_rip = addr | 0xFFFF000000000000LLU;
	else
		lp->lwp_md.md_regs->tf_rip = addr & 0x0000FFFFFFFFFFFFLLU;
	return (0);
}

int
ptrace_single_step(struct lwp *lp)
{
	lp->lwp_md.md_regs->tf_rflags |= PSL_T;
	return (0);
}

int
fill_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	if ((tp = lp->lwp_md.md_regs) == NULL)
		return EINVAL;
	bcopy(&tp->tf_rdi, &regs->r_rdi, sizeof(*regs));
	return (0);
}

int
set_regs(struct lwp *lp, struct reg *regs)
{
	struct trapframe *tp;

	tp = lp->lwp_md.md_regs;
	if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	bcopy(&regs->r_rdi, &tp->tf_rdi, sizeof(*regs));
	clear_quickret();
	return (0);
}

static void
fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_87->en_cw = penv_xmm->en_cw;
	penv_87->en_sw = penv_xmm->en_sw;
	penv_87->en_tw = penv_xmm->en_tw;
	penv_87->en_fip = penv_xmm->en_fip;
	penv_87->en_fcs = penv_xmm->en_fcs;
	penv_87->en_opcode = penv_xmm->en_opcode;
	penv_87->en_foo = penv_xmm->en_foo;
	penv_87->en_fos = penv_xmm->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
}

static void
set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
{
	struct env87 *penv_87 = &sv_87->sv_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_xmm->en_cw = penv_87->en_cw;
	penv_xmm->en_sw = penv_87->en_sw;
	penv_xmm->en_tw = penv_87->en_tw;
	penv_xmm->en_fip = penv_87->en_fip;
	penv_xmm->en_fcs = penv_87->en_fcs;
	penv_xmm->en_opcode = penv_87->en_opcode;
	penv_xmm->en_foo = penv_87->en_foo;
	penv_xmm->en_fos = penv_87->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
}

int
fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
	if (lp->lwp_thread == NULL || lp->lwp_thread->td_pcb == NULL)
		return EINVAL;
	if (cpu_fxsr) {
		fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
		    (struct save87 *)fpregs);
		return (0);
	}
	bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
	return (0);
}

int
set_fpregs(struct lwp *lp, struct fpreg *fpregs)
{
	if (cpu_fxsr) {
		set_fpregs_xmm((struct save87 *)fpregs,
		    &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
		return (0);
	}
	bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
	return (0);
}

int
fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (lp == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[4] = rdr4();
		dbregs->dr[5] = rdr5();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
		return (0);
	}
	if (lp->lwp_thread == NULL || (pcb = lp->lwp_thread->td_pcb) == NULL)
		return EINVAL;
	dbregs->dr[0] = pcb->pcb_dr0;
	dbregs->dr[1] = pcb->pcb_dr1;
	dbregs->dr[2] = pcb->pcb_dr2;
	dbregs->dr[3] = pcb->pcb_dr3;
	dbregs->dr[4] = 0;
	dbregs->dr[5] = 0;
	dbregs->dr[6] = pcb->pcb_dr6;
	dbregs->dr[7] = pcb->pcb_dr7;
	return (0);
}

int
set_dbregs(struct lwp *lp, struct dbreg *dbregs)
{
	if (lp == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr4(dbregs->dr[4]);
		load_dr5(dbregs->dr[5]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		struct pcb *pcb;
		struct ucred *ucred;
		int i;
		uint64_t mask1, mask2;

		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an
		 * unexpected TRCTRAP.
		 */
		/* JG this loop looks unreadable */
		/*
		 * Check 4 2-bit fields for invalid patterns.
		 * These fields are R/Wi, for i = 0..3
		 */
		/* Is 10 in LENi allowed when running in compatibility mode? */
		/*
		 * Pattern 10 in R/Wi might be used to indicate a
		 * breakpoint on I/O.  Further analysis should be
		 * carried out to decide if it is safe and useful to
		 * provide access to that capability.
		 */
		for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4;
		     i++, mask1 <<= 4, mask2 <<= 4)
			if ((dbregs->dr[7] & mask1) == mask2)
				return (EINVAL);

		pcb = lp->lwp_thread->td_pcb;
		ucred = lp->lwp_proc->p_ucred;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * the process's address space, unless, perhaps, we were
		 * called by uid 0.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
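		 *
		 * NOTE: the R/Wi fields checked above live at dr7 bits
		 * 16-17, 20-21, 24-25 and 28-29; the loop's mask1/mask2
		 * pair walks those four 2-bit fields, rejecting the "10"
		 * (I/O breakpoint) pattern in each.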
		 */
		if (priv_check_cred(ucred, PRIV_ROOT, 0) != 0) {
			if (dbregs->dr[7] & 0x3) {
				/* dr0 is enabled */
				if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<2)) {
				/* dr1 is enabled */
				if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<4)) {
				/* dr2 is enabled */
				if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<6)) {
				/* dr3 is enabled */
				if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS)
					return (EINVAL);
			}
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		pcb->pcb_flags |= PCB_DBREGS;
	}

	return (0);
}

/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(void)
{
	u_int64_t dr7, dr6;	/* debug registers dr6 and dr7 */
	u_int64_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0xff) == 0) {
		/*
		 * none of the breakpoint enable bits (L0-L3, G0-G3) in
		 * the dr7 register are set, thus the trap couldn't have
		 * been caused by the hardware debug registers
		 */
		return 0;
	}

	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0xf;

	if (bp == 0) {
		/*
		 * None of the breakpoint bits are set, meaning this
		 * trap was not caused by any of the debug registers
		 */
		return 0;
	}

	/*
	 * at least one of the breakpoints was hit, check to see
	 * which ones and if any of them are user space addresses
	 */

	if (bp & 0x01) {
		addr[nbp++] = (caddr_t)rdr0();
	}
	if (bp & 0x02) {
		addr[nbp++] = (caddr_t)rdr1();
	}
	if (bp & 0x04) {
		addr[nbp++] = (caddr_t)rdr2();
	}
	if (bp & 0x08) {
		addr[nbp++] = (caddr_t)rdr3();
	}

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) {
			/*
			 * addr[i] is in user space
			 */
			return nbp;
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return 0;
}

#ifndef DDB
void
Debugger(const char *msg)
{
	kprintf("Debugger(\"%s\") called.\n", msg);
}
#endif /* no DDB */

#ifdef DDB

/*
 * Provide inb() and outb() as functions.  They are normally only
 * available as macros calling inlined functions, thus cannot be
 * called inside DDB.
 *
 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
 */

#undef inb
#undef outb

/* silence compiler warnings */
u_char inb(u_int);
void outb(u_int, u_char);

u_char
inb(u_int port)
{
	u_char data;
	/*
	 * We use %%dx and not %1 here because i/o is done at %dx and not at
	 * %edx, while gcc generates inferior code (movw instead of movl)
	 * if we tell it to load (u_short) port.
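	 *
	 * NOTE: the "d" constraint pins the port number in %dx/%edx and
	 * "=a" returns the byte in %al, matching the inb instruction's
	 * fixed register usage.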
	 */
	__asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
	return (data);
}

void
outb(u_int port, u_char data)
{
	u_char al;
	/*
	 * Use an unnecessary assignment to help gcc's register allocator.
	 * This makes a large difference for gcc-1.40 and a tiny difference
	 * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
	 * best results.  gcc-2.6.0 can't handle this.
	 */
	al = data;
	__asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
}

#endif /* DDB */

/*
 * initialize all the SMP locks
 */

/* critical region when masking or unmasking interrupts */
struct spinlock_deprecated imen_spinlock;

/* lock region used by kernel profiling */
struct spinlock_deprecated mcount_spinlock;

/* locks com (tty) data/hardware accesses: a FASTINTR() */
struct spinlock_deprecated com_spinlock;

/* lock regions around the clock hardware */
struct spinlock_deprecated clock_spinlock;

static void
init_locks(void)
{
	/*
	 * Get the initial mplock with a count of 1 for the BSP.
	 * This uses a LOGICAL cpu ID, i.e. BSP == 0.
	 */
	cpu_get_initial_mplock();
	/* DEPRECATED */
	spin_init_deprecated(&mcount_spinlock);
	spin_init_deprecated(&imen_spinlock);
	spin_init_deprecated(&com_spinlock);
	spin_init_deprecated(&clock_spinlock);

	/* our token pool needs to work early */
	lwkt_token_pool_init();
}

boolean_t
cpu_mwait_hint_valid(uint32_t hint)
{
	int cx_idx, sub;

	cx_idx = MWAIT_EAX_TO_CX(hint);
	if (cx_idx >= CPU_MWAIT_CX_MAX)
		return FALSE;

	sub = MWAIT_EAX_TO_CX_SUB(hint);
	if (sub >= cpu_mwait_cx_info[cx_idx].subcnt)
		return FALSE;

	return TRUE;
}

void
cpu_mwait_cx_no_bmsts(void)
{
	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_STS);
}

void
cpu_mwait_cx_no_bmarb(void)
{
	atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_ARB);
}

static int
cpu_mwait_cx_hint2name(int hint, char *name, int namelen, boolean_t allow_auto)
{
	int old_cx_idx, sub = 0;

	if (hint >= 0) {
		old_cx_idx = MWAIT_EAX_TO_CX(hint);
		sub = MWAIT_EAX_TO_CX_SUB(hint);
	} else if (hint == CPU_MWAIT_HINT_AUTO) {
		old_cx_idx = allow_auto ? CPU_MWAIT_C2 : CPU_MWAIT_CX_MAX;
	} else if (hint == CPU_MWAIT_HINT_AUTODEEP) {
		old_cx_idx = allow_auto ? CPU_MWAIT_C3 : CPU_MWAIT_CX_MAX;
	} else {
		old_cx_idx = CPU_MWAIT_CX_MAX;
	}

	if (!CPU_MWAIT_HAS_CX)
		strlcpy(name, "NONE", namelen);
	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTO)
		strlcpy(name, "AUTO", namelen);
	else if (allow_auto && hint == CPU_MWAIT_HINT_AUTODEEP)
		strlcpy(name, "AUTODEEP", namelen);
	else if (old_cx_idx >= CPU_MWAIT_CX_MAX ||
	    sub >= cpu_mwait_cx_info[old_cx_idx].subcnt)
		strlcpy(name, "INVALID", namelen);
	else
		ksnprintf(name, namelen, "C%d/%d", old_cx_idx, sub);

	return old_cx_idx;
}

static int
cpu_mwait_cx_name2hint(char *name, int *hint0, boolean_t allow_auto)
{
	int cx_idx, sub, hint;
	char *ptr, *start;

	if (allow_auto && strcmp(name, "AUTO") == 0) {
		hint = CPU_MWAIT_HINT_AUTO;
		cx_idx = CPU_MWAIT_C2;
		goto done;
	}
	if (allow_auto && strcmp(name, "AUTODEEP") == 0) {
		hint = CPU_MWAIT_HINT_AUTODEEP;
		cx_idx = CPU_MWAIT_C3;
		goto done;
	}

	if (strlen(name) < 4 || toupper(name[0]) != 'C')
		return -1;
	start = &name[1];
	ptr = NULL;

	cx_idx = strtol(start, &ptr, 10);
	if (ptr == start || *ptr != '/')
		return -1;
	if (cx_idx < 0 || cx_idx >= CPU_MWAIT_CX_MAX)
		return -1;

	start = ptr + 1;
	ptr = NULL;

	sub = strtol(start, &ptr, 10);
	if (*ptr != '\0')
		return -1;
	if (sub < 0 || sub >= cpu_mwait_cx_info[cx_idx].subcnt)
		return -1;

	hint = MWAIT_EAX_HINT(cx_idx, sub);
done:
	*hint0 = hint;
	return cx_idx;
}

static int
cpu_mwait_cx_transit(int old_cx_idx, int cx_idx)
{
	if (cx_idx >= CPU_MWAIT_C3 && cpu_mwait_c3_preamble)
		return EOPNOTSUPP;
	if (old_cx_idx < CPU_MWAIT_C3 && cx_idx >= CPU_MWAIT_C3) {
		int error;

		error = cputimer_intr_powersave_addreq();
		if (error)
			return error;
	} else if (old_cx_idx >= CPU_MWAIT_C3 && cx_idx < CPU_MWAIT_C3) {
		cputimer_intr_powersave_remreq();
	}
	return 0;
}

static int
cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS, int *hint0,
    boolean_t allow_auto)
{
	int error, cx_idx, old_cx_idx, hint;
	char name[CPU_MWAIT_CX_NAMELEN];

	hint = *hint0;
	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name),
	    allow_auto);

	error = sysctl_handle_string(oidp, name, sizeof(name), req);
	if (error != 0 || req->newptr == NULL)
		return error;

	if (!CPU_MWAIT_HAS_CX)
		return EOPNOTSUPP;

	cx_idx = cpu_mwait_cx_name2hint(name, &hint, allow_auto);
	if (cx_idx < 0)
		return EINVAL;

	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
	if (error)
		return error;

	*hint0 = hint;
	return 0;
}

static int
cpu_mwait_cx_setname(struct cpu_idle_stat *stat, const char *cx_name)
{
	int error, cx_idx, old_cx_idx, hint;
	char name[CPU_MWAIT_CX_NAMELEN];

	KASSERT(CPU_MWAIT_HAS_CX, ("cpu does not support mwait CX extension"));

	hint = stat->hint;
	old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

	strlcpy(name, cx_name, sizeof(name));
	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
	if (cx_idx < 0)
		return EINVAL;

	error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
	if (error)
		return error;

	stat->hint = hint;
	return 0;
}

static int
cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	int hint = cpu_mwait_halt_global;
	int error, cx_idx, cpu;
	char name[CPU_MWAIT_CX_NAMELEN], cx_name[CPU_MWAIT_CX_NAMELEN];

	cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);

	error = sysctl_handle_string(oidp, name, sizeof(name), req);
	if (error != 0 || req->newptr == NULL)
		return error;

	if (!CPU_MWAIT_HAS_CX)
		return EOPNOTSUPP;

	/* Save name for later per-cpu CX configuration */
	strlcpy(cx_name, name, sizeof(cx_name));

	cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
	if (cx_idx < 0)
		return EINVAL;

	/* Change per-cpu CX configuration */
	for (cpu = 0; cpu < ncpus; ++cpu) {
		error = cpu_mwait_cx_setname(&cpu_idle_stats[cpu], cx_name);
		if (error)
			return error;
	}

	cpu_mwait_halt_global = hint;
	return 0;
}

static int
cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct cpu_idle_stat *stat = arg1;
	int error;

	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
	    &stat->hint, TRUE);
	return error;
}

static int
cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
	    &cpu_mwait_spin, FALSE);
	return error;
}
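/*
 * Illustrative, non-compiled sketch of how the Cx naming helpers above
 * round-trip.  "C2/1" is an example value only; whether sub-state 1 of
 * C2 is accepted depends on cpu_mwait_cx_info[2].subcnt as reported by
 * the CPU.
 */
#if 0 /* example */
static void
cpu_mwait_cx_example(void)
{
	char name[CPU_MWAIT_CX_NAMELEN];
	int hint = -2;		/* arbitrary initial value */

	ksnprintf(name, sizeof(name), "C2/1");
	if (cpu_mwait_cx_name2hint(name, &hint, TRUE) == 2) {
		/* hint now encodes MWAIT_EAX_HINT(2, 1) */
		cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);
		kprintf("parsed back as %s\n", name);	/* "C2/1" */
	}
}
#endif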