/*-
 * Copyright (c) 1996, by Steve Passe
 * Copyright (c) 2003, by Peter Wemm
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef __i386__
#include "opt_apic.h"
#endif
#include "opt_cpu.h"
#include "opt_kstack_pages.h"
#include "opt_pmap.h"
#include "opt_sched.h"
#include "opt_smp.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cons.h>	/* cngetc() */
#include <sys/cpuset.h>
#ifdef GPROF
#include <sys/gmon.h>
#endif
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

#include <x86/apicreg.h>
#include <machine/clock.h>
#include <machine/cputypes.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/cpu.h>

#define WARMBOOT_TARGET		0
#define WARMBOOT_OFF		(KERNBASE + 0x0467)
#define WARMBOOT_SEG		(KERNBASE + 0x0469)

#define CMOS_REG		(0x70)
#define CMOS_DATA		(0x71)
#define BIOS_RESET		(0x0f)
#define BIOS_WARM		(0x0a)
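
/*
 * Illustrative note (an editorial assumption, not text from the MP
 * spec): writing BIOS_WARM into CMOS register BIOS_RESET selects a
 * warm boot, after which legacy BIOSes jump through the real-mode
 * vector stored at 0x40:0x67 (WARMBOOT_OFF/WARMBOOT_SEG above), so
 * AP startup code can point that vector at its trampoline for CPUs
 * that come out of INIT through the BIOS.
 */
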
/* lock region used by kernel profiling */
int	mcount_lock;

int	mp_naps;		/* # of application processors (APs) */
int	boot_cpu_id = -1;	/* designated BSP */

extern	struct pcpu __pcpu[];

/* AP uses this during bootstrap.  Do not staticize. */
char *bootSTK;
int bootAP;

/* Free these after use */
void *bootstacks[MAXCPU];
void *dpcpu;

struct pcb stoppcbs[MAXCPU];
struct susppcb **susppcbs;

#ifdef COUNT_IPIS
/* Interrupt counts. */
static u_long *ipi_preempt_counts[MAXCPU];
static u_long *ipi_ast_counts[MAXCPU];
u_long *ipi_invltlb_counts[MAXCPU];
u_long *ipi_invlrng_counts[MAXCPU];
u_long *ipi_invlpg_counts[MAXCPU];
u_long *ipi_invlcache_counts[MAXCPU];
u_long *ipi_rendezvous_counts[MAXCPU];
static u_long *ipi_hardclock_counts[MAXCPU];
#endif

/* Default cpu_ops implementation. */
struct cpu_ops cpu_ops;

/*
 * Local data and functions.
 */

static volatile cpuset_t ipi_stop_nmi_pending;

/* used to hold the APs until we are ready to release them */
struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually set up
 * the APs.
 */
struct cpu_info cpu_info[MAX_APIC_ID + 1];
int cpu_apic_ids[MAXCPU];
int apic_cpuids[MAX_APIC_ID + 1];

/* Holds pending bitmap-based IPIs per CPU */
volatile u_int cpu_ipi_pending[MAXCPU];

int cpu_logical;		/* logical CPUs per core */
int cpu_cores;			/* cores per package */

static void	release_aps(void *dummy);

static u_int	hyperthreading_cpus;	/* logical CPUs sharing the L1 cache */
static int	hyperthreading_allowed = 1;

void
mem_range_AP_init(void)
{

	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
		mem_range_softc.mr_op->initAP(&mem_range_softc);
}

static void
topo_probe_amd(void)
{
	int core_id_bits;
	int id;

	/* AMD processors do not support HTT. */
	cpu_logical = 1;

	if ((amd_feature2 & AMDID2_CMP) == 0) {
		cpu_cores = 1;
		return;
	}

	core_id_bits = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
	    AMDID_COREID_SIZE_SHIFT;
	if (core_id_bits == 0) {
		cpu_cores = (cpu_procinfo2 & AMDID_CMP_CORES) + 1;
		return;
	}

	/* Fam 10h and newer should get here. */
	for (id = 0; id <= MAX_APIC_ID; id++) {
		/* Check logical CPU availability. */
		if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled)
			continue;
		/* Check if logical CPU has the same package ID. */
		if ((id >> core_id_bits) != (boot_cpu_id >> core_id_bits))
			continue;
		cpu_cores++;
	}
}

/*
 * Round up to the next power of two, if necessary, and then
 * take log2.
 * Returns -1 if argument is zero.
 */
static __inline int
mask_width(u_int x)
{

	return (fls(x << (1 - powerof2(x))) - 1);
}
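
/*
 * Worked example (editorial, for illustration): mask_width(4) is
 * fls(4) - 1 = 2, i.e. log2(4); mask_width(6) first shifts left once
 * because 6 is not a power of two, giving fls(12) - 1 = 3, i.e.
 * ceil(log2(6)); mask_width(0) is fls(0) - 1 = -1.
 */
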
static void
topo_probe_0x4(void)
{
	u_int p[4];
	int pkg_id_bits;
	int core_id_bits;
	int max_cores;
	int max_logical;
	int id;

	/* Both zero and one here mean one logical processor per package. */
	max_logical = (cpu_feature & CPUID_HTT) != 0 ?
	    (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
	if (max_logical <= 1)
		return;

	/*
	 * Because of the uniformity assumption we examine only
	 * those logical processors that belong to the same
	 * package as the BSP.  Further, we count the number of
	 * logical processors that belong to the same core
	 * as the BSP, thus deducing the number of threads per core.
	 */
	if (cpu_high >= 0x4) {
		cpuid_count(0x04, 0, p);
		max_cores = ((p[0] >> 26) & 0x3f) + 1;
	} else
		max_cores = 1;
	core_id_bits = mask_width(max_logical/max_cores);
	if (core_id_bits < 0)
		return;
	pkg_id_bits = core_id_bits + mask_width(max_cores);

	for (id = 0; id <= MAX_APIC_ID; id++) {
		/* Check logical CPU availability. */
		if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled)
			continue;
		/* Check if logical CPU has the same package ID. */
		if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits))
			continue;
		cpu_cores++;
		/* Check if logical CPU has the same package and core IDs. */
		if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits))
			cpu_logical++;
	}

	KASSERT(cpu_cores >= 1 && cpu_logical >= 1,
	    ("topo_probe_0x4 couldn't find BSP"));

	cpu_cores /= cpu_logical;
	hyperthreading_cpus = cpu_logical;
}

static void
topo_probe_0xb(void)
{
	u_int p[4];
	int bits;
	int cnt;
	int i;
	int logical;
	int type;
	int x;

	/* We only support three levels for now. */
	for (i = 0; i < 3; i++) {
		cpuid_count(0x0b, i, p);

		/* Fall back if CPU leaf 11 doesn't really exist. */
		if (i == 0 && p[1] == 0) {
			topo_probe_0x4();
			return;
		}

		bits = p[0] & 0x1f;
		logical = p[1] & 0xffff;
		type = (p[2] >> 8) & 0xff;
		if (type == 0 || logical == 0)
			break;
		/*
		 * Because of the uniformity assumption we examine only
		 * those logical processors that belong to the same
		 * package as the BSP.
		 */
		for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) {
			if (!cpu_info[x].cpu_present ||
			    cpu_info[x].cpu_disabled)
				continue;
			if (x >> bits == boot_cpu_id >> bits)
				cnt++;
		}
		if (type == CPUID_TYPE_SMT)
			cpu_logical = cnt;
		else if (type == CPUID_TYPE_CORE)
			cpu_cores = cnt;
	}
	if (cpu_logical == 0)
		cpu_logical = 1;
	cpu_cores /= cpu_logical;
}

/*
 * Both the topology discovery code and the code that consumes topology
 * information assume top-down uniformity of the topology.
 * That is, all physical packages must be identical and each
 * core in a package must have the same number of threads.
 * Topology information is queried only on the BSP, on which this
 * code runs and for which it can query CPUID information.
 * The topology is then extrapolated to all packages using the
 * uniformity assumption.
 */
void
topo_probe(void)
{
	static int cpu_topo_probed = 0;

	if (cpu_topo_probed)
		return;

	CPU_ZERO(&logical_cpus_mask);
	if (mp_ncpus <= 1)
		cpu_cores = cpu_logical = 1;
	else if (cpu_vendor_id == CPU_VENDOR_AMD)
		topo_probe_amd();
	else if (cpu_vendor_id == CPU_VENDOR_INTEL) {
		/*
		 * See the Intel(R) 64 Architecture Processor
		 * Topology Enumeration article for details.
		 *
		 * Note that the 0x1 <= cpu_high < 4 case should be
		 * compatible with topo_probe_0x4() logic when
		 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
		 * or it should trigger the fallback otherwise.
		 */
		if (cpu_high >= 0xb)
			topo_probe_0xb();
		else if (cpu_high >= 0x1)
			topo_probe_0x4();
	}

	/*
	 * Fallback: assume each logical CPU is in a separate
	 * physical package.  That is, no multi-core, no SMT.
	 */
	if (cpu_cores == 0 || cpu_logical == 0)
		cpu_cores = cpu_logical = 1;
	cpu_topo_probed = 1;
}
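
/*
 * Worked example (editorial, assuming a uniform system): on two
 * packages of four cores with two SMT threads each, the BSP's probe
 * finds cpu_cores = 4 and cpu_logical = 2; the second package is
 * never queried and is simply assumed to have the same layout.
 */
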
struct cpu_group *
cpu_topo(void)
{
	int cg_flags;

	/*
	 * Determine whether any threading flags are
	 * necessary.
	 */
	topo_probe();
	if (cpu_logical > 1 && hyperthreading_cpus)
		cg_flags = CG_FLAG_HTT;
	else if (cpu_logical > 1)
		cg_flags = CG_FLAG_SMT;
	else
		cg_flags = 0;
	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
		printf("WARNING: Non-uniform processors.\n");
		printf("WARNING: Using suboptimal topology.\n");
		return (smp_topo_none());
	}
	/*
	 * Neither multi-core nor hyper-threaded.
	 */
	if (cpu_logical * cpu_cores == 1)
		return (smp_topo_none());
	/*
	 * Only HTT, no multi-core.
	 */
	if (cpu_logical > 1 && cpu_cores == 1)
		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags));
	/*
	 * Only multi-core, no HTT.
	 */
	if (cpu_cores > 1 && cpu_logical == 1)
		return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags));
	/*
	 * Both HTT and multi-core.
	 */
	return (smp_topo_2level(CG_SHARE_L2, cpu_cores,
	    CG_SHARE_L1, cpu_logical, cg_flags));
}
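
/*
 * Illustrative reading of the code above (editorial): with
 * cpu_cores = 4 and cpu_logical = 2, smp_topo_2level() builds an
 * L2-shared group of four cores, each of which is an L1-shared pair
 * of threads carrying CG_FLAG_HTT or CG_FLAG_SMT so the scheduler
 * can prefer idle cores over idle sibling threads.
 */
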
void
cpu_add(u_int apic_id, char boot_cpu)
{

	if (apic_id > MAX_APIC_ID) {
		panic("SMP: APIC ID %d too high", apic_id);
		return;
	}
	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
	    apic_id));
	cpu_info[apic_id].cpu_present = 1;
	if (boot_cpu) {
		KASSERT(boot_cpu_id == -1,
		    ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
		    boot_cpu_id));
		boot_cpu_id = apic_id;
		cpu_info[apic_id].cpu_bsp = 1;
	}
	if (mp_ncpus < MAXCPU) {
		mp_ncpus++;
		mp_maxid = mp_ncpus - 1;
	}
	if (bootverbose)
		printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
		    "AP");
}

void
cpu_mp_setmaxid(void)
{

	/*
	 * mp_maxid should already have been set by calls to cpu_add().
	 * Just sanity check its value here.
	 */
	if (mp_ncpus == 0)
		KASSERT(mp_maxid == 0,
		    ("%s: mp_ncpus is zero, but mp_maxid is not", __func__));
	else if (mp_ncpus == 1)
		mp_maxid = 0;
	else
		KASSERT(mp_maxid >= mp_ncpus - 1,
		    ("%s: counters out of sync: max %d, count %d", __func__,
		    mp_maxid, mp_ncpus));
}

int
cpu_mp_probe(void)
{

	/*
	 * Always record the BSP in the CPU map so that the mbuf init code
	 * works correctly.
	 */
	CPU_SETOF(0, &all_cpus);
	if (mp_ncpus == 0) {
		/*
		 * No CPUs were found, so this must be a UP system.  Set up
		 * the variables to represent a system with a single CPU
		 * with an id of 0.
		 */
		mp_ncpus = 1;
		return (0);
	}

	/* At least one CPU was found. */
	if (mp_ncpus == 1) {
		/*
		 * One CPU was found, so this must be a UP system with
		 * an I/O APIC.
		 */
		mp_maxid = 0;
		return (0);
	}

	/* At least two CPUs were found. */
	return (1);
}

/*
 * Print various information about the SMP system hardware and setup.
 */
void
cpu_mp_announce(void)
{
	const char *hyperthread;
	int i;

	printf("FreeBSD/SMP: %d package(s) x %d core(s)",
	    mp_ncpus / (cpu_cores * cpu_logical), cpu_cores);
	if (hyperthreading_cpus > 1)
		printf(" x %d HTT threads", cpu_logical);
	else if (cpu_logical > 1)
		printf(" x %d SMT threads", cpu_logical);
	printf("\n");

	/* List active CPUs first. */
	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
	for (i = 1; i < mp_ncpus; i++) {
		if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread)
			hyperthread = "/HT";
		else
			hyperthread = "";
		printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread,
		    cpu_apic_ids[i]);
	}

	/* List disabled CPUs last. */
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled)
			continue;
		if (cpu_info[i].cpu_hyperthread)
			hyperthread = "/HT";
		else
			hyperthread = "";
		printf("  cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread,
		    i);
	}
}

void
init_secondary_tail(void)
{
	u_int cpuid;

	/*
	 * On real hardware, switch to x2apic mode if possible.  Do it
	 * after aps_ready has been signalled, to avoid manipulating the
	 * mode while the BSP might still want to send some IPI to us
	 * (the second startup IPI is ignored on modern hardware, etc.).
	 */
	lapic_xapic_mode();

	/* Initialize the PAT MSR. */
	pmap_init_pat();

	/* set up CPU registers and state */
	cpu_setregs();

	/* set up SSE/NX */
	initializecpu();

	/* set up FPU state on the AP */
#ifdef __amd64__
	fpuinit();
#else
	npxinit(false);
#endif

	if (cpu_ops.cpu_init)
		cpu_ops.cpu_init();

	/* A quick sanity check. */
	cpuid = PCPU_GET(cpuid);
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", cpuid);
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}

	/* Initialize curthread. */
	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
	PCPU_SET(curthread, PCPU_GET(idlethread));

	mca_init();

	mtx_lock_spin(&ap_boot_mtx);

	/* Init local APIC for IRQs */
	lapic_setup(1);

	/* Set memory range attributes for this CPU to match the BSP */
	mem_range_AP_init();

	smp_cpus++;

	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
	printf("SMP: AP CPU #%d Launched!\n", cpuid);

	/* Determine if we are a logical CPU. */
	/* XXX Calculation depends on cpu_logical being a power of 2, e.g. 2 */
	if (cpu_logical > 1 && PCPU_GET(apic_id) % cpu_logical != 0)
		CPU_SET(cpuid, &logical_cpus_mask);

	if (bootverbose)
		lapic_dump("AP");

	if (smp_cpus == mp_ncpus) {
		/* enable IPIs, TLB shootdown, freezes, etc. */
		atomic_store_rel_int(&smp_started, 1);
	}

#ifdef __amd64__
	/*
	 * Enable the global pages TLB extension.
	 * This also implicitly flushes the TLB.
	 */
	load_cr4(rcr4() | CR4_PGE);
	if (pmap_pcid_enabled)
		load_cr4(rcr4() | CR4_PCIDE);
	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);
#endif

	mtx_unlock_spin(&ap_boot_mtx);

	/* Wait until all the APs are up. */
	while (atomic_load_acq_int(&smp_started) == 0)
		ia32_pause();

	/* Start per-CPU event timers. */
	cpu_initclocks_ap();

	sched_throw(NULL);

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}
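
/*
 * Illustrative startup handshake (editorial summary of the code
 * above and of release_aps() below): the BSP stores aps_ready = 1;
 * each AP then runs init_secondary_tail(), serializes on
 * ap_boot_mtx, and increments smp_cpus; the last AP to arrive sets
 * smp_started, which releases both the BSP spinning in release_aps()
 * and any APs still waiting in the loop above.
 */
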
/*******************************************************************
 * local functions and data
 */

/*
 * We tell the I/O APIC code about all the CPUs that we want to receive
 * interrupts.  If we don't want certain CPUs to receive IRQs we
 * can simply not tell the I/O APIC code about them in this function.
 * We also do not tell it about the BSP since it tells itself about
 * the BSP internally to work with UP kernels and on UP machines.
 */
void
set_interrupt_apic_ids(void)
{
	u_int i, apic_id;

	for (i = 0; i < MAXCPU; i++) {
		apic_id = cpu_apic_ids[i];
		if (apic_id == -1)
			continue;
		if (cpu_info[apic_id].cpu_bsp)
			continue;
		if (cpu_info[apic_id].cpu_disabled)
			continue;

		/* Don't let hyperthreads service interrupts. */
		if (cpu_logical > 1 &&
		    apic_id % cpu_logical != 0)
			continue;

		intr_add_cpu(i);
	}
}
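
/*
 * Illustrative (editorial): with cpu_logical = 2 and APIC IDs 0-7,
 * only even APIC IDs (the first thread of each core) pass the filter
 * above, so device interrupts are never routed to hyperthread
 * siblings.  This example inherits the power-of-2 assumption noted
 * in init_secondary_tail().
 */
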
/*
 * Assign logical CPU IDs to local APICs.
 */
void
assign_cpu_ids(void)
{
	u_int i;

	TUNABLE_INT_FETCH("machdep.hyperthreading_allowed",
	    &hyperthreading_allowed);

	/* Check for explicitly disabled CPUs. */
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
			continue;

		if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) {
			cpu_info[i].cpu_hyperthread = 1;

			/*
			 * Don't use HT CPU if it has been disabled by a
			 * tunable.
			 */
			if (hyperthreading_allowed == 0) {
				cpu_info[i].cpu_disabled = 1;
				continue;
			}
		}

		/* Don't use this CPU if it has been disabled by a tunable. */
		if (resource_disabled("lapic", i)) {
			cpu_info[i].cpu_disabled = 1;
			continue;
		}
	}

	if (hyperthreading_allowed == 0 && hyperthreading_cpus > 1) {
		hyperthreading_cpus = 0;
		cpu_logical = 1;
	}

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
	 *
	 * To minimize confusion for userland, we attempt to number
	 * CPUs such that all threads and cores in a package are
	 * grouped together.  For now we assume that the BSP is always
	 * the first thread in a package and just start adding APs
	 * starting with the BSP's APIC ID.
	 */
	mp_ncpus = 1;
	cpu_apic_ids[0] = boot_cpu_id;
	apic_cpuids[boot_cpu_id] = 0;
	for (i = boot_cpu_id + 1; i != boot_cpu_id;
	     i == MAX_APIC_ID ? i = 0 : i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
		    cpu_info[i].cpu_disabled)
			continue;

		if (mp_ncpus < MAXCPU) {
			cpu_apic_ids[mp_ncpus] = i;
			apic_cpuids[i] = mp_ncpus;
			mp_ncpus++;
		} else
			cpu_info[i].cpu_disabled = 1;
	}
	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));
}
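
/*
 * Worked example (editorial): if boot_cpu_id is 4 on a system whose
 * usable APIC IDs are 0-7, the wrap-around loop above visits
 * 5, 6, 7, 0, 1, 2, 3 and numbers them cpu1 through cpu7, so the
 * remaining threads of the BSP's package come first.
 */
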
#ifdef COUNT_XINVLTLB_HITS
u_int xhits_gbl[MAXCPU];
u_int xhits_pg[MAXCPU];
u_int xhits_rng[MAXCPU];
static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
    sizeof(xhits_gbl), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
    sizeof(xhits_pg), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
    sizeof(xhits_rng), "IU", "");

u_int ipi_global;
u_int ipi_page;
u_int ipi_range;
u_int ipi_range_size;
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
    0, "");

u_int ipi_masked_global;
u_int ipi_masked_page;
u_int ipi_masked_range;
u_int ipi_masked_range_size;
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
    &ipi_masked_global, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
    &ipi_masked_page, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
    &ipi_masked_range, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
    &ipi_masked_range_size, 0, "");
#endif /* COUNT_XINVLTLB_HITS */

/*
 * Send the INIT and STARTUP IPI sequence that releases an AP.  The
 * STARTUP vector is the 4 KB page number of the AP's real-mode entry
 * point (entry address = vector << 12).
 */
void
ipi_startup(int apic_id, int vector)
{

	/*
	 * This attempts to follow the algorithm described in the
	 * Intel Multiprocessor Specification v1.4 in section B.4.
	 * For each IPI, we allow the local APIC ~20us to deliver the
	 * IPI.  If that times out, we panic.
	 */

	/*
	 * First we do an INIT IPI: it may actually run, resetting and
	 * starting the target CPU; it may be latched (P5 bug), leaving
	 * the CPU waiting for a STARTUP IPI; or it may be ignored.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
	lapic_ipi_wait(100);

	/* Explicitly deassert the INIT IPI. */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
	    apic_id);

	DELAY(10000);		/* wait ~10ms */

	/*
	 * Next we do a STARTUP IPI: the previous INIT IPI might still be
	 * latched (P5 bug), in which case this first STARTUP terminates
	 * immediately and the previously started INIT IPI continues; or
	 * the previous INIT IPI has already run, and this STARTUP IPI
	 * will run; or the previous INIT IPI was ignored, and this
	 * STARTUP IPI will run.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	if (!lapic_ipi_wait(100))
		panic("Failed to deliver first STARTUP IPI to APIC %d",
		    apic_id);
	DELAY(200);		/* wait ~200us */

	/*
	 * Finally we do a second STARTUP IPI: it should run if the
	 * previous STARTUP IPI was cancelled by a latched INIT IPI;
	 * otherwise it will be ignored, as only ONE STARTUP IPI is
	 * recognized after hardware RESET or an INIT IPI.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	if (!lapic_ipi_wait(100))
		panic("Failed to deliver second STARTUP IPI to APIC %d",
		    apic_id);

	DELAY(200);		/* wait ~200us */
}

/*
 * Send an IPI to the specified CPU, handling the bitmap logic.
 */
void
ipi_send_cpu(int cpu, u_int ipi)
{
	u_int bitmap, old_pending, new_pending;

	KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu));

	if (IPI_IS_BITMAPED(ipi)) {
		bitmap = 1 << ipi;
		ipi = IPI_BITMAP_VECTOR;
		do {
			old_pending = cpu_ipi_pending[cpu];
			new_pending = old_pending | bitmap;
		} while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
		    old_pending, new_pending));
		if (old_pending)
			return;
	}
	lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
}

void
ipi_bitmap_handler(struct trapframe frame)
{
	struct trapframe *oldframe;
	struct thread *td;
	int cpu = PCPU_GET(cpuid);
	u_int ipi_bitmap;

	critical_enter();
	td = curthread;
	td->td_intr_nesting_level++;
	oldframe = td->td_intr_frame;
	td->td_intr_frame = &frame;
	ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
#ifdef COUNT_IPIS
		(*ipi_preempt_counts[cpu])++;
#endif
		sched_preempt(td);
	}
	if (ipi_bitmap & (1 << IPI_AST)) {
#ifdef COUNT_IPIS
		(*ipi_ast_counts[cpu])++;
#endif
		/* Nothing to do for AST */
	}
	if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
#ifdef COUNT_IPIS
		(*ipi_hardclock_counts[cpu])++;
#endif
		hardclockintr();
	}
	td->td_intr_frame = oldframe;
	td->td_intr_nesting_level--;
	critical_exit();
}
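
/*
 * Illustrative coalescing scenario (editorial): if IPI_PREEMPT and
 * then IPI_AST are posted to a CPU before it services
 * IPI_BITMAP_VECTOR, the second ipi_send_cpu() call sees old_pending
 * nonzero and skips the hardware IPI; ipi_bitmap_handler() later
 * drains both bits from cpu_ipi_pending[] in a single pass.
 */
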
/*
 * Send an IPI to a set of CPUs.
 */
void
ipi_selected(cpuset_t cpus, u_int ipi)
{
	int cpu;

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help in order to identify the source.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);

	while ((cpu = CPU_FFS(&cpus)) != 0) {
		cpu--;
		CPU_CLR(cpu, &cpus);
		CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
		ipi_send_cpu(cpu, ipi);
	}
}

/*
 * Send an IPI to a specific CPU.
 */
void
ipi_cpu(int cpu, u_int ipi)
{

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help in order to identify the source.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending);

	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
	ipi_send_cpu(cpu, ipi);
}

/*
 * Send an IPI to all CPUs EXCEPT myself.
 */
void
ipi_all_but_self(u_int ipi)
{
	cpuset_t other_cpus;

	other_cpus = all_cpus;
	CPU_CLR(PCPU_GET(cpuid), &other_cpus);
	if (IPI_IS_BITMAPED(ipi)) {
		ipi_selected(other_cpus, ipi);
		return;
	}

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help in order to identify the source.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus);

	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
}

int
ipi_nmi_handler(void)
{
	u_int cpuid;

	/*
	 * As long as there is no simple way to know an NMI's
	 * source, if the bit for the current CPU is set in
	 * the global pending bitword, an IPI_STOP_HARD has been issued
	 * and should be handled.
	 */
	cpuid = PCPU_GET(cpuid);
	if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending))
		return (1);

	CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending);
	cpustop_handler();
	return (0);
}

/*
 * Handle an IPI_STOP by saving our current context and spinning until we
 * are resumed.
 */
void
cpustop_handler(void)
{
	u_int cpu;

	cpu = PCPU_GET(cpuid);

	savectx(&stoppcbs[cpu]);

	/* Indicate that we are stopped */
	CPU_SET_ATOMIC(cpu, &stopped_cpus);

	/* Wait for restart */
	while (!CPU_ISSET(cpu, &started_cpus))
		ia32_pause();

	CPU_CLR_ATOMIC(cpu, &started_cpus);
	CPU_CLR_ATOMIC(cpu, &stopped_cpus);

#if defined(__amd64__) && defined(DDB)
	amd64_db_resume_dbreg();
#endif

	if (cpu == 0 && cpustop_restartfunc != NULL) {
		cpustop_restartfunc();
		cpustop_restartfunc = NULL;
	}
}
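
/*
 * Illustrative stop sequence (editorial): ipi_all_but_self(IPI_STOP_HARD)
 * first sets every other CPU's bit in ipi_stop_nmi_pending and then
 * broadcasts the NMI, so when the NMI arrives ipi_nmi_handler() can
 * distinguish this deliberate stop from an unrelated NMI and divert
 * the CPU into cpustop_handler().
 */
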
/*
 * Handle an IPI_SUSPEND by saving our current context and spinning until we
 * are resumed.
 */
void
cpususpend_handler(void)
{
	u_int cpu;

	mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);

	cpu = PCPU_GET(cpuid);
	if (savectx(&susppcbs[cpu]->sp_pcb)) {
#ifdef __amd64__
		fpususpend(susppcbs[cpu]->sp_fpususpend);
#else
		npxsuspend(susppcbs[cpu]->sp_fpususpend);
#endif
		wbinvd();
		CPU_SET_ATOMIC(cpu, &suspended_cpus);
	} else {
#ifdef __amd64__
		fpuresume(susppcbs[cpu]->sp_fpususpend);
#else
		npxresume(susppcbs[cpu]->sp_fpususpend);
#endif
		pmap_init_pat();
		initializecpu();
		PCPU_SET(switchtime, 0);
		PCPU_SET(switchticks, ticks);

		/* Indicate that we are resumed */
		CPU_CLR_ATOMIC(cpu, &suspended_cpus);
	}

	/* Wait for resume */
	while (!CPU_ISSET(cpu, &started_cpus))
		ia32_pause();

	if (cpu_ops.cpu_resume)
		cpu_ops.cpu_resume();
#ifdef __amd64__
	if (vmm_resume_p)
		vmm_resume_p();
#endif

	/* Resume MCA and local APIC */
	lapic_xapic_mode();
	mca_resume();
	lapic_setup(0);

	/* Clear our state bits now that the resume is complete. */
	CPU_CLR_ATOMIC(cpu, &suspended_cpus);
	CPU_CLR_ATOMIC(cpu, &started_cpus);
}

void
invlcache_handler(void)
{
#ifdef COUNT_IPIS
	(*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
#endif /* COUNT_IPIS */

	wbinvd();
	atomic_add_int(&smp_tlb_wait, 1);
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the APs out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);

#ifdef COUNT_IPIS
/*
 * Set up interrupt counters for IPI handlers.
 */
static void
mp_ipi_intrcnt(void *dummy)
{
	char buf[64];
	int i;

	CPU_FOREACH(i) {
		snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
		intrcnt_add(buf, &ipi_invltlb_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
		intrcnt_add(buf, &ipi_invlrng_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
		intrcnt_add(buf, &ipi_invlpg_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
		intrcnt_add(buf, &ipi_invlcache_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
		intrcnt_add(buf, &ipi_preempt_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:ast", i);
		intrcnt_add(buf, &ipi_ast_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
		intrcnt_add(buf, &ipi_hardclock_counts[i]);
	}
}
SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
#endif