/*-
 * Copyright (c) 1996, by Steve Passe
 * Copyright (c) 2003, by Peter Wemm
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The name of the developer may NOT be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#ifdef __i386__
#include "opt_apic.h"
#endif
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_kstack_pages.h"
#include "opt_pmap.h"
#include "opt_sched.h"
#include "opt_smp.h"
#include "opt_stack.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cons.h>	/* cngetc() */
#include <sys/cpuset.h>
#include <sys/csan.h>
#ifdef GPROF
#include <sys/gmon.h>
#endif
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>

#include <x86/apicreg.h>
#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/stack.h>
#include <x86/ucode.h>

static MALLOC_DEFINE(M_CPUS, "cpus", "CPU items");

/* lock region used by kernel profiling */
int	mcount_lock;

int	mp_naps;		/* # of Application processors */
int	boot_cpu_id = -1;	/* designated BSP */

/* AP uses this during bootstrap.  Do not staticize. */
char *bootSTK;
int bootAP;

/* Free these after use */
void *bootstacks[MAXCPU];
void *dpcpu;

struct pcb stoppcbs[MAXCPU];
struct susppcb **susppcbs;

#ifdef COUNT_IPIS
/* Interrupt counts. */
static u_long *ipi_preempt_counts[MAXCPU];
static u_long *ipi_ast_counts[MAXCPU];
u_long *ipi_invltlb_counts[MAXCPU];
u_long *ipi_invlrng_counts[MAXCPU];
u_long *ipi_invlpg_counts[MAXCPU];
u_long *ipi_invlcache_counts[MAXCPU];
u_long *ipi_rendezvous_counts[MAXCPU];
static u_long *ipi_hardclock_counts[MAXCPU];
#endif

/* Default cpu_ops implementation. */
struct cpu_ops cpu_ops;

/*
 * Local data and functions.
 */

static volatile cpuset_t ipi_stop_nmi_pending;

volatile cpuset_t resuming_cpus;
volatile cpuset_t toresume_cpus;

/* used to hold the APs until we are ready to release them */
struct mtx ap_boot_mtx;

/* Set to 1 once we're ready to let the APs out of the pen. */
volatile int aps_ready = 0;

/*
 * Store data from cpu_add() until later in the boot when we actually setup
 * the APs.
 */
struct cpu_info *cpu_info;
int *apic_cpuids;
int cpu_apic_ids[MAXCPU];
_Static_assert(MAXCPU <= MAX_APIC_ID,
    "MAXCPU cannot be larger than MAX_APIC_ID");
_Static_assert(xAPIC_MAX_APIC_ID <= MAX_APIC_ID,
    "xAPIC_MAX_APIC_ID cannot be larger than MAX_APIC_ID");

static void	release_aps(void *dummy);
static void	cpustop_handler_post(u_int cpu);

static int	hyperthreading_allowed = 1;
SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
	&hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");

static int	hyperthreading_intr_allowed = 0;
SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_intr_allowed, CTLFLAG_RDTUN,
	&hyperthreading_intr_allowed, 0,
	"Allow interrupts on HTT logical CPUs");

static struct topo_node topo_root;

static int pkg_id_shift;
static int node_id_shift;
static int core_id_shift;
static int disabled_cpus;

struct cache_info {
	int	id_shift;
	int	present;
} static caches[MAX_CACHE_LEVELS];

unsigned int boot_address;

static bool stop_mwait = false;
SYSCTL_BOOL(_machdep, OID_AUTO, stop_mwait, CTLFLAG_RWTUN, &stop_mwait, 0,
    "Use MONITOR/MWAIT when stopping CPU, if available");

#define MiB(v)	(v ## ULL << 20)

void
mem_range_AP_init(void)
{

	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
		mem_range_softc.mr_op->initAP(&mem_range_softc);
}

/*
 * Round up to the next power of two, if necessary, and then
 * take log2.
 * Returns -1 if argument is zero.
 */
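/*
 * For example, mask_width(6) shifts the non-power-of-two argument left by
 * one (6 << 1 == 12) before taking fls(), so it returns fls(12) - 1 == 3,
 * i.e. ceil(log2(6)).  mask_width(4) is already a power of two and yields
 * fls(4) - 1 == 2, while mask_width(0) yields fls(0) - 1 == -1.
 */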
static __inline int
mask_width(u_int x)
{

	return (fls(x << (1 - powerof2(x))) - 1);
}

/*
 * Add a cache level to the cache topology description.
 */
static int
add_deterministic_cache(int type, int level, int share_count)
{

	if (type == 0)
		return (0);
	if (type > 3) {
		printf("unexpected cache type %d\n", type);
		return (1);
	}
	if (type == 2) /* ignore instruction cache */
		return (1);
	if (level == 0 || level > MAX_CACHE_LEVELS) {
		printf("unexpected cache level %d\n", level);
		return (1);
	}

	if (caches[level - 1].present) {
		printf("WARNING: multiple entries for L%u data cache\n", level);
		printf("%u => %u\n", caches[level - 1].id_shift,
		    mask_width(share_count));
	}
	caches[level - 1].id_shift = mask_width(share_count);
	caches[level - 1].present = 1;

	if (caches[level - 1].id_shift > pkg_id_shift) {
		printf("WARNING: L%u data cache covers more "
		    "APIC IDs than a package (%u > %u)\n", level,
		    caches[level - 1].id_shift, pkg_id_shift);
		caches[level - 1].id_shift = pkg_id_shift;
	}
	if (caches[level - 1].id_shift < core_id_shift) {
		printf("WARNING: L%u data cache covers fewer "
		    "APIC IDs than a core (%u < %u)\n", level,
		    caches[level - 1].id_shift, core_id_shift);
		caches[level - 1].id_shift = core_id_shift;
	}

	return (1);
}

/*
 * Determine topology of processing units and caches for AMD CPUs.
 * See:
 *  - AMD CPUID Specification (Publication # 25481)
 *  - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559)
 *  - BKDG For AMD Family 10h Processors (Publication # 31116)
 *  - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301)
 *  - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751)
 *  - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945)
 */
static void
topo_probe_amd(void)
{
	u_int p[4];
	uint64_t v;
	int level;
	int nodes_per_socket;
	int share_count;
	int type;
	int i;

	/* No multi-core capability. */
	if ((amd_feature2 & AMDID2_CMP) == 0)
		return;

	/* For families 10h and newer. */
	pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
	    AMDID_COREID_SIZE_SHIFT;

	/* For 0Fh family. */
	if (pkg_id_shift == 0)
		pkg_id_shift =
		    mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);

	/*
	 * Families prior to 16h define the following value as
	 * cores per compute unit and we don't really care about the AMD
	 * compute units at the moment.  Perhaps we should treat them as
	 * cores and cores within the compute units as hardware threads,
	 * but that's up for debate.
	 * Later families define the value as threads per compute unit,
	 * so we are following AMD's nomenclature here.
	 */
	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 &&
	    CPUID_TO_FAMILY(cpu_id) >= 0x16) {
		cpuid_count(0x8000001e, 0, p);
		share_count = ((p[1] >> 8) & 0xff) + 1;
		core_id_shift = mask_width(share_count);
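
		/*
		 * Note: the share count decoded above is EBX[15:8] of CPUID
		 * leaf 0x8000001e plus one, which AMD's documentation
		 * describes as the number of threads sharing a compute unit
		 * (a core, on Zen and later).
		 */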

		/*
		 * For Zen (17h), gather Nodes per Processor.  Each node is a
		 * Zeppelin die; TR and EPYC CPUs will have multiple dies per
		 * package.  Communication latency between dies is higher than
		 * within them.
		 */
		nodes_per_socket = ((p[2] >> 8) & 0x7) + 1;
		node_id_shift = pkg_id_shift - mask_width(nodes_per_socket);
	}

	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
		for (i = 0; ; i++) {
			cpuid_count(0x8000001d, i, p);
			type = p[0] & 0x1f;
			level = (p[0] >> 5) & 0x7;
			share_count = 1 + ((p[0] >> 14) & 0xfff);

			if (!add_deterministic_cache(type, level, share_count))
				break;
		}
	} else {
		if (cpu_exthigh >= 0x80000005) {
			cpuid_count(0x80000005, 0, p);
			if (((p[2] >> 24) & 0xff) != 0) {
				caches[0].id_shift = 0;
				caches[0].present = 1;
			}
		}
		if (cpu_exthigh >= 0x80000006) {
			cpuid_count(0x80000006, 0, p);
			if (((p[2] >> 16) & 0xffff) != 0) {
				caches[1].id_shift = 0;
				caches[1].present = 1;
			}
			if (((p[3] >> 18) & 0x3fff) != 0) {
				nodes_per_socket = 1;
				if ((amd_feature2 & AMDID2_NODE_ID) != 0) {
					/*
					 * Handle multi-node processors that
					 * have multiple chips, each with its
					 * own L3 cache, on the same die.
					 */
					v = rdmsr(0xc001100c);
					nodes_per_socket = 1 + ((v >> 3) & 0x7);
				}
				caches[2].id_shift =
				    pkg_id_shift - mask_width(nodes_per_socket);
				caches[2].present = 1;
			}
		}
	}
}

/*
 * Determine topology of processing units for Intel CPUs
 * using CPUID Leaf 1 and Leaf 4, if supported.
 * See:
 *  - Intel 64 Architecture Processor Topology Enumeration
 *  - Intel 64 and IA-32 Architectures Software Developer's Manual,
 *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
 *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
 */
static void
topo_probe_intel_0x4(void)
{
	u_int p[4];
	int max_cores;
	int max_logical;

	/* Both zero and one here mean one logical processor per package. */
	max_logical = (cpu_feature & CPUID_HTT) != 0 ?
	    (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
	if (max_logical <= 1)
		return;

	if (cpu_high >= 0x4) {
		cpuid_count(0x04, 0, p);
		max_cores = ((p[0] >> 26) & 0x3f) + 1;
	} else
		max_cores = 1;

	core_id_shift = mask_width(max_logical/max_cores);
	KASSERT(core_id_shift >= 0,
	    ("intel topo: max_cores > max_logical\n"));
	pkg_id_shift = core_id_shift + mask_width(max_cores);
}

/*
 * Determine topology of processing units for Intel CPUs
 * using CPUID Leaf 11, if supported.
 * See:
 *  - Intel 64 Architecture Processor Topology Enumeration
 *  - Intel 64 and IA-32 Architectures Software Developer's Manual,
 *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
 *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
 */
static void
topo_probe_intel_0xb(void)
{
	u_int p[4];
	int bits;
	int type;
	int i;

	/* Fall back if CPU leaf 11 doesn't really exist. */
	cpuid_count(0x0b, 0, p);
	if (p[1] == 0) {
		topo_probe_intel_0x4();
		return;
	}
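
	/*
	 * Each sub-leaf of CPUID leaf 0xb describes one topology level:
	 * EAX[4:0] is the number of APIC ID bits that distinguish objects
	 * at the next higher level, and ECX[15:8] is the level type
	 * (1 = SMT, 2 = core, 0 = no more levels).  That is what the loop
	 * below decodes into core_id_shift and pkg_id_shift.
	 */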

	/* We only support three levels for now. */
	for (i = 0; ; i++) {
		cpuid_count(0x0b, i, p);

		bits = p[0] & 0x1f;
		type = (p[2] >> 8) & 0xff;

		if (type == 0)
			break;

		/* TODO: check for duplicate (re-)assignment */
		if (type == CPUID_TYPE_SMT)
			core_id_shift = bits;
		else if (type == CPUID_TYPE_CORE)
			pkg_id_shift = bits;
		else
			printf("unknown CPU level type %d\n", type);
	}

	if (pkg_id_shift < core_id_shift) {
		printf("WARNING: core covers more APIC IDs than a package\n");
		core_id_shift = pkg_id_shift;
	}
}

/*
 * Determine topology of caches for Intel CPUs.
 * See:
 *  - Intel 64 Architecture Processor Topology Enumeration
 *  - Intel 64 and IA-32 Architectures Software Developer's Manual
 *    Volume 2A: Instruction Set Reference, A-M,
 *    CPUID instruction
 */
static void
topo_probe_intel_caches(void)
{
	u_int p[4];
	int level;
	int share_count;
	int type;
	int i;

	if (cpu_high < 0x4) {
		/*
		 * Available cache level and sizes can be determined
		 * via CPUID leaf 2, but that requires a huge table of hardcoded
		 * values, so for now just assume L1 and L2 caches potentially
		 * shared only by HTT processing units, if HTT is present.
		 */
		caches[0].id_shift = pkg_id_shift;
		caches[0].present = 1;
		caches[1].id_shift = pkg_id_shift;
		caches[1].present = 1;
		return;
	}

	for (i = 0; ; i++) {
		cpuid_count(0x4, i, p);
		type = p[0] & 0x1f;
		level = (p[0] >> 5) & 0x7;
		share_count = 1 + ((p[0] >> 14) & 0xfff);

		if (!add_deterministic_cache(type, level, share_count))
			break;
	}
}

/*
 * Determine topology of processing units and caches for Intel CPUs.
 * See:
 *  - Intel 64 Architecture Processor Topology Enumeration
 */
static void
topo_probe_intel(void)
{

	/*
	 * Note that 0x1 <= cpu_high < 4 case should be
	 * compatible with topo_probe_intel_0x4() logic when
	 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
	 * or it should trigger the fallback otherwise.
	 */
	if (cpu_high >= 0xb)
		topo_probe_intel_0xb();
	else if (cpu_high >= 0x1)
		topo_probe_intel_0x4();

	topo_probe_intel_caches();
}

/*
 * Topology information is queried only on BSP, on which this
 * code runs and for which it can query CPUID information.
 * Then topology is extrapolated on all packages using an
 * assumption that APIC ID to hardware component ID mapping is
 * homogeneous.
 * That doesn't necessarily imply that the topology is uniform.
 */
void
topo_probe(void)
{
	static int cpu_topo_probed = 0;
	struct x86_topo_layer {
		int type;
		int subtype;
		int id_shift;
	} topo_layers[MAX_CACHE_LEVELS + 4];
	struct topo_node *parent;
	struct topo_node *node;
	int layer;
	int nlayers;
	int node_id;
	int i;

	if (cpu_topo_probed)
		return;

	CPU_ZERO(&logical_cpus_mask);

	if (mp_ncpus <= 1)
		; /* nothing */
	else if (cpu_vendor_id == CPU_VENDOR_AMD ||
	    cpu_vendor_id == CPU_VENDOR_HYGON)
		topo_probe_amd();
	else if (cpu_vendor_id == CPU_VENDOR_INTEL)
		topo_probe_intel();

	KASSERT(pkg_id_shift >= core_id_shift,
	    ("bug in APIC topology discovery"));

	nlayers = 0;
	bzero(topo_layers, sizeof(topo_layers));

	topo_layers[nlayers].type = TOPO_TYPE_PKG;
	topo_layers[nlayers].id_shift = pkg_id_shift;
	if (bootverbose)
		printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
	nlayers++;

	if (pkg_id_shift > node_id_shift && node_id_shift != 0) {
		topo_layers[nlayers].type = TOPO_TYPE_GROUP;
		topo_layers[nlayers].id_shift = node_id_shift;
		if (bootverbose)
			printf("Node ID shift: %u\n",
			    topo_layers[nlayers].id_shift);
		nlayers++;
	}

	/*
	 * Consider all caches to be within a package/chip
	 * and "in front" of all sub-components like
	 * cores and hardware threads.
	 */
	for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
		if (caches[i].present) {
			if (node_id_shift != 0)
				KASSERT(caches[i].id_shift <= node_id_shift,
				    ("bug in APIC topology discovery"));
			KASSERT(caches[i].id_shift <= pkg_id_shift,
			    ("bug in APIC topology discovery"));
			KASSERT(caches[i].id_shift >= core_id_shift,
			    ("bug in APIC topology discovery"));

			topo_layers[nlayers].type = TOPO_TYPE_CACHE;
			topo_layers[nlayers].subtype = i + 1;
			topo_layers[nlayers].id_shift = caches[i].id_shift;
			if (bootverbose)
				printf("L%u cache ID shift: %u\n",
				    topo_layers[nlayers].subtype,
				    topo_layers[nlayers].id_shift);
			nlayers++;
		}
	}

	if (pkg_id_shift > core_id_shift) {
		topo_layers[nlayers].type = TOPO_TYPE_CORE;
		topo_layers[nlayers].id_shift = core_id_shift;
		if (bootverbose)
			printf("Core ID shift: %u\n",
			    topo_layers[nlayers].id_shift);
		nlayers++;
	}

	topo_layers[nlayers].type = TOPO_TYPE_PU;
	topo_layers[nlayers].id_shift = 0;
	nlayers++;

	topo_init_root(&topo_root);
	for (i = 0; i <= max_apic_id; ++i) {
		if (!cpu_info[i].cpu_present)
			continue;

		parent = &topo_root;
		for (layer = 0; layer < nlayers; ++layer) {
			node_id = i >> topo_layers[layer].id_shift;
			parent = topo_add_node_by_hwid(parent, node_id,
			    topo_layers[layer].type,
			    topo_layers[layer].subtype);
		}
	}

	parent = &topo_root;
	for (layer = 0; layer < nlayers; ++layer) {
		node_id = boot_cpu_id >> topo_layers[layer].id_shift;
		node = topo_find_node_by_hwid(parent, node_id,
		    topo_layers[layer].type,
		    topo_layers[layer].subtype);
		topo_promote_child(node);
		parent = node;
	}

	cpu_topo_probed = 1;
}

/*
 * Assign logical CPU IDs to local APICs.
 */
void
assign_cpu_ids(void)
{
	struct topo_node *node;
	u_int smt_mask;
	int nhyper;

	smt_mask = (1u << core_id_shift) - 1;
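
	/*
	 * The low core_id_shift bits of an APIC ID give the hardware
	 * thread's index within its core.  Below, any PU whose thread
	 * index differs from the BSP's is marked as a hyperthread, on
	 * the assumption that the BSP occupies the primary thread of
	 * its core.
	 */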

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
	 */
	mp_ncpus = 0;
	nhyper = 0;
	TOPO_FOREACH(node, &topo_root) {
		if (node->type != TOPO_TYPE_PU)
			continue;

		if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
			cpu_info[node->hwid].cpu_hyperthread = 1;

		if (resource_disabled("lapic", node->hwid)) {
			if (node->hwid != boot_cpu_id)
				cpu_info[node->hwid].cpu_disabled = 1;
			else
				printf("Cannot disable BSP, APIC ID = %d\n",
				    node->hwid);
		}

		if (!hyperthreading_allowed &&
		    cpu_info[node->hwid].cpu_hyperthread)
			cpu_info[node->hwid].cpu_disabled = 1;

		if (mp_ncpus >= MAXCPU)
			cpu_info[node->hwid].cpu_disabled = 1;

		if (cpu_info[node->hwid].cpu_disabled) {
			disabled_cpus++;
			continue;
		}

		if (cpu_info[node->hwid].cpu_hyperthread)
			nhyper++;

		cpu_apic_ids[mp_ncpus] = node->hwid;
		apic_cpuids[node->hwid] = mp_ncpus;
		topo_set_pu_id(node, mp_ncpus);
		mp_ncpus++;
	}

	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));

	mp_ncores = mp_ncpus - nhyper;
	smp_threads_per_core = mp_ncpus / mp_ncores;
}

/*
 * Print various information about the SMP system hardware and setup.
 */
void
cpu_mp_announce(void)
{
	struct topo_node *node;
	const char *hyperthread;
	struct topo_analysis topology;

	printf("FreeBSD/SMP: ");
	if (topo_analyze(&topo_root, 1, &topology)) {
		printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]);
		if (topology.entities[TOPO_LEVEL_GROUP] > 1)
			printf(" x %d groups",
			    topology.entities[TOPO_LEVEL_GROUP]);
		if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1)
			printf(" x %d cache groups",
			    topology.entities[TOPO_LEVEL_CACHEGROUP]);
		if (topology.entities[TOPO_LEVEL_CORE] > 0)
			printf(" x %d core(s)",
			    topology.entities[TOPO_LEVEL_CORE]);
		if (topology.entities[TOPO_LEVEL_THREAD] > 1)
			printf(" x %d hardware threads",
			    topology.entities[TOPO_LEVEL_THREAD]);
	} else {
		printf("Non-uniform topology");
	}
	printf("\n");

	if (disabled_cpus) {
		printf("FreeBSD/SMP Online: ");
		if (topo_analyze(&topo_root, 0, &topology)) {
			printf("%d package(s)",
			    topology.entities[TOPO_LEVEL_PKG]);
			if (topology.entities[TOPO_LEVEL_GROUP] > 1)
				printf(" x %d groups",
				    topology.entities[TOPO_LEVEL_GROUP]);
			if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1)
				printf(" x %d cache groups",
				    topology.entities[TOPO_LEVEL_CACHEGROUP]);
			if (topology.entities[TOPO_LEVEL_CORE] > 0)
				printf(" x %d core(s)",
				    topology.entities[TOPO_LEVEL_CORE]);
			if (topology.entities[TOPO_LEVEL_THREAD] > 1)
				printf(" x %d hardware threads",
				    topology.entities[TOPO_LEVEL_THREAD]);
		} else {
			printf("Non-uniform topology");
		}
		printf("\n");
	}

	if (!bootverbose)
		return;

	TOPO_FOREACH(node, &topo_root) {
		switch (node->type) {
		case TOPO_TYPE_PKG:
			printf("Package HW ID = %u\n", node->hwid);
			break;
		case TOPO_TYPE_CORE:
			printf("\tCore HW ID = %u\n", node->hwid);
			break;
		case TOPO_TYPE_PU:
			if (cpu_info[node->hwid].cpu_hyperthread)
				hyperthread = "/HT";
			else
				hyperthread = "";

			if (node->subtype == 0)
				printf("\t\tCPU (AP%s): APIC ID: %u"
				    "(disabled)\n", hyperthread, node->hwid);
			else if (node->id == 0)
				printf("\t\tCPU0 (BSP): APIC ID: %u\n",
				    node->hwid);
			else
				printf("\t\tCPU%u (AP%s): APIC ID: %u\n",
				    node->id, hyperthread, node->hwid);
			break;
		default:
			/* ignored */
			break;
		}
	}
}

/*
 * Add a scheduling group, a group of logical processors sharing
 * a particular cache (and, thus, having an affinity), to the scheduling
 * topology.
 * This function recursively works on lower level caches.
 */
static void
x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root)
{
	struct topo_node *node;
	int nchildren;
	int ncores;
	int i;

	KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE ||
	    root->type == TOPO_TYPE_GROUP,
	    ("x86topo_add_sched_group: bad type: %u", root->type));
	CPU_COPY(&root->cpuset, &cg_root->cg_mask);
	cg_root->cg_count = root->cpu_count;
	if (root->type == TOPO_TYPE_SYSTEM)
		cg_root->cg_level = CG_SHARE_NONE;
	else
		cg_root->cg_level = root->subtype;

	/*
	 * Check how many core nodes we have under the given root node.
	 * If we have multiple logical processors, but not multiple
	 * cores, then those processors must be hardware threads.
	 */
	ncores = 0;
	node = root;
	while (node != NULL) {
		if (node->type != TOPO_TYPE_CORE) {
			node = topo_next_node(root, node);
			continue;
		}

		ncores++;
		node = topo_next_nonchild_node(root, node);
	}

	if (cg_root->cg_level != CG_SHARE_NONE &&
	    root->cpu_count > 1 && ncores < 2)
		cg_root->cg_flags = CG_FLAG_SMT;

	/*
	 * Find out how many cache nodes we have under the given root node.
	 * We ignore cache nodes that cover all the same processors as the
	 * root node.  Also, we do not descend below found cache nodes.
	 * That is, we count top-level "non-redundant" caches under the root
	 * node.
	 */
	nchildren = 0;
	node = root;
	while (node != NULL) {
		if ((node->type != TOPO_TYPE_GROUP &&
		    node->type != TOPO_TYPE_CACHE) ||
		    (root->type != TOPO_TYPE_SYSTEM &&
		    CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
			node = topo_next_node(root, node);
			continue;
		}
		nchildren++;
		node = topo_next_nonchild_node(root, node);
	}

	cg_root->cg_child = smp_topo_alloc(nchildren);
	cg_root->cg_children = nchildren;

	/*
	 * Now find again the same cache nodes as above and recursively
	 * build scheduling topologies for them.
	 */
	node = root;
	i = 0;
	while (node != NULL) {
		if ((node->type != TOPO_TYPE_GROUP &&
		    node->type != TOPO_TYPE_CACHE) ||
		    (root->type != TOPO_TYPE_SYSTEM &&
		    CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
			node = topo_next_node(root, node);
			continue;
		}
		cg_root->cg_child[i].cg_parent = cg_root;
		x86topo_add_sched_group(node, &cg_root->cg_child[i]);
		i++;
		node = topo_next_nonchild_node(root, node);
	}
}

/*
 * Build the MI scheduling topology from the discovered hardware topology.
 */
struct cpu_group *
cpu_topo(void)
{
	struct cpu_group *cg_root;

	if (mp_ncpus <= 1)
		return (smp_topo_none());

	cg_root = smp_topo_alloc(1);
	x86topo_add_sched_group(&topo_root, cg_root);
	return (cg_root);
}

static void
cpu_alloc(void *dummy __unused)
{
	/*
	 * Dynamically allocate the arrays that depend on the
	 * maximum APIC ID.
	 */
	cpu_info = malloc(sizeof(*cpu_info) * (max_apic_id + 1), M_CPUS,
	    M_WAITOK | M_ZERO);
	apic_cpuids = malloc(sizeof(*apic_cpuids) * (max_apic_id + 1), M_CPUS,
	    M_WAITOK | M_ZERO);
}
SYSINIT(cpu_alloc, SI_SUB_CPU, SI_ORDER_FIRST, cpu_alloc, NULL);

/*
 * Add a logical CPU to the topology.
 */
void
cpu_add(u_int apic_id, char boot_cpu)
{

	if (apic_id > max_apic_id) {
		panic("SMP: APIC ID %d too high", apic_id);
		return;
	}
	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %u added twice",
	    apic_id));
	cpu_info[apic_id].cpu_present = 1;
	if (boot_cpu) {
		KASSERT(boot_cpu_id == -1,
		    ("CPU %u claims to be BSP, but CPU %u already is", apic_id,
		    boot_cpu_id));
		boot_cpu_id = apic_id;
		cpu_info[apic_id].cpu_bsp = 1;
	}
	if (bootverbose)
		printf("SMP: Added CPU %u (%s)\n", apic_id, boot_cpu ? "BSP" :
		    "AP");
}

void
cpu_mp_setmaxid(void)
{

	/*
	 * mp_ncpus and mp_maxid should be already set by calls to cpu_add().
	 * If there were no calls to cpu_add() assume this is a UP system.
	 */
	if (mp_ncpus == 0)
		mp_ncpus = 1;
}

int
cpu_mp_probe(void)
{

	/*
	 * Always record BSP in CPU map so that the mbuf init code works
	 * correctly.
	 */
	CPU_SETOF(0, &all_cpus);
	return (mp_ncpus > 1);
}

/* Allocate memory for the AP trampoline. */
void
alloc_ap_trampoline(vm_paddr_t *physmap, unsigned int *physmap_idx)
{
	unsigned int i;
	bool allocated;

	allocated = false;
	for (i = *physmap_idx; i <= *physmap_idx; i -= 2) {
		/*
		 * Find a memory region big enough and below the 1MB boundary
		 * for the trampoline code.
		 * NB: needs to be page aligned.
		 */
		if (physmap[i] >= MiB(1) ||
		    (trunc_page(physmap[i + 1]) - round_page(physmap[i])) <
		    round_page(bootMP_size))
			continue;

		allocated = true;
		/*
		 * Try to steal from the end of the region to mimic previous
		 * behaviour, else fall back to stealing from the start.
		 */
		if (physmap[i + 1] < MiB(1)) {
			boot_address = trunc_page(physmap[i + 1]);
			if ((physmap[i + 1] - boot_address) < bootMP_size)
				boot_address -= round_page(bootMP_size);
			physmap[i + 1] = boot_address;
		} else {
			boot_address = round_page(physmap[i]);
			physmap[i] = boot_address + round_page(bootMP_size);
		}
		if (physmap[i] == physmap[i + 1] && *physmap_idx != 0) {
			memmove(&physmap[i], &physmap[i + 2],
			    sizeof(*physmap) * (*physmap_idx - i + 2));
			*physmap_idx -= 2;
		}
		break;
	}

	if (!allocated) {
		boot_address = basemem * 1024 - bootMP_size;
		if (bootverbose)
			printf(
"Cannot find enough space for the boot trampoline, placing it at %#x",
			    boot_address);
	}
}

/*
 * AP CPUs call this to initialize themselves.
 */
void
init_secondary_tail(void)
{
	u_int cpuid;

	pmap_activate_boot(vmspace_pmap(proc0.p_vmspace));

	/*
	 * On real hardware, switch to x2apic mode if possible.  Do it
	 * after aps_ready was signalled, to avoid manipulating the
	 * mode while BSP might still want to send some IPI to us
	 * (second startup IPI is ignored on modern hardware etc).
	 */
	lapic_xapic_mode();

	/* Initialize the PAT MSR. */
	pmap_init_pat();

	/* set up CPU registers and state */
	cpu_setregs();

	/* set up SSE/NX */
	initializecpu();

	/* set up FPU state on the AP */
#ifdef __amd64__
	fpuinit();
#else
	npxinit(false);
#endif

	if (cpu_ops.cpu_init)
		cpu_ops.cpu_init();

	/* A quick check from sanity claus */
	cpuid = PCPU_GET(cpuid);
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", cpuid);
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}

	/* Initialize curthread. */
	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
	PCPU_SET(curthread, PCPU_GET(idlethread));

	mtx_lock_spin(&ap_boot_mtx);

	mca_init();

	/* Init local apic for IRQs */
	lapic_setup(1);

	/* Set memory range attributes for this CPU to match the BSP */
	mem_range_AP_init();

	smp_cpus++;

	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
	if (bootverbose)
		printf("SMP: AP CPU #%d Launched!\n", cpuid);
	else
		printf("%s%d%s", smp_cpus == 2 ? "Launching APs: " : "",
		    cpuid, smp_cpus == mp_ncpus ? "\n" : " ");

	/* Determine if we are a logical CPU. */
	if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread)
		CPU_SET(cpuid, &logical_cpus_mask);

	if (bootverbose)
		lapic_dump("AP");

	if (smp_cpus == mp_ncpus) {
		/* enable IPIs, TLB shootdown, freezes etc */
		atomic_store_rel_int(&smp_started, 1);
	}

#ifdef __amd64__
	/*
	 * Enable global pages TLB extension
	 * This also implicitly flushes the TLB
	 */
	load_cr4(rcr4() | CR4_PGE);
	if (pmap_pcid_enabled)
		load_cr4(rcr4() | CR4_PCIDE);
	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);
#endif

	mtx_unlock_spin(&ap_boot_mtx);

	/* Wait until all the APs are up. */
	while (atomic_load_acq_int(&smp_started) == 0)
		ia32_pause();

#ifndef EARLY_AP_STARTUP
	/* Start per-CPU event timers. */
	cpu_initclocks_ap();
#endif

	kcsan_cpu_init(cpuid);

	/*
	 * Assert that smp_after_idle_runnable condition is reasonable.
	 */
	MPASS(PCPU_GET(curpcb) == NULL);

	sched_throw(NULL);

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}

static void
smp_after_idle_runnable(void *arg __unused)
{
	struct pcpu *pc;
	int cpu;

	for (cpu = 1; cpu < mp_ncpus; cpu++) {
		pc = pcpu_find(cpu);
		while (atomic_load_ptr(&pc->pc_curpcb) == NULL)
			cpu_spinwait();
		kmem_free((vm_offset_t)bootstacks[cpu], kstack_pages *
		    PAGE_SIZE);
	}
}
SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY,
    smp_after_idle_runnable, NULL);

/*
 * We tell the I/O APIC code about all the CPUs we want to receive
 * interrupts.  If we don't want certain CPUs to receive IRQs we
 * can simply not tell the I/O APIC code about them in this function.
 * We also do not tell it about the BSP since it tells itself about
 * the BSP internally to work with UP kernels and on UP machines.
 */
void
set_interrupt_apic_ids(void)
{
	u_int i, apic_id;

	for (i = 0; i < MAXCPU; i++) {
		apic_id = cpu_apic_ids[i];
		if (apic_id == -1)
			continue;
		if (cpu_info[apic_id].cpu_bsp)
			continue;
		if (cpu_info[apic_id].cpu_disabled)
			continue;

		/* Don't let hyperthreads service interrupts. */
		if (cpu_info[apic_id].cpu_hyperthread &&
		    !hyperthreading_intr_allowed)
			continue;

		intr_add_cpu(i);
	}
}

#ifdef COUNT_XINVLTLB_HITS
u_int xhits_gbl[MAXCPU];
u_int xhits_pg[MAXCPU];
u_int xhits_rng[MAXCPU];
static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
    sizeof(xhits_gbl), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
    sizeof(xhits_pg), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
    sizeof(xhits_rng), "IU", "");

u_int ipi_global;
u_int ipi_page;
u_int ipi_range;
u_int ipi_range_size;
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
    0, "");
#endif /* COUNT_XINVLTLB_HITS */

/*
 * Init and startup IPI.
 */
void
ipi_startup(int apic_id, int vector)
{

	/*
	 * This attempts to follow the algorithm described in the
	 * Intel Multiprocessor Specification v1.4 in section B.4.
	 * For each IPI, we allow the local APIC ~20us to deliver the
	 * IPI.  If that times out, we panic.
	 */

	/*
	 * first we do an INIT IPI: this INIT IPI might be run, resetting
	 * and running the target CPU.  OR this INIT IPI might be latched (P5
	 * bug), CPU waiting for STARTUP IPI.  OR this INIT IPI might be
	 * ignored.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
	lapic_ipi_wait(100);

	/* Explicitly deassert the INIT IPI. */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
	    apic_id);

	DELAY(10000);		/* wait ~10ms */

	/*
	 * next we do a STARTUP IPI: the previous INIT IPI might still be
	 * latched (P5 bug), in which case this 1st STARTUP would terminate
	 * immediately and the previously started INIT IPI would continue.
	 * OR the previous INIT IPI has already run, and this STARTUP IPI
	 * will run.  OR the previous INIT IPI was ignored, and this STARTUP
	 * IPI will run.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	if (!lapic_ipi_wait(100))
		panic("Failed to deliver first STARTUP IPI to APIC %d",
		    apic_id);
	DELAY(200);		/* wait ~200us */

	/*
	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
	 * the previous STARTUP IPI was cancelled by a latched INIT IPI.  OR
	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
	 * recognized after hardware RESET or INIT IPI.
	 */
	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
	    vector, apic_id);
	if (!lapic_ipi_wait(100))
		panic("Failed to deliver second STARTUP IPI to APIC %d",
		    apic_id);

	DELAY(200);		/* wait ~200us */
}

/*
 * Send an IPI to specified CPU handling the bitmap logic.
 */
void
ipi_send_cpu(int cpu, u_int ipi)
{
	u_int bitmap, old, new;
	u_int *cpu_bitmap;

	KASSERT((u_int)cpu < MAXCPU && cpu_apic_ids[cpu] != -1,
	    ("IPI to non-existent CPU %d", cpu));

	if (IPI_IS_BITMAPED(ipi)) {
		bitmap = 1 << ipi;
		ipi = IPI_BITMAP_VECTOR;
		cpu_bitmap = &cpuid_to_pcpu[cpu]->pc_ipi_bitmap;
		old = *cpu_bitmap;
		for (;;) {
			if ((old & bitmap) == bitmap)
				break;
			new = old | bitmap;
			if (atomic_fcmpset_int(cpu_bitmap, &old, new))
				break;
		}
		if (old)
			return;
	}
	lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
}

void
ipi_bitmap_handler(struct trapframe frame)
{
	struct trapframe *oldframe;
	struct thread *td;
	int cpu = PCPU_GET(cpuid);
	u_int ipi_bitmap;

	td = curthread;
	ipi_bitmap = atomic_readandclear_int(&cpuid_to_pcpu[cpu]->
	    pc_ipi_bitmap);

	/*
	 * sched_preempt() must be called to clear the pending preempt
	 * IPI to enable delivery of further preempts.  However, the
	 * critical section will cause extra scheduler lock thrashing
	 * when used unconditionally.  Only critical_enter() if
	 * hardclock must also run, which requires the section entry.
	 */
	if (ipi_bitmap & (1 << IPI_HARDCLOCK))
		critical_enter();

	td->td_intr_nesting_level++;
	oldframe = td->td_intr_frame;
	td->td_intr_frame = &frame;
#if defined(STACK) || defined(DDB)
	if (ipi_bitmap & (1 << IPI_TRACE))
		stack_capture_intr();
#endif
	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
#ifdef COUNT_IPIS
		(*ipi_preempt_counts[cpu])++;
#endif
		sched_preempt(td);
	}
	if (ipi_bitmap & (1 << IPI_AST)) {
#ifdef COUNT_IPIS
		(*ipi_ast_counts[cpu])++;
#endif
		/* Nothing to do for AST */
	}
	if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
#ifdef COUNT_IPIS
		(*ipi_hardclock_counts[cpu])++;
#endif
		hardclockintr();
	}
	td->td_intr_frame = oldframe;
	td->td_intr_nesting_level--;
	if (ipi_bitmap & (1 << IPI_HARDCLOCK))
		critical_exit();
}

/*
 * send an IPI to a set of cpus.
 */
void
ipi_selected(cpuset_t cpus, u_int ipi)
{
	int cpu;

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help in order to understand what is the source.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);

	while ((cpu = CPU_FFS(&cpus)) != 0) {
		cpu--;
		CPU_CLR(cpu, &cpus);
		CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
		ipi_send_cpu(cpu, ipi);
	}
}

/*
 * send an IPI to a specific CPU.
 */
void
ipi_cpu(int cpu, u_int ipi)
{

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help in order to understand what is the source.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending);

	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
	ipi_send_cpu(cpu, ipi);
}

/*
 * send an IPI to all CPUs EXCEPT myself
 */
void
ipi_all_but_self(u_int ipi)
{
	cpuset_t other_cpus;

	other_cpus = all_cpus;
	CPU_CLR(PCPU_GET(cpuid), &other_cpus);
	if (IPI_IS_BITMAPED(ipi)) {
		ipi_selected(other_cpus, ipi);
		return;
	}

	/*
	 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
	 * of help in order to understand what is the source.
	 * Set the mask of receiving CPUs for this purpose.
	 */
	if (ipi == IPI_STOP_HARD)
		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus);

	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
}

int
ipi_nmi_handler(void)
{
	u_int cpuid;

	/*
	 * As long as there is not a simple way to know about an NMI's
	 * source, if the bitmask for the current CPU is present in
	 * the global pending bitword an IPI_STOP_HARD has been issued
	 * and should be handled.
	 */
	cpuid = PCPU_GET(cpuid);
	if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending))
		return (1);

	CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending);
	cpustop_handler();
	return (0);
}

int nmi_kdb_lock;

void
nmi_call_kdb_smp(u_int type, struct trapframe *frame)
{
	int cpu;
	bool call_post;

	cpu = PCPU_GET(cpuid);
	if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) {
		nmi_call_kdb(cpu, type, frame);
		call_post = false;
	} else {
		savectx(&stoppcbs[cpu]);
		CPU_SET_ATOMIC(cpu, &stopped_cpus);
		while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1))
			ia32_pause();
		call_post = true;
	}
	atomic_store_rel_int(&nmi_kdb_lock, 0);
	if (call_post)
		cpustop_handler_post(cpu);
}

/*
 * Handle an IPI_STOP by saving our current context and spinning (or mwaiting,
 * if available) until we are resumed.
 */
void
cpustop_handler(void)
{
	struct monitorbuf *mb;
	u_int cpu;
	bool use_mwait;

	cpu = PCPU_GET(cpuid);

	savectx(&stoppcbs[cpu]);

	use_mwait = (stop_mwait && (cpu_feature2 & CPUID2_MON) != 0 &&
	    !mwait_cpustop_broken);
	if (use_mwait) {
		mb = PCPU_PTR(monitorbuf);
		atomic_store_int(&mb->stop_state,
		    MONITOR_STOPSTATE_STOPPED);
	}

	/* Indicate that we are stopped */
	CPU_SET_ATOMIC(cpu, &stopped_cpus);

	/* Wait for restart */
	while (!CPU_ISSET(cpu, &started_cpus)) {
		if (use_mwait) {
			cpu_monitor(mb, 0, 0);
			if (atomic_load_int(&mb->stop_state) ==
			    MONITOR_STOPSTATE_STOPPED)
				cpu_mwait(0, MWAIT_C1);
			continue;
		}

		ia32_pause();

		/*
		 * Halt non-BSP CPUs on panic -- we're never going to need them
		 * again, and might as well save power / release resources
		 * (e.g., overprovisioned VM infrastructure).
		 */
		while (__predict_false(!IS_BSP() && KERNEL_PANICKED()))
			halt();
	}

	cpustop_handler_post(cpu);
}

static void
cpustop_handler_post(u_int cpu)
{

	CPU_CLR_ATOMIC(cpu, &started_cpus);
	CPU_CLR_ATOMIC(cpu, &stopped_cpus);

	/*
	 * We don't broadcast TLB invalidations to other CPUs when they are
	 * stopped.  Hence, we clear the TLB before resuming.
	 */
	invltlb_glob();

#if defined(__amd64__) && defined(DDB)
	amd64_db_resume_dbreg();
#endif

	if (cpu == 0 && cpustop_restartfunc != NULL) {
		cpustop_restartfunc();
		cpustop_restartfunc = NULL;
	}
}

/*
 * Handle an IPI_SUSPEND by saving our current context and spinning until we
 * are resumed.
 */
void
cpususpend_handler(void)
{
	u_int cpu;

	mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);

	cpu = PCPU_GET(cpuid);
	if (savectx(&susppcbs[cpu]->sp_pcb)) {
#ifdef __amd64__
		fpususpend(susppcbs[cpu]->sp_fpususpend);
#else
		npxsuspend(susppcbs[cpu]->sp_fpususpend);
#endif
		/*
		 * suspended_cpus is cleared shortly after each AP is restarted
		 * by a Startup IPI, so that the BSP can proceed to restarting
		 * the next AP.
		 *
		 * resuming_cpus gets cleared when the AP completes
		 * initialization after having been released by the BSP.
		 * resuming_cpus is probably not the best name for the
		 * variable, because it is actually a set of processors that
		 * haven't resumed yet and haven't necessarily started resuming.
		 *
		 * Note that suspended_cpus is meaningful only for ACPI suspend
		 * as it's not really used for Xen suspend since the APs are
		 * automatically restored to the running state and the correct
		 * context.  For the same reason resumectx is never called in
		 * that case.
		 */
		CPU_SET_ATOMIC(cpu, &suspended_cpus);
		CPU_SET_ATOMIC(cpu, &resuming_cpus);

		/*
		 * Invalidate the cache after setting the global status bits.
		 * The last AP to set its bit may end up being an Owner of the
		 * corresponding cache line in MOESI protocol.  The AP may be
		 * stopped before the cache line is written to the main memory.
		 */
		wbinvd();
	} else {
#ifdef __amd64__
		fpuresume(susppcbs[cpu]->sp_fpususpend);
#else
		npxresume(susppcbs[cpu]->sp_fpususpend);
#endif
		pmap_init_pat();
		initializecpu();
		PCPU_SET(switchtime, 0);
		PCPU_SET(switchticks, ticks);

		/* Indicate that we have restarted and restored the context. */
		CPU_CLR_ATOMIC(cpu, &suspended_cpus);
	}

	/* Wait for resume directive */
	while (!CPU_ISSET(cpu, &toresume_cpus))
		ia32_pause();

	/* Re-apply microcode updates. */
	ucode_reload();

#ifdef __i386__
	/* Finish removing the identity mapping of low memory for this AP. */
	invltlb_glob();
#endif

	if (cpu_ops.cpu_resume)
		cpu_ops.cpu_resume();
#ifdef __amd64__
	if (vmm_resume_p)
		vmm_resume_p();
#endif

	/* Resume MCA and local APIC */
	lapic_xapic_mode();
	mca_resume();
	lapic_setup(0);

	/* Indicate that we are resumed */
	CPU_CLR_ATOMIC(cpu, &resuming_cpus);
	CPU_CLR_ATOMIC(cpu, &suspended_cpus);
	CPU_CLR_ATOMIC(cpu, &toresume_cpus);
}

/*
 * This is called once the rest of the system is up and running and we're
 * ready to let the APs out of the pen.
 */
static void
release_aps(void *dummy __unused)
{

	if (mp_ncpus == 1)
		return;
	atomic_store_rel_int(&aps_ready, 1);
	while (smp_started == 0)
		ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);

#ifdef COUNT_IPIS
/*
 * Setup interrupt counters for IPI handlers.
 */
static void
mp_ipi_intrcnt(void *dummy)
{
	char buf[64];
	int i;

	CPU_FOREACH(i) {
		snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
		intrcnt_add(buf, &ipi_invltlb_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
		intrcnt_add(buf, &ipi_invlrng_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
		intrcnt_add(buf, &ipi_invlpg_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
		intrcnt_add(buf, &ipi_invlcache_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
		intrcnt_add(buf, &ipi_preempt_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:ast", i);
		intrcnt_add(buf, &ipi_ast_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
		intrcnt_add(buf, &ipi_hardclock_counts[i]);
	}
}
SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
#endif