1 /*- 2 * Copyright (c) 1996, by Steve Passe 3 * Copyright (c) 2003, by Peter Wemm 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. The name of the developer may NOT be used to endorse or promote products 12 * derived from this software without specific prior written permission. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include "opt_acpi.h" 31 #ifdef __i386__ 32 #include "opt_apic.h" 33 #endif 34 #include "opt_cpu.h" 35 #include "opt_ddb.h" 36 #include "opt_gdb.h" 37 #include "opt_kstack_pages.h" 38 #include "opt_pmap.h" 39 #include "opt_sched.h" 40 #include "opt_smp.h" 41 #include "opt_stack.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/asan.h> 46 #include <sys/bus.h> 47 #include <sys/cons.h> /* cngetc() */ 48 #include <sys/cpuset.h> 49 #include <sys/csan.h> 50 #include <sys/interrupt.h> 51 #include <sys/kdb.h> 52 #include <sys/kernel.h> 53 #include <sys/ktr.h> 54 #include <sys/lock.h> 55 #include <sys/malloc.h> 56 #include <sys/memrange.h> 57 #include <sys/mutex.h> 58 #include <sys/pcpu.h> 59 #include <sys/proc.h> 60 #include <sys/sched.h> 61 #include <sys/smp.h> 62 #include <sys/sysctl.h> 63 64 #include <vm/vm.h> 65 #include <vm/vm_param.h> 66 #include <vm/pmap.h> 67 #include <vm/vm_kern.h> 68 #include <vm/vm_extern.h> 69 #include <vm/vm_map.h> 70 71 #include <x86/apicreg.h> 72 #include <machine/clock.h> 73 #include <machine/cpu.h> 74 #include <machine/cputypes.h> 75 #include <x86/mca.h> 76 #include <machine/md_var.h> 77 #include <machine/pcb.h> 78 #include <machine/psl.h> 79 #include <machine/smp.h> 80 #include <machine/specialreg.h> 81 #include <machine/stack.h> 82 #include <x86/ucode.h> 83 84 #ifdef DEV_ACPI 85 #include <contrib/dev/acpica/include/acpi.h> 86 #include <dev/acpica/acpivar.h> 87 #endif 88 89 static MALLOC_DEFINE(M_CPUS, "cpus", "CPU items"); 90 91 int mp_naps; /* # of Applications processors */ 92 int boot_cpu_id = -1; /* designated BSP */ 93 94 /* AP uses this during bootstrap. Do not staticize. */ 95 char *bootSTK; 96 int bootAP; 97 98 /* Free these after use */ 99 void *bootstacks[MAXCPU]; 100 void *dpcpu; 101 102 struct pcb stoppcbs[MAXCPU]; 103 struct susppcb **susppcbs; 104 105 #ifdef COUNT_IPIS 106 /* Interrupt counts. */ 107 static u_long *ipi_preempt_counts[MAXCPU]; 108 static u_long *ipi_ast_counts[MAXCPU]; 109 u_long *ipi_invltlb_counts[MAXCPU]; 110 u_long *ipi_invlrng_counts[MAXCPU]; 111 u_long *ipi_invlpg_counts[MAXCPU]; 112 u_long *ipi_invlcache_counts[MAXCPU]; 113 u_long *ipi_rendezvous_counts[MAXCPU]; 114 static u_long *ipi_hardclock_counts[MAXCPU]; 115 #endif 116 117 /* Default cpu_ops implementation. */ 118 struct cpu_ops cpu_ops; 119 120 /* 121 * Local data and functions. 122 */ 123 124 static volatile cpuset_t ipi_stop_nmi_pending; 125 126 volatile cpuset_t resuming_cpus; 127 volatile cpuset_t toresume_cpus; 128 129 /* used to hold the AP's until we are ready to release them */ 130 struct mtx ap_boot_mtx; 131 132 /* Set to 1 once we're ready to let the APs out of the pen. */ 133 volatile int aps_ready = 0; 134 135 /* 136 * Store data from cpu_add() until later in the boot when we actually setup 137 * the APs. 138 */ 139 struct cpu_info *cpu_info; 140 int *apic_cpuids; 141 int cpu_apic_ids[MAXCPU]; 142 _Static_assert(MAXCPU <= MAX_APIC_ID, 143 "MAXCPU cannot be larger that MAX_APIC_ID"); 144 _Static_assert(xAPIC_MAX_APIC_ID <= MAX_APIC_ID, 145 "xAPIC_MAX_APIC_ID cannot be larger that MAX_APIC_ID"); 146 147 static void release_aps(void *dummy); 148 static void cpustop_handler_post(u_int cpu); 149 150 static int hyperthreading_allowed = 1; 151 SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN, 152 &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs"); 153 154 static int hyperthreading_intr_allowed = 0; 155 SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_intr_allowed, CTLFLAG_RDTUN, 156 &hyperthreading_intr_allowed, 0, 157 "Allow interrupts on HTT logical CPUs"); 158 159 static struct topo_node topo_root; 160 161 static int pkg_id_shift; 162 static int node_id_shift; 163 static int core_id_shift; 164 static int disabled_cpus; 165 166 struct cache_info { 167 int id_shift; 168 int present; 169 } static caches[MAX_CACHE_LEVELS]; 170 171 static bool stop_mwait = false; 172 SYSCTL_BOOL(_machdep, OID_AUTO, stop_mwait, CTLFLAG_RWTUN, &stop_mwait, 0, 173 "Use MONITOR/MWAIT when stopping CPU, if available"); 174 175 void 176 mem_range_AP_init(void) 177 { 178 179 if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) 180 mem_range_softc.mr_op->initAP(&mem_range_softc); 181 } 182 183 /* 184 * Round up to the next power of two, if necessary, and then 185 * take log2. 186 * Returns -1 if argument is zero. 187 */ 188 static __inline int 189 mask_width(u_int x) 190 { 191 192 return (fls(x << (1 - powerof2(x))) - 1); 193 } 194 195 /* 196 * Add a cache level to the cache topology description. 197 */ 198 static int 199 add_deterministic_cache(int type, int level, int share_count) 200 { 201 202 if (type == 0) 203 return (0); 204 if (type > 3) { 205 printf("unexpected cache type %d\n", type); 206 return (1); 207 } 208 if (type == 2) /* ignore instruction cache */ 209 return (1); 210 if (level == 0 || level > MAX_CACHE_LEVELS) { 211 printf("unexpected cache level %d\n", level); 212 return (1); 213 } 214 215 if (caches[level - 1].present) { 216 printf("WARNING: multiple entries for L%u data cache\n", level); 217 printf("%u => %u\n", caches[level - 1].id_shift, 218 mask_width(share_count)); 219 } 220 caches[level - 1].id_shift = mask_width(share_count); 221 caches[level - 1].present = 1; 222 223 if (caches[level - 1].id_shift > pkg_id_shift) { 224 printf("WARNING: L%u data cache covers more " 225 "APIC IDs than a package (%u > %u)\n", level, 226 caches[level - 1].id_shift, pkg_id_shift); 227 caches[level - 1].id_shift = pkg_id_shift; 228 } 229 if (caches[level - 1].id_shift < core_id_shift) { 230 printf("WARNING: L%u data cache covers fewer " 231 "APIC IDs than a core (%u < %u)\n", level, 232 caches[level - 1].id_shift, core_id_shift); 233 caches[level - 1].id_shift = core_id_shift; 234 } 235 236 return (1); 237 } 238 239 /* 240 * Determine topology of processing units and caches for AMD CPUs. 241 * See: 242 * - AMD CPUID Specification (Publication # 25481) 243 * - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559) 244 * - BKDG For AMD Family 10h Processors (Publication # 31116) 245 * - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301) 246 * - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751) 247 * - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945) 248 */ 249 static void 250 topo_probe_amd(void) 251 { 252 u_int p[4]; 253 uint64_t v; 254 int level; 255 int nodes_per_socket; 256 int share_count; 257 int type; 258 int i; 259 260 /* No multi-core capability. */ 261 if ((amd_feature2 & AMDID2_CMP) == 0) 262 return; 263 264 /* For families 10h and newer. */ 265 pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >> 266 AMDID_COREID_SIZE_SHIFT; 267 268 /* For 0Fh family. */ 269 if (pkg_id_shift == 0) 270 pkg_id_shift = 271 mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1); 272 273 /* 274 * Families prior to 16h define the following value as 275 * cores per compute unit and we don't really care about the AMD 276 * compute units at the moment. Perhaps we should treat them as 277 * cores and cores within the compute units as hardware threads, 278 * but that's up for debate. 279 * Later families define the value as threads per compute unit, 280 * so we are following AMD's nomenclature here. 281 */ 282 if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 && 283 CPUID_TO_FAMILY(cpu_id) >= 0x16) { 284 cpuid_count(0x8000001e, 0, p); 285 share_count = ((p[1] >> 8) & 0xff) + 1; 286 core_id_shift = mask_width(share_count); 287 288 /* 289 * For Zen (17h), gather Nodes per Processor. Each node is a 290 * Zeppelin die; TR and EPYC CPUs will have multiple dies per 291 * package. Communication latency between dies is higher than 292 * within them. 293 */ 294 nodes_per_socket = ((p[2] >> 8) & 0x7) + 1; 295 node_id_shift = pkg_id_shift - mask_width(nodes_per_socket); 296 } 297 298 if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) { 299 for (i = 0; ; i++) { 300 cpuid_count(0x8000001d, i, p); 301 type = p[0] & 0x1f; 302 level = (p[0] >> 5) & 0x7; 303 share_count = 1 + ((p[0] >> 14) & 0xfff); 304 305 if (!add_deterministic_cache(type, level, share_count)) 306 break; 307 } 308 } else { 309 if (cpu_exthigh >= 0x80000005) { 310 cpuid_count(0x80000005, 0, p); 311 if (((p[2] >> 24) & 0xff) != 0) { 312 caches[0].id_shift = 0; 313 caches[0].present = 1; 314 } 315 } 316 if (cpu_exthigh >= 0x80000006) { 317 cpuid_count(0x80000006, 0, p); 318 if (((p[2] >> 16) & 0xffff) != 0) { 319 caches[1].id_shift = 0; 320 caches[1].present = 1; 321 } 322 if (((p[3] >> 18) & 0x3fff) != 0) { 323 nodes_per_socket = 1; 324 if ((amd_feature2 & AMDID2_NODE_ID) != 0) { 325 /* 326 * Handle multi-node processors that 327 * have multiple chips, each with its 328 * own L3 cache, on the same die. 329 */ 330 v = rdmsr(0xc001100c); 331 nodes_per_socket = 1 + ((v >> 3) & 0x7); 332 } 333 caches[2].id_shift = 334 pkg_id_shift - mask_width(nodes_per_socket); 335 caches[2].present = 1; 336 } 337 } 338 } 339 } 340 341 /* 342 * Determine topology of processing units for Intel CPUs 343 * using CPUID Leaf 1 and Leaf 4, if supported. 344 * See: 345 * - Intel 64 Architecture Processor Topology Enumeration 346 * - Intel 64 and IA-32 ArchitecturesSoftware Developer’s Manual, 347 * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS 348 * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS 349 */ 350 static void 351 topo_probe_intel_0x4(void) 352 { 353 u_int p[4]; 354 int max_cores; 355 int max_logical; 356 357 /* Both zero and one here mean one logical processor per package. */ 358 max_logical = (cpu_feature & CPUID_HTT) != 0 ? 359 (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; 360 if (max_logical <= 1) 361 return; 362 363 if (cpu_high >= 0x4) { 364 cpuid_count(0x04, 0, p); 365 max_cores = ((p[0] >> 26) & 0x3f) + 1; 366 } else 367 max_cores = 1; 368 369 core_id_shift = mask_width(max_logical/max_cores); 370 KASSERT(core_id_shift >= 0, 371 ("intel topo: max_cores > max_logical\n")); 372 pkg_id_shift = core_id_shift + mask_width(max_cores); 373 } 374 375 /* 376 * Determine topology of processing units for Intel CPUs 377 * using CPUID Leaf 1Fh or 0Bh, if supported. 378 * See: 379 * - Intel 64 Architecture Processor Topology Enumeration 380 * - Intel 64 and IA-32 ArchitecturesSoftware Developer’s Manual, 381 * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS 382 * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS 383 */ 384 static void 385 topo_probe_intel_0xb(void) 386 { 387 u_int leaf; 388 u_int p[4] = { 0 }; 389 int bits; 390 int type; 391 int i; 392 393 /* Prefer leaf 1Fh (V2 Extended Topology Enumeration). */ 394 if (cpu_high >= 0x1f) { 395 leaf = 0x1f; 396 cpuid_count(leaf, 0, p); 397 } 398 /* Fall back to leaf 0Bh (Extended Topology Enumeration). */ 399 if (p[1] == 0) { 400 leaf = 0x0b; 401 cpuid_count(leaf, 0, p); 402 } 403 /* Fall back to leaf 04h (Deterministic Cache Parameters). */ 404 if (p[1] == 0) { 405 topo_probe_intel_0x4(); 406 return; 407 } 408 409 /* We only support three levels for now. */ 410 for (i = 0; ; i++) { 411 cpuid_count(leaf, i, p); 412 413 bits = p[0] & 0x1f; 414 type = (p[2] >> 8) & 0xff; 415 416 if (type == 0) 417 break; 418 419 if (type == CPUID_TYPE_SMT) 420 core_id_shift = bits; 421 else if (type == CPUID_TYPE_CORE) 422 pkg_id_shift = bits; 423 else if (bootverbose) 424 printf("Topology level type %d shift: %d\n", type, bits); 425 } 426 427 if (pkg_id_shift < core_id_shift) { 428 printf("WARNING: core covers more APIC IDs than a package\n"); 429 core_id_shift = pkg_id_shift; 430 } 431 } 432 433 /* 434 * Determine topology of caches for Intel CPUs. 435 * See: 436 * - Intel 64 Architecture Processor Topology Enumeration 437 * - Intel 64 and IA-32 Architectures Software Developer’s Manual 438 * Volume 2A: Instruction Set Reference, A-M, 439 * CPUID instruction 440 */ 441 static void 442 topo_probe_intel_caches(void) 443 { 444 u_int p[4]; 445 int level; 446 int share_count; 447 int type; 448 int i; 449 450 if (cpu_high < 0x4) { 451 /* 452 * Available cache level and sizes can be determined 453 * via CPUID leaf 2, but that requires a huge table of hardcoded 454 * values, so for now just assume L1 and L2 caches potentially 455 * shared only by HTT processing units, if HTT is present. 456 */ 457 caches[0].id_shift = pkg_id_shift; 458 caches[0].present = 1; 459 caches[1].id_shift = pkg_id_shift; 460 caches[1].present = 1; 461 return; 462 } 463 464 for (i = 0; ; i++) { 465 cpuid_count(0x4, i, p); 466 type = p[0] & 0x1f; 467 level = (p[0] >> 5) & 0x7; 468 share_count = 1 + ((p[0] >> 14) & 0xfff); 469 470 if (!add_deterministic_cache(type, level, share_count)) 471 break; 472 } 473 } 474 475 /* 476 * Determine topology of processing units and caches for Intel CPUs. 477 * See: 478 * - Intel 64 Architecture Processor Topology Enumeration 479 */ 480 static void 481 topo_probe_intel(void) 482 { 483 484 /* 485 * Note that 0x1 <= cpu_high < 4 case should be 486 * compatible with topo_probe_intel_0x4() logic when 487 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) 488 * or it should trigger the fallback otherwise. 489 */ 490 if (cpu_high >= 0xb) 491 topo_probe_intel_0xb(); 492 else if (cpu_high >= 0x1) 493 topo_probe_intel_0x4(); 494 495 topo_probe_intel_caches(); 496 } 497 498 /* 499 * Topology information is queried only on BSP, on which this 500 * code runs and for which it can query CPUID information. 501 * Then topology is extrapolated on all packages using an 502 * assumption that APIC ID to hardware component ID mapping is 503 * homogenious. 504 * That doesn't necesserily imply that the topology is uniform. 505 */ 506 void 507 topo_probe(void) 508 { 509 static int cpu_topo_probed = 0; 510 struct x86_topo_layer { 511 int type; 512 int subtype; 513 int id_shift; 514 } topo_layers[MAX_CACHE_LEVELS + 5]; 515 struct topo_node *parent; 516 struct topo_node *node; 517 int layer; 518 int nlayers; 519 int node_id; 520 int i; 521 #if defined(DEV_ACPI) && MAXMEMDOM > 1 522 int d, domain; 523 #endif 524 525 if (cpu_topo_probed) 526 return; 527 528 CPU_ZERO(&logical_cpus_mask); 529 530 if (mp_ncpus <= 1) 531 ; /* nothing */ 532 else if (cpu_vendor_id == CPU_VENDOR_AMD || 533 cpu_vendor_id == CPU_VENDOR_HYGON) 534 topo_probe_amd(); 535 else if (cpu_vendor_id == CPU_VENDOR_INTEL) 536 topo_probe_intel(); 537 538 KASSERT(pkg_id_shift >= core_id_shift, 539 ("bug in APIC topology discovery")); 540 541 nlayers = 0; 542 bzero(topo_layers, sizeof(topo_layers)); 543 544 topo_layers[nlayers].type = TOPO_TYPE_PKG; 545 topo_layers[nlayers].id_shift = pkg_id_shift; 546 if (bootverbose) 547 printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift); 548 nlayers++; 549 550 if (pkg_id_shift > node_id_shift && node_id_shift != 0) { 551 topo_layers[nlayers].type = TOPO_TYPE_GROUP; 552 topo_layers[nlayers].id_shift = node_id_shift; 553 if (bootverbose) 554 printf("Node ID shift: %u\n", 555 topo_layers[nlayers].id_shift); 556 nlayers++; 557 } 558 559 /* 560 * Consider all caches to be within a package/chip 561 * and "in front" of all sub-components like 562 * cores and hardware threads. 563 */ 564 for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) { 565 if (caches[i].present) { 566 if (node_id_shift != 0) 567 KASSERT(caches[i].id_shift <= node_id_shift, 568 ("bug in APIC topology discovery")); 569 KASSERT(caches[i].id_shift <= pkg_id_shift, 570 ("bug in APIC topology discovery")); 571 KASSERT(caches[i].id_shift >= core_id_shift, 572 ("bug in APIC topology discovery")); 573 574 topo_layers[nlayers].type = TOPO_TYPE_CACHE; 575 topo_layers[nlayers].subtype = i + 1; 576 topo_layers[nlayers].id_shift = caches[i].id_shift; 577 if (bootverbose) 578 printf("L%u cache ID shift: %u\n", 579 topo_layers[nlayers].subtype, 580 topo_layers[nlayers].id_shift); 581 nlayers++; 582 } 583 } 584 585 if (pkg_id_shift > core_id_shift) { 586 topo_layers[nlayers].type = TOPO_TYPE_CORE; 587 topo_layers[nlayers].id_shift = core_id_shift; 588 if (bootverbose) 589 printf("Core ID shift: %u\n", 590 topo_layers[nlayers].id_shift); 591 nlayers++; 592 } 593 594 topo_layers[nlayers].type = TOPO_TYPE_PU; 595 topo_layers[nlayers].id_shift = 0; 596 nlayers++; 597 598 #if defined(DEV_ACPI) && MAXMEMDOM > 1 599 if (vm_ndomains > 1) { 600 for (layer = 0; layer < nlayers; ++layer) { 601 for (i = 0; i <= max_apic_id; ++i) { 602 if ((i & ((1 << topo_layers[layer].id_shift) - 1)) == 0) 603 domain = -1; 604 if (!cpu_info[i].cpu_present) 605 continue; 606 d = acpi_pxm_get_cpu_locality(i); 607 if (domain >= 0 && domain != d) 608 break; 609 domain = d; 610 } 611 if (i > max_apic_id) 612 break; 613 } 614 KASSERT(layer < nlayers, ("NUMA domain smaller than PU")); 615 memmove(&topo_layers[layer+1], &topo_layers[layer], 616 sizeof(*topo_layers) * (nlayers - layer)); 617 topo_layers[layer].type = TOPO_TYPE_NODE; 618 topo_layers[layer].subtype = CG_SHARE_NONE; 619 nlayers++; 620 } 621 #endif 622 623 topo_init_root(&topo_root); 624 for (i = 0; i <= max_apic_id; ++i) { 625 if (!cpu_info[i].cpu_present) 626 continue; 627 628 parent = &topo_root; 629 for (layer = 0; layer < nlayers; ++layer) { 630 #if defined(DEV_ACPI) && MAXMEMDOM > 1 631 if (topo_layers[layer].type == TOPO_TYPE_NODE) { 632 node_id = acpi_pxm_get_cpu_locality(i); 633 } else 634 #endif 635 node_id = i >> topo_layers[layer].id_shift; 636 parent = topo_add_node_by_hwid(parent, node_id, 637 topo_layers[layer].type, 638 topo_layers[layer].subtype); 639 } 640 } 641 642 parent = &topo_root; 643 for (layer = 0; layer < nlayers; ++layer) { 644 #if defined(DEV_ACPI) && MAXMEMDOM > 1 645 if (topo_layers[layer].type == TOPO_TYPE_NODE) 646 node_id = acpi_pxm_get_cpu_locality(boot_cpu_id); 647 else 648 #endif 649 node_id = boot_cpu_id >> topo_layers[layer].id_shift; 650 node = topo_find_node_by_hwid(parent, node_id, 651 topo_layers[layer].type, 652 topo_layers[layer].subtype); 653 topo_promote_child(node); 654 parent = node; 655 } 656 657 cpu_topo_probed = 1; 658 } 659 660 /* 661 * Assign logical CPU IDs to local APICs. 662 */ 663 void 664 assign_cpu_ids(void) 665 { 666 struct topo_node *node; 667 u_int smt_mask; 668 int nhyper; 669 670 smt_mask = (1u << core_id_shift) - 1; 671 672 /* 673 * Assign CPU IDs to local APIC IDs and disable any CPUs 674 * beyond MAXCPU. CPU 0 is always assigned to the BSP. 675 */ 676 mp_ncpus = 0; 677 nhyper = 0; 678 TOPO_FOREACH(node, &topo_root) { 679 if (node->type != TOPO_TYPE_PU) 680 continue; 681 682 if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask)) 683 cpu_info[node->hwid].cpu_hyperthread = 1; 684 685 if (resource_disabled("lapic", node->hwid)) { 686 if (node->hwid != boot_cpu_id) 687 cpu_info[node->hwid].cpu_disabled = 1; 688 else 689 printf("Cannot disable BSP, APIC ID = %d\n", 690 node->hwid); 691 } 692 693 if (!hyperthreading_allowed && 694 cpu_info[node->hwid].cpu_hyperthread) 695 cpu_info[node->hwid].cpu_disabled = 1; 696 697 if (mp_ncpus >= MAXCPU) 698 cpu_info[node->hwid].cpu_disabled = 1; 699 700 if (cpu_info[node->hwid].cpu_disabled) { 701 disabled_cpus++; 702 continue; 703 } 704 705 if (cpu_info[node->hwid].cpu_hyperthread) 706 nhyper++; 707 708 cpu_apic_ids[mp_ncpus] = node->hwid; 709 apic_cpuids[node->hwid] = mp_ncpus; 710 topo_set_pu_id(node, mp_ncpus); 711 mp_ncpus++; 712 } 713 714 KASSERT(mp_maxid >= mp_ncpus - 1, 715 ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, 716 mp_ncpus)); 717 718 mp_ncores = mp_ncpus - nhyper; 719 smp_threads_per_core = mp_ncpus / mp_ncores; 720 } 721 722 /* 723 * Print various information about the SMP system hardware and setup. 724 */ 725 void 726 cpu_mp_announce(void) 727 { 728 struct topo_node *node; 729 const char *hyperthread; 730 struct topo_analysis topology; 731 732 printf("FreeBSD/SMP: "); 733 if (topo_analyze(&topo_root, 1, &topology)) { 734 printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]); 735 if (topology.entities[TOPO_LEVEL_GROUP] > 1) 736 printf(" x %d groups", 737 topology.entities[TOPO_LEVEL_GROUP]); 738 if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1) 739 printf(" x %d cache groups", 740 topology.entities[TOPO_LEVEL_CACHEGROUP]); 741 if (topology.entities[TOPO_LEVEL_CORE] > 0) 742 printf(" x %d core(s)", 743 topology.entities[TOPO_LEVEL_CORE]); 744 if (topology.entities[TOPO_LEVEL_THREAD] > 1) 745 printf(" x %d hardware threads", 746 topology.entities[TOPO_LEVEL_THREAD]); 747 } else { 748 printf("Non-uniform topology"); 749 } 750 printf("\n"); 751 752 if (disabled_cpus) { 753 printf("FreeBSD/SMP Online: "); 754 if (topo_analyze(&topo_root, 0, &topology)) { 755 printf("%d package(s)", 756 topology.entities[TOPO_LEVEL_PKG]); 757 if (topology.entities[TOPO_LEVEL_GROUP] > 1) 758 printf(" x %d groups", 759 topology.entities[TOPO_LEVEL_GROUP]); 760 if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1) 761 printf(" x %d cache groups", 762 topology.entities[TOPO_LEVEL_CACHEGROUP]); 763 if (topology.entities[TOPO_LEVEL_CORE] > 0) 764 printf(" x %d core(s)", 765 topology.entities[TOPO_LEVEL_CORE]); 766 if (topology.entities[TOPO_LEVEL_THREAD] > 1) 767 printf(" x %d hardware threads", 768 topology.entities[TOPO_LEVEL_THREAD]); 769 } else { 770 printf("Non-uniform topology"); 771 } 772 printf("\n"); 773 } 774 775 if (!bootverbose) 776 return; 777 778 TOPO_FOREACH(node, &topo_root) { 779 switch (node->type) { 780 case TOPO_TYPE_PKG: 781 printf("Package HW ID = %u\n", node->hwid); 782 break; 783 case TOPO_TYPE_CORE: 784 printf("\tCore HW ID = %u\n", node->hwid); 785 break; 786 case TOPO_TYPE_PU: 787 if (cpu_info[node->hwid].cpu_hyperthread) 788 hyperthread = "/HT"; 789 else 790 hyperthread = ""; 791 792 if (node->subtype == 0) 793 printf("\t\tCPU (AP%s): APIC ID: %u" 794 "(disabled)\n", hyperthread, node->hwid); 795 else if (node->id == 0) 796 printf("\t\tCPU0 (BSP): APIC ID: %u\n", 797 node->hwid); 798 else 799 printf("\t\tCPU%u (AP%s): APIC ID: %u\n", 800 node->id, hyperthread, node->hwid); 801 break; 802 default: 803 /* ignored */ 804 break; 805 } 806 } 807 } 808 809 /* 810 * Add a scheduling group, a group of logical processors sharing 811 * a particular cache (and, thus having an affinity), to the scheduling 812 * topology. 813 * This function recursively works on lower level caches. 814 */ 815 static void 816 x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root) 817 { 818 struct topo_node *node; 819 int nchildren; 820 int ncores; 821 int i; 822 823 KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE || 824 root->type == TOPO_TYPE_NODE || root->type == TOPO_TYPE_GROUP, 825 ("x86topo_add_sched_group: bad type: %u", root->type)); 826 CPU_COPY(&root->cpuset, &cg_root->cg_mask); 827 cg_root->cg_count = root->cpu_count; 828 if (root->type == TOPO_TYPE_CACHE) 829 cg_root->cg_level = root->subtype; 830 else 831 cg_root->cg_level = CG_SHARE_NONE; 832 if (root->type == TOPO_TYPE_NODE) 833 cg_root->cg_flags = CG_FLAG_NODE; 834 else 835 cg_root->cg_flags = 0; 836 837 /* 838 * Check how many core nodes we have under the given root node. 839 * If we have multiple logical processors, but not multiple 840 * cores, then those processors must be hardware threads. 841 */ 842 ncores = 0; 843 node = root; 844 while (node != NULL) { 845 if (node->type != TOPO_TYPE_CORE) { 846 node = topo_next_node(root, node); 847 continue; 848 } 849 850 ncores++; 851 node = topo_next_nonchild_node(root, node); 852 } 853 854 if (cg_root->cg_level != CG_SHARE_NONE && 855 root->cpu_count > 1 && ncores < 2) 856 cg_root->cg_flags |= CG_FLAG_SMT; 857 858 /* 859 * Find out how many cache nodes we have under the given root node. 860 * We ignore cache nodes that cover all the same processors as the 861 * root node. Also, we do not descend below found cache nodes. 862 * That is, we count top-level "non-redundant" caches under the root 863 * node. 864 */ 865 nchildren = 0; 866 node = root; 867 while (node != NULL) { 868 /* 869 * When some APICs are disabled by tunables, nodes can end up 870 * with an empty cpuset. Nodes with an empty cpuset will be 871 * translated into cpu groups with empty cpusets. smp_topo_fill 872 * will then set cg_first and cg_last to -1. This isn't 873 * correctly handled in all functions. E.g. when 874 * cpu_search_lowest and cpu_search_highest loop through all 875 * cpus, they call CPU_ISSET on cpu -1 which ends up in a 876 * general protection fault. 877 * 878 * We could fix the scheduler to handle empty cpu groups 879 * correctly. Nevertheless, empty cpu groups are causing 880 * overhead for no value. So, it makes more sense to just don't 881 * create them. 882 */ 883 if (CPU_EMPTY(&node->cpuset)) { 884 node = topo_next_node(root, node); 885 continue; 886 } 887 if (CPU_CMP(&node->cpuset, &root->cpuset) == 0) { 888 if (node->type == TOPO_TYPE_CACHE && 889 cg_root->cg_level < node->subtype) 890 cg_root->cg_level = node->subtype; 891 if (node->type == TOPO_TYPE_NODE) 892 cg_root->cg_flags |= CG_FLAG_NODE; 893 node = topo_next_node(root, node); 894 continue; 895 } 896 if (node->type != TOPO_TYPE_GROUP && 897 node->type != TOPO_TYPE_NODE && 898 node->type != TOPO_TYPE_CACHE) { 899 node = topo_next_node(root, node); 900 continue; 901 } 902 nchildren++; 903 node = topo_next_nonchild_node(root, node); 904 } 905 906 /* 907 * We are not interested in nodes including only one CPU each. 908 */ 909 if (nchildren == root->cpu_count) 910 return; 911 912 /* 913 * We are not interested in nodes without children. 914 */ 915 cg_root->cg_children = nchildren; 916 if (nchildren == 0) 917 return; 918 919 cg_root->cg_child = smp_topo_alloc(nchildren); 920 921 /* 922 * Now find again the same cache nodes as above and recursively 923 * build scheduling topologies for them. 924 */ 925 node = root; 926 i = 0; 927 while (node != NULL) { 928 if ((node->type != TOPO_TYPE_GROUP && 929 node->type != TOPO_TYPE_NODE && 930 node->type != TOPO_TYPE_CACHE) || 931 CPU_CMP(&node->cpuset, &root->cpuset) == 0 || 932 CPU_EMPTY(&node->cpuset)) { 933 node = topo_next_node(root, node); 934 continue; 935 } 936 cg_root->cg_child[i].cg_parent = cg_root; 937 x86topo_add_sched_group(node, &cg_root->cg_child[i]); 938 i++; 939 node = topo_next_nonchild_node(root, node); 940 } 941 } 942 943 /* 944 * Build the MI scheduling topology from the discovered hardware topology. 945 */ 946 struct cpu_group * 947 cpu_topo(void) 948 { 949 struct cpu_group *cg_root; 950 951 if (mp_ncpus <= 1) 952 return (smp_topo_none()); 953 954 cg_root = smp_topo_alloc(1); 955 x86topo_add_sched_group(&topo_root, cg_root); 956 return (cg_root); 957 } 958 959 static void 960 cpu_alloc(void *dummy __unused) 961 { 962 /* 963 * Dynamically allocate the arrays that depend on the 964 * maximum APIC ID. 965 */ 966 cpu_info = malloc(sizeof(*cpu_info) * (max_apic_id + 1), M_CPUS, 967 M_WAITOK | M_ZERO); 968 apic_cpuids = malloc(sizeof(*apic_cpuids) * (max_apic_id + 1), M_CPUS, 969 M_WAITOK | M_ZERO); 970 } 971 SYSINIT(cpu_alloc, SI_SUB_CPU, SI_ORDER_FIRST, cpu_alloc, NULL); 972 973 /* 974 * Add a logical CPU to the topology. 975 */ 976 void 977 cpu_add(u_int apic_id, char boot_cpu) 978 { 979 980 if (apic_id > max_apic_id) 981 panic("SMP: APIC ID %d too high", apic_id); 982 983 KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %u added twice", 984 apic_id)); 985 cpu_info[apic_id].cpu_present = 1; 986 if (boot_cpu) { 987 KASSERT(boot_cpu_id == -1, 988 ("CPU %u claims to be BSP, but CPU %u already is", apic_id, 989 boot_cpu_id)); 990 boot_cpu_id = apic_id; 991 cpu_info[apic_id].cpu_bsp = 1; 992 } 993 if (bootverbose) 994 printf("SMP: Added CPU %u (%s)\n", apic_id, boot_cpu ? "BSP" : 995 "AP"); 996 } 997 998 void 999 cpu_mp_setmaxid(void) 1000 { 1001 1002 /* 1003 * mp_ncpus and mp_maxid should be already set by calls to cpu_add(). 1004 * If there were no calls to cpu_add() assume this is a UP system. 1005 */ 1006 if (mp_ncpus == 0) 1007 mp_ncpus = 1; 1008 } 1009 1010 int 1011 cpu_mp_probe(void) 1012 { 1013 1014 /* 1015 * Always record BSP in CPU map so that the mbuf init code works 1016 * correctly. 1017 */ 1018 CPU_SETOF(0, &all_cpus); 1019 return (mp_ncpus > 1); 1020 } 1021 1022 /* 1023 * AP CPU's call this to initialize themselves. 1024 */ 1025 void 1026 init_secondary_tail(void) 1027 { 1028 u_int cpuid; 1029 1030 pmap_activate_boot(vmspace_pmap(proc0.p_vmspace)); 1031 1032 /* 1033 * On real hardware, switch to x2apic mode if possible. Do it 1034 * after aps_ready was signalled, to avoid manipulating the 1035 * mode while BSP might still want to send some IPI to us 1036 * (second startup IPI is ignored on modern hardware etc). 1037 */ 1038 lapic_xapic_mode(); 1039 1040 /* Initialize the PAT MSR. */ 1041 pmap_init_pat(); 1042 1043 /* set up CPU registers and state */ 1044 cpu_setregs(); 1045 1046 /* set up SSE/NX */ 1047 initializecpu(); 1048 1049 /* set up FPU state on the AP */ 1050 #ifdef __amd64__ 1051 fpuinit(); 1052 #else 1053 npxinit(false); 1054 #endif 1055 1056 if (cpu_ops.cpu_init) 1057 cpu_ops.cpu_init(); 1058 1059 /* A quick check from sanity claus */ 1060 cpuid = PCPU_GET(cpuid); 1061 if (PCPU_GET(apic_id) != lapic_id()) { 1062 printf("SMP: cpuid = %d\n", cpuid); 1063 printf("SMP: actual apic_id = %d\n", lapic_id()); 1064 printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); 1065 panic("cpuid mismatch! boom!!"); 1066 } 1067 1068 /* Initialize curthread. */ 1069 KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); 1070 PCPU_SET(curthread, PCPU_GET(idlethread)); 1071 schedinit_ap(); 1072 1073 mtx_lock_spin(&ap_boot_mtx); 1074 1075 mca_init(); 1076 1077 /* Init local apic for irq's */ 1078 lapic_setup(1); 1079 1080 /* Set memory range attributes for this CPU to match the BSP */ 1081 mem_range_AP_init(); 1082 1083 smp_cpus++; 1084 1085 CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); 1086 if (bootverbose) 1087 printf("SMP: AP CPU #%d Launched!\n", cpuid); 1088 else 1089 printf("%s%d%s", smp_cpus == 2 ? "Launching APs: " : "", 1090 cpuid, smp_cpus == mp_ncpus ? "\n" : " "); 1091 1092 /* Determine if we are a logical CPU. */ 1093 if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread) 1094 CPU_SET(cpuid, &logical_cpus_mask); 1095 1096 if (bootverbose) 1097 lapic_dump("AP"); 1098 1099 if (smp_cpus == mp_ncpus) { 1100 /* enable IPI's, tlb shootdown, freezes etc */ 1101 atomic_store_rel_int(&smp_started, 1); 1102 } 1103 1104 #ifdef __amd64__ 1105 if (pmap_pcid_enabled) 1106 load_cr4(rcr4() | CR4_PCIDE); 1107 load_ds(_udatasel); 1108 load_es(_udatasel); 1109 load_fs(_ufssel); 1110 #endif 1111 1112 mtx_unlock_spin(&ap_boot_mtx); 1113 1114 /* Wait until all the AP's are up. */ 1115 while (atomic_load_acq_int(&smp_started) == 0) 1116 ia32_pause(); 1117 1118 #ifndef EARLY_AP_STARTUP 1119 /* Start per-CPU event timers. */ 1120 cpu_initclocks_ap(); 1121 #endif 1122 1123 kcsan_cpu_init(cpuid); 1124 1125 sched_ap_entry(); 1126 1127 panic("scheduler returned us to %s", __func__); 1128 /* NOTREACHED */ 1129 } 1130 1131 static void 1132 smp_after_idle_runnable(void *arg __unused) 1133 { 1134 int cpu; 1135 1136 if (mp_ncpus == 1) 1137 return; 1138 1139 KASSERT(smp_started != 0, ("%s: SMP not started yet", __func__)); 1140 1141 /* 1142 * Wait for all APs to handle an interrupt. After that, we know that 1143 * the APs have entered the scheduler at least once, so the boot stacks 1144 * are safe to free. 1145 */ 1146 smp_rendezvous(smp_no_rendezvous_barrier, NULL, 1147 smp_no_rendezvous_barrier, NULL); 1148 1149 for (cpu = 1; cpu < mp_ncpus; cpu++) { 1150 kmem_free(bootstacks[cpu], kstack_pages * PAGE_SIZE); 1151 } 1152 } 1153 SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY, 1154 smp_after_idle_runnable, NULL); 1155 1156 /* 1157 * We tell the I/O APIC code about all the CPUs we want to receive 1158 * interrupts. If we don't want certain CPUs to receive IRQs we 1159 * can simply not tell the I/O APIC code about them in this function. 1160 * We also do not tell it about the BSP since it tells itself about 1161 * the BSP internally to work with UP kernels and on UP machines. 1162 */ 1163 void 1164 set_interrupt_apic_ids(void) 1165 { 1166 u_int i, apic_id; 1167 1168 for (i = 0; i < MAXCPU; i++) { 1169 apic_id = cpu_apic_ids[i]; 1170 if (apic_id == -1) 1171 continue; 1172 if (cpu_info[apic_id].cpu_bsp) 1173 continue; 1174 if (cpu_info[apic_id].cpu_disabled) 1175 continue; 1176 1177 /* Don't let hyperthreads service interrupts. */ 1178 if (cpu_info[apic_id].cpu_hyperthread && 1179 !hyperthreading_intr_allowed) 1180 continue; 1181 1182 intr_add_cpu(i); 1183 } 1184 } 1185 1186 #ifdef COUNT_XINVLTLB_HITS 1187 u_int xhits_gbl[MAXCPU]; 1188 u_int xhits_pg[MAXCPU]; 1189 u_int xhits_rng[MAXCPU]; 1190 static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1191 ""); 1192 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, 1193 sizeof(xhits_gbl), "IU", ""); 1194 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, 1195 sizeof(xhits_pg), "IU", ""); 1196 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, 1197 sizeof(xhits_rng), "IU", ""); 1198 1199 u_int ipi_global; 1200 u_int ipi_page; 1201 u_int ipi_range; 1202 u_int ipi_range_size; 1203 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); 1204 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); 1205 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); 1206 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, 1207 0, ""); 1208 #endif /* COUNT_XINVLTLB_HITS */ 1209 1210 /* 1211 * Init and startup IPI. 1212 */ 1213 void 1214 ipi_startup(int apic_id, int vector) 1215 { 1216 1217 /* 1218 * This attempts to follow the algorithm described in the 1219 * Intel Multiprocessor Specification v1.4 in section B.4. 1220 * For each IPI, we allow the local APIC ~20us to deliver the 1221 * IPI. If that times out, we panic. 1222 */ 1223 1224 /* 1225 * first we do an INIT IPI: this INIT IPI might be run, resetting 1226 * and running the target CPU. OR this INIT IPI might be latched (P5 1227 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be 1228 * ignored. 1229 */ 1230 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | 1231 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); 1232 lapic_ipi_wait(100); 1233 1234 /* Explicitly deassert the INIT IPI. */ 1235 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | 1236 APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 1237 apic_id); 1238 1239 DELAY(10000); /* wait ~10mS */ 1240 1241 /* 1242 * next we do a STARTUP IPI: the previous INIT IPI might still be 1243 * latched, (P5 bug) this 1st STARTUP would then terminate 1244 * immediately, and the previously started INIT IPI would continue. OR 1245 * the previous INIT IPI has already run. and this STARTUP IPI will 1246 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI 1247 * will run. 1248 */ 1249 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | 1250 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | 1251 vector, apic_id); 1252 if (!lapic_ipi_wait(100)) 1253 panic("Failed to deliver first STARTUP IPI to APIC %d", 1254 apic_id); 1255 DELAY(200); /* wait ~200uS */ 1256 1257 /* 1258 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF 1259 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR 1260 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is 1261 * recognized after hardware RESET or INIT IPI. 1262 */ 1263 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | 1264 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | 1265 vector, apic_id); 1266 if (!lapic_ipi_wait(100)) 1267 panic("Failed to deliver second STARTUP IPI to APIC %d", 1268 apic_id); 1269 1270 DELAY(200); /* wait ~200uS */ 1271 } 1272 1273 static bool 1274 ipi_bitmap_set(int cpu, u_int ipi) 1275 { 1276 u_int bitmap, old, new; 1277 u_int *cpu_bitmap; 1278 1279 bitmap = 1 << ipi; 1280 cpu_bitmap = &cpuid_to_pcpu[cpu]->pc_ipi_bitmap; 1281 old = *cpu_bitmap; 1282 for (;;) { 1283 if ((old & bitmap) != 0) 1284 break; 1285 new = old | bitmap; 1286 if (atomic_fcmpset_int(cpu_bitmap, &old, new)) 1287 break; 1288 } 1289 return (old != 0); 1290 } 1291 1292 /* 1293 * Send an IPI to specified CPU handling the bitmap logic. 1294 */ 1295 static void 1296 ipi_send_cpu(int cpu, u_int ipi) 1297 { 1298 1299 KASSERT((u_int)cpu < MAXCPU && cpu_apic_ids[cpu] != -1, 1300 ("IPI to non-existent CPU %d", cpu)); 1301 1302 if (IPI_IS_BITMAPED(ipi)) { 1303 if (ipi_bitmap_set(cpu, ipi)) 1304 return; 1305 ipi = IPI_BITMAP_VECTOR; 1306 } 1307 lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); 1308 } 1309 1310 void 1311 ipi_bitmap_handler(struct trapframe frame) 1312 { 1313 struct trapframe *oldframe; 1314 struct thread *td; 1315 int cpu = PCPU_GET(cpuid); 1316 u_int ipi_bitmap; 1317 1318 kasan_mark(&frame, sizeof(frame), sizeof(frame), 0); 1319 1320 td = curthread; 1321 ipi_bitmap = atomic_readandclear_int(&cpuid_to_pcpu[cpu]-> 1322 pc_ipi_bitmap); 1323 1324 /* 1325 * sched_preempt() must be called to clear the pending preempt 1326 * IPI to enable delivery of further preempts. However, the 1327 * critical section will cause extra scheduler lock thrashing 1328 * when used unconditionally. Only critical_enter() if 1329 * hardclock must also run, which requires the section entry. 1330 */ 1331 if (ipi_bitmap & (1 << IPI_HARDCLOCK)) 1332 critical_enter(); 1333 1334 td->td_intr_nesting_level++; 1335 oldframe = td->td_intr_frame; 1336 td->td_intr_frame = &frame; 1337 #if defined(STACK) || defined(DDB) 1338 if (ipi_bitmap & (1 << IPI_TRACE)) 1339 stack_capture_intr(); 1340 #endif 1341 if (ipi_bitmap & (1 << IPI_PREEMPT)) { 1342 #ifdef COUNT_IPIS 1343 (*ipi_preempt_counts[cpu])++; 1344 #endif 1345 sched_preempt(td); 1346 } 1347 if (ipi_bitmap & (1 << IPI_AST)) { 1348 #ifdef COUNT_IPIS 1349 (*ipi_ast_counts[cpu])++; 1350 #endif 1351 /* Nothing to do for AST */ 1352 } 1353 if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { 1354 #ifdef COUNT_IPIS 1355 (*ipi_hardclock_counts[cpu])++; 1356 #endif 1357 hardclockintr(); 1358 } 1359 td->td_intr_frame = oldframe; 1360 td->td_intr_nesting_level--; 1361 if (ipi_bitmap & (1 << IPI_HARDCLOCK)) 1362 critical_exit(); 1363 } 1364 1365 /* 1366 * send an IPI to a set of cpus. 1367 */ 1368 void 1369 ipi_selected(cpuset_t cpus, u_int ipi) 1370 { 1371 int cpu; 1372 1373 /* 1374 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1375 * of help in order to understand what is the source. 1376 * Set the mask of receiving CPUs for this purpose. 1377 */ 1378 if (ipi == IPI_STOP_HARD) 1379 CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus); 1380 1381 CPU_FOREACH_ISSET(cpu, &cpus) { 1382 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); 1383 ipi_send_cpu(cpu, ipi); 1384 } 1385 } 1386 1387 /* 1388 * send an IPI to a specific CPU. 1389 */ 1390 void 1391 ipi_cpu(int cpu, u_int ipi) 1392 { 1393 1394 /* 1395 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1396 * of help in order to understand what is the source. 1397 * Set the mask of receiving CPUs for this purpose. 1398 */ 1399 if (ipi == IPI_STOP_HARD) 1400 CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending); 1401 1402 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); 1403 ipi_send_cpu(cpu, ipi); 1404 } 1405 1406 /* 1407 * send an IPI to all CPUs EXCEPT myself 1408 */ 1409 void 1410 ipi_all_but_self(u_int ipi) 1411 { 1412 cpuset_t other_cpus; 1413 int cpu, c; 1414 1415 /* 1416 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit 1417 * of help in order to understand what is the source. 1418 * Set the mask of receiving CPUs for this purpose. 1419 */ 1420 if (ipi == IPI_STOP_HARD) { 1421 other_cpus = all_cpus; 1422 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 1423 CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus); 1424 } 1425 1426 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); 1427 if (IPI_IS_BITMAPED(ipi)) { 1428 cpu = PCPU_GET(cpuid); 1429 CPU_FOREACH(c) { 1430 if (c != cpu) 1431 ipi_bitmap_set(c, ipi); 1432 } 1433 ipi = IPI_BITMAP_VECTOR; 1434 } 1435 lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); 1436 } 1437 1438 void 1439 ipi_self_from_nmi(u_int vector) 1440 { 1441 1442 lapic_ipi_vectored(vector, APIC_IPI_DEST_SELF); 1443 1444 /* Wait for IPI to finish. */ 1445 if (!lapic_ipi_wait(50000)) { 1446 if (KERNEL_PANICKED()) 1447 return; 1448 else 1449 panic("APIC: IPI is stuck"); 1450 } 1451 } 1452 1453 int 1454 ipi_nmi_handler(void) 1455 { 1456 u_int cpuid; 1457 1458 /* 1459 * As long as there is not a simple way to know about a NMI's 1460 * source, if the bitmask for the current CPU is present in 1461 * the global pending bitword an IPI_STOP_HARD has been issued 1462 * and should be handled. 1463 */ 1464 cpuid = PCPU_GET(cpuid); 1465 if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending)) 1466 return (1); 1467 1468 CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending); 1469 cpustop_handler(); 1470 return (0); 1471 } 1472 1473 int nmi_kdb_lock; 1474 1475 void 1476 nmi_call_kdb_smp(u_int type, struct trapframe *frame) 1477 { 1478 int cpu; 1479 bool call_post; 1480 1481 cpu = PCPU_GET(cpuid); 1482 if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) { 1483 nmi_call_kdb(cpu, type, frame); 1484 call_post = false; 1485 } else { 1486 savectx(&stoppcbs[cpu]); 1487 CPU_SET_ATOMIC(cpu, &stopped_cpus); 1488 while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) 1489 ia32_pause(); 1490 call_post = true; 1491 } 1492 atomic_store_rel_int(&nmi_kdb_lock, 0); 1493 if (call_post) 1494 cpustop_handler_post(cpu); 1495 } 1496 1497 /* 1498 * Handle an IPI_STOP by saving our current context and spinning (or mwaiting, 1499 * if available) until we are resumed. 1500 */ 1501 void 1502 cpustop_handler(void) 1503 { 1504 struct monitorbuf *mb; 1505 u_int cpu; 1506 bool use_mwait; 1507 1508 cpu = PCPU_GET(cpuid); 1509 1510 savectx(&stoppcbs[cpu]); 1511 1512 use_mwait = (stop_mwait && (cpu_feature2 & CPUID2_MON) != 0 && 1513 !mwait_cpustop_broken); 1514 if (use_mwait) { 1515 mb = PCPU_PTR(monitorbuf); 1516 atomic_store_int(&mb->stop_state, 1517 MONITOR_STOPSTATE_STOPPED); 1518 } 1519 1520 /* Indicate that we are stopped */ 1521 CPU_SET_ATOMIC(cpu, &stopped_cpus); 1522 1523 /* Wait for restart */ 1524 while (!CPU_ISSET(cpu, &started_cpus)) { 1525 if (use_mwait) { 1526 cpu_monitor(mb, 0, 0); 1527 if (atomic_load_int(&mb->stop_state) == 1528 MONITOR_STOPSTATE_STOPPED) 1529 cpu_mwait(0, MWAIT_C1); 1530 continue; 1531 } 1532 1533 ia32_pause(); 1534 1535 /* 1536 * Halt non-BSP CPUs on panic -- we're never going to need them 1537 * again, and might as well save power / release resources 1538 * (e.g., overprovisioned VM infrastructure). 1539 */ 1540 while (__predict_false(!IS_BSP() && KERNEL_PANICKED())) 1541 halt(); 1542 } 1543 1544 cpustop_handler_post(cpu); 1545 } 1546 1547 static void 1548 cpustop_handler_post(u_int cpu) 1549 { 1550 1551 CPU_CLR_ATOMIC(cpu, &started_cpus); 1552 CPU_CLR_ATOMIC(cpu, &stopped_cpus); 1553 1554 /* 1555 * We don't broadcast TLB invalidations to other CPUs when they are 1556 * stopped. Hence, we clear the TLB before resuming. 1557 */ 1558 invltlb_glob(); 1559 1560 #if defined(__amd64__) && (defined(DDB) || defined(GDB)) 1561 amd64_db_resume_dbreg(); 1562 #endif 1563 1564 if (cpu == 0 && cpustop_restartfunc != NULL) { 1565 cpustop_restartfunc(); 1566 cpustop_restartfunc = NULL; 1567 } 1568 } 1569 1570 /* 1571 * Handle an IPI_SUSPEND by saving our current context and spinning until we 1572 * are resumed. 1573 */ 1574 void 1575 cpususpend_handler(void) 1576 { 1577 u_int cpu; 1578 1579 mtx_assert(&smp_ipi_mtx, MA_NOTOWNED); 1580 1581 cpu = PCPU_GET(cpuid); 1582 if (savectx(&susppcbs[cpu]->sp_pcb)) { 1583 #ifdef __amd64__ 1584 fpususpend(susppcbs[cpu]->sp_fpususpend); 1585 #else 1586 npxsuspend(susppcbs[cpu]->sp_fpususpend); 1587 #endif 1588 /* 1589 * suspended_cpus is cleared shortly after each AP is restarted 1590 * by a Startup IPI, so that the BSP can proceed to restarting 1591 * the next AP. 1592 * 1593 * resuming_cpus gets cleared when the AP completes 1594 * initialization after having been released by the BSP. 1595 * resuming_cpus is probably not the best name for the 1596 * variable, because it is actually a set of processors that 1597 * haven't resumed yet and haven't necessarily started resuming. 1598 * 1599 * Note that suspended_cpus is meaningful only for ACPI suspend 1600 * as it's not really used for Xen suspend since the APs are 1601 * automatically restored to the running state and the correct 1602 * context. For the same reason resumectx is never called in 1603 * that case. 1604 */ 1605 CPU_SET_ATOMIC(cpu, &suspended_cpus); 1606 CPU_SET_ATOMIC(cpu, &resuming_cpus); 1607 1608 /* 1609 * Invalidate the cache after setting the global status bits. 1610 * The last AP to set its bit may end up being an Owner of the 1611 * corresponding cache line in MOESI protocol. The AP may be 1612 * stopped before the cache line is written to the main memory. 1613 */ 1614 wbinvd(); 1615 } else { 1616 #ifdef __amd64__ 1617 fpuresume(susppcbs[cpu]->sp_fpususpend); 1618 #else 1619 npxresume(susppcbs[cpu]->sp_fpususpend); 1620 #endif 1621 pmap_init_pat(); 1622 initializecpu(); 1623 PCPU_SET(switchtime, 0); 1624 PCPU_SET(switchticks, ticks); 1625 1626 /* Indicate that we have restarted and restored the context. */ 1627 CPU_CLR_ATOMIC(cpu, &suspended_cpus); 1628 } 1629 1630 /* Wait for resume directive */ 1631 while (!CPU_ISSET(cpu, &toresume_cpus)) 1632 ia32_pause(); 1633 1634 /* Re-apply microcode updates. */ 1635 ucode_reload(); 1636 1637 #ifdef __i386__ 1638 /* Finish removing the identity mapping of low memory for this AP. */ 1639 invltlb_glob(); 1640 #endif 1641 1642 if (cpu_ops.cpu_resume) 1643 cpu_ops.cpu_resume(); 1644 #ifdef __amd64__ 1645 if (vmm_resume_p) 1646 vmm_resume_p(); 1647 #endif 1648 1649 /* Resume MCA and local APIC */ 1650 lapic_xapic_mode(); 1651 mca_resume(); 1652 lapic_setup(0); 1653 1654 /* Indicate that we are resumed */ 1655 CPU_CLR_ATOMIC(cpu, &resuming_cpus); 1656 CPU_CLR_ATOMIC(cpu, &suspended_cpus); 1657 CPU_CLR_ATOMIC(cpu, &toresume_cpus); 1658 } 1659 1660 /* 1661 * Handle an IPI_SWI by waking delayed SWI thread. 1662 */ 1663 void 1664 ipi_swi_handler(struct trapframe frame) 1665 { 1666 1667 intr_event_handle(clk_intr_event, &frame); 1668 } 1669 1670 /* 1671 * This is called once the rest of the system is up and running and we're 1672 * ready to let the AP's out of the pen. 1673 */ 1674 static void 1675 release_aps(void *dummy __unused) 1676 { 1677 1678 if (mp_ncpus == 1) 1679 return; 1680 atomic_store_rel_int(&aps_ready, 1); 1681 while (smp_started == 0) 1682 ia32_pause(); 1683 } 1684 SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); 1685 1686 #ifdef COUNT_IPIS 1687 /* 1688 * Setup interrupt counters for IPI handlers. 1689 */ 1690 static void 1691 mp_ipi_intrcnt(void *dummy) 1692 { 1693 char buf[64]; 1694 int i; 1695 1696 CPU_FOREACH(i) { 1697 snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); 1698 intrcnt_add(buf, &ipi_invltlb_counts[i]); 1699 snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); 1700 intrcnt_add(buf, &ipi_invlrng_counts[i]); 1701 snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); 1702 intrcnt_add(buf, &ipi_invlpg_counts[i]); 1703 snprintf(buf, sizeof(buf), "cpu%d:invlcache", i); 1704 intrcnt_add(buf, &ipi_invlcache_counts[i]); 1705 snprintf(buf, sizeof(buf), "cpu%d:preempt", i); 1706 intrcnt_add(buf, &ipi_preempt_counts[i]); 1707 snprintf(buf, sizeof(buf), "cpu%d:ast", i); 1708 intrcnt_add(buf, &ipi_ast_counts[i]); 1709 snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); 1710 intrcnt_add(buf, &ipi_rendezvous_counts[i]); 1711 snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); 1712 intrcnt_add(buf, &ipi_hardclock_counts[i]); 1713 } 1714 } 1715 SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); 1716 #endif 1717